class OnlineQuantizationConfig(QuantizationConfig):
"""Model-level config for online quantization (quantize fp16/bf16 weights
during model loading, without requiring a pre-quantized checkpoint)."""
def __init__(
self,
args: QuantizationConfigArgs,
) -> None:
super().__init__()
if args.linear is None and args.moe is None:
raise ValueError(
"OnlineQuantizationConfig requires at least one of "
"quantization_config.linear or quantization_config.moe "
"to be set."
)
self.args = args
self.ignored_layers: list[str] = args.ignore
@classmethod
def get_name(cls) -> QuantizationMethods:
return "online"
@classmethod
def get_supported_act_dtypes(cls) -> list[torch.dtype]:
return [torch.bfloat16, torch.half]
@classmethod
def get_min_capability(cls) -> int:
# Note: as more online quant schemes will be added, this
# value will become the minimum across all supported schemes.
return 75
@classmethod
def get_config_filenames(cls) -> list[str]:
return []
@classmethod
def from_config(cls, config: dict[str, Any]) -> "OnlineQuantizationConfig":
raise NotImplementedError(
"OnlineQuantizationConfig does not support loading from a "
"checkpoint config. Use quantization_config or "
"quantization='fp8_per_tensor'/'fp8_per_block' instead."
)
def _dispatch(
self,
spec: QuantSpec | None,
table: dict[QuantKey, type],
layer: torch.nn.Module,
) -> "QuantizeMethodBase | None":
if spec is None or spec.weight is None:
return None
cls = table.get(spec.weight)
if cls is None:
raise ValueError(
f"online quantization for {type(layer).__name__} with "
f"weight={spec.weight} is not supported; supported weight "
f"keys: {sorted(str(k) for k in table)}"
)
# Online method classes pick their own activation format internally.
# Per-class activation overrides are not yet wired through; reject
# explicit overrides until the relevant method class opts in.
if spec.activation is not None:
raise ValueError(
f"activation override (activation={spec.activation}) is not "
f"yet supported for online {cls.__name__}"
)
if isinstance(layer, RoutedExperts):
return cls(layer=layer)
return cls()
def get_quant_method(
self, layer: torch.nn.Module, prefix: str
) -> "QuantizeMethodBase | None":
if isinstance(layer, LinearBase):
if should_ignore_layer(
prefix,
ignore=self.ignored_layers,
fused_mapping=self.packed_modules_mapping,
):
return UnquantizedLinearMethod()
method = self._dispatch(self.args.linear, _ONLINE_LINEAR_METHODS, layer)
return method if method is not None else UnquantizedLinearMethod()
elif isinstance(layer, RoutedExperts):
if should_ignore_layer(
prefix,
ignore=self.ignored_layers,
fused_mapping=self.packed_modules_mapping,
):
return UnquantizedFusedMoEMethod(layer.moe_config)
method = self._dispatch(self.args.moe, _ONLINE_MOE_METHODS, layer)
return (
method
if method is not None
else UnquantizedFusedMoEMethod(layer.moe_config)
)
return None