vllm.model_executor.layers.fused_moe.oracle.mxfp4 ¶

_backend_activation_key ¶

_backend_activation_key(
    backend: Mxfp4MoeBackend,
) -> QuantKey | None

Map backend to its activation key (FP8, MXFP8, or None for BF16).

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp4.py

def _backend_activation_key(backend: Mxfp4MoeBackend) -> QuantKey | None:
    """Return the activation quantization key that ``backend`` runs with.

    The lookup is table-driven and checked in order: DeepGEMM uses the
    dynamic 128-group FP8 key, the FlashInfer ``*_MXFP8`` variants use the
    dynamic MXFP8 key, AITER FP8 uses the static per-tensor FP8 key and
    AITER MXFP4 uses the dynamic MXFP4 key.

    Returns:
        The matching ``QuantKey``, or ``None`` for every other backend
        (i.e. BF16 activations).
    """
    activation_table = (
        ((Mxfp4MoeBackend.DEEPGEMM_MXFP4,), kFp8Dynamic128Sym),
        (
            (
                Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8,
                Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
            ),
            kMxfp8Dynamic,
        ),
        ((Mxfp4MoeBackend.AITER_MXFP4_FP8,), kFp8StaticTensorSym),
        ((Mxfp4MoeBackend.AITER_MXFP4_MXFP4,), kMxfp4Dynamic),
    )
    for members, activation_key in activation_table:
        if backend in members:
            return activation_key
    # Anything not listed above takes BF16 activations.
    return None

_filter_by_activation ¶

_filter_by_activation(
    backends: list[Mxfp4MoeBackend],
    requested_activation_key: QuantKey | None,
) -> list[Mxfp4MoeBackend]

Pick variants matching requested_activation_key; without one, prefer BF16 if the list has any, else keep the list as-is so explicit non-BF16 picks (e.g. the _afp8 aliases) still land.

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp4.py

def _filter_by_activation(
    backends: list[Mxfp4MoeBackend],
    requested_activation_key: QuantKey | None,
) -> list[Mxfp4MoeBackend]:
    """Narrow ``backends`` according to the requested activation key.

    Args:
        backends: Candidate backends, in priority order.
        requested_activation_key: Activation key to match, or ``None`` when
            no particular activation quantization was requested.

    Returns:
        With a requested key: the candidates whose activation key equals it,
        plus ``EMULATION`` whenever it is among the candidates. Without one:
        the BF16-activation candidates if there are any, otherwise the input
        list itself, so explicit non-BF16 picks (e.g. the ``_afp8`` aliases)
        still land.
    """
    if requested_activation_key is None:
        bf16_candidates = [
            candidate
            for candidate in backends
            if _backend_activation_key(candidate) is None
        ]
        # An empty BF16 selection falls back to the unfiltered input.
        return bf16_candidates or backends

    matching: list[Mxfp4MoeBackend] = []
    for candidate in backends:
        key_matches = _backend_activation_key(candidate) == requested_activation_key
        # EMULATION is kept regardless of the requested activation key.
        if key_matches or candidate == Mxfp4MoeBackend.EMULATION:
            matching.append(candidate)
    return matching

_get_priority_backends ¶

_get_priority_backends() -> list[Mxfp4MoeBackend]

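Get available backends in priority order. On ROCm only AITER_MXFP4_BF16 is returned. Otherwise the order is TRTLLM MXFP8, DeepGEMM MXFP4, Marlin, then Batched Marlin (Triton_unfused is currently disabled pending a kernel fix); the backend-level is_supported_config check filters by device capability, so SM100+ prefers TRTLLM MXFP8 / DeepGEMM FP4 while SM90 falls through to Marlin.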

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp4.py

def _get_priority_backends() -> list[Mxfp4MoeBackend]:
    """Return the candidate backends, highest priority first.

    On ROCm the only candidate is ``AITER_MXFP4_BF16``. On every other
    platform the order is TRTLLM MXFP8, DeepGEMM MXFP4, Marlin, Batched
    Marlin. Device-capability filtering is not done here; it is left to the
    backend-level ``is_supported_config`` check.

    A fresh list is built on every call.
    """
    if not current_platform.is_rocm():
        return [
            Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8,
            Mxfp4MoeBackend.DEEPGEMM_MXFP4,
            # TODO: TRITON_UNFUSED is left out because of a bug with MTP
            # support; re-enable once the kernel is fixed.
            Mxfp4MoeBackend.MARLIN,
            Mxfp4MoeBackend.BATCHED_MARLIN,
        ]
    return [Mxfp4MoeBackend.AITER_MXFP4_BF16]

_get_priority_backends_for_gpt_oss ¶

_get_priority_backends_for_gpt_oss() -> list[
    Mxfp4MoeBackend
]

Available backends in priority order, BF16-act variant before activation-quantized variant within each vendor family.

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp4.py

def _get_priority_backends_for_gpt_oss() -> list[Mxfp4MoeBackend]:
    """Return the GPT-OSS candidate backends, highest priority first.

    Within each vendor family the BF16-activation variant is listed ahead
    of its activation-quantized siblings. A fresh list is built on every
    call.
    """
    return [
        # FlashInfer TRTLLM family
        Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16,
        Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8,
        # AITER family
        Mxfp4MoeBackend.AITER_MXFP4_BF16,
        Mxfp4MoeBackend.AITER_MXFP4_FP8,
        Mxfp4MoeBackend.AITER_MXFP4_MXFP4,
        Mxfp4MoeBackend.TRITON,
        # FlashInfer CUTLASS family
        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16,
        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
        # TODO: TRITON_UNFUSED is left out because of a bug with MTP
        # support; re-enable once the kernel is fixed.
        Mxfp4MoeBackend.MARLIN,
        Mxfp4MoeBackend.BATCHED_MARLIN,
        Mxfp4MoeBackend.XPU,
    ]

_resolve_activation_key ¶

_resolve_activation_key(
    model_activation_key: QuantKey | None,
) -> QuantKey | None

Combine the model-supplied activation key with the user override. Raises on conflict (both set and disagreeing).

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp4.py

def _resolve_activation_key(
    model_activation_key: QuantKey | None,
) -> QuantKey | None:
    """Merge the checkpoint's activation key with the user's override.

    Args:
        model_activation_key: Activation key declared by the model, or
            ``None`` if it declares none.

    Returns:
        The model's key when there is no user override; otherwise the
        override, provided the model declares nothing or declares the same
        key.

    Raises:
        ValueError: If both are set and they disagree.
    """
    user_override = _user_moe_activation_override()

    # No override: whatever the model says (possibly None) wins.
    if user_override is None:
        return model_activation_key

    keys_agree = model_activation_key is None or model_activation_key == user_override
    if not keys_agree:
        raise ValueError(
            f"checkpoint declares MoE activation={model_activation_key} but "
            f"quantization_config.moe.activation={user_override}; remove the "
            f"override or align it with the checkpoint."
        )
    return user_override

_user_moe_activation_override ¶

_user_moe_activation_override() -> QuantKey | None

User's MoE activation override from quantization_config, or None.

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp4.py

def _user_moe_activation_override() -> QuantKey | None:
    """Fetch the user-configured MoE activation override, if any.

    Reads ``model_config.quantization_config`` from the current vLLM config.

    Returns:
        ``quantization_config.moe.activation`` when the quantization config
        is a ``QuantizationConfigArgs`` with a non-``None`` ``moe`` section;
        ``None`` otherwise.
    """
    quant_args = get_current_vllm_config().model_config.quantization_config
    if isinstance(quant_args, QuantizationConfigArgs) and quant_args.moe is not None:
        return quant_args.moe.activation
    return None

convert_gpt_oss_weight_to_mxfp4_moe_kernel_format ¶

convert_gpt_oss_weight_to_mxfp4_moe_kernel_format(
    mxfp4_backend: Mxfp4MoeBackend,
    layer: Module,
    w13_weight: Tensor,
    w2_weight: Tensor,
    w13_weight_scale: Tensor,
    w2_weight_scale: Tensor,
    w13_bias: Tensor | None = None,
    w2_bias: Tensor | None = None,
    _cache_permute_indices: dict[Size, Tensor]
    | None = None,
) -> tuple[
    Tensor,
    Tensor,
    Union[Tensor, PrecisionConfig],
    Union[Tensor, PrecisionConfig],
    Tensor | None,
    Tensor | None,
]

Convert loaded weights into backend-specific kernel format.

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp4.py

def convert_gpt_oss_weight_to_mxfp4_moe_kernel_format(
    mxfp4_backend: Mxfp4MoeBackend,
    layer: torch.nn.Module,
    w13_weight: torch.Tensor,
    w2_weight: torch.Tensor,
    w13_weight_scale: torch.Tensor,
    w2_weight_scale: torch.Tensor,
    w13_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
    _cache_permute_indices: dict[torch.Size, torch.Tensor] | None = None,
) -> tuple[
    torch.Tensor,
    torch.Tensor,
    Union[torch.Tensor, "PrecisionConfig"],
    Union[torch.Tensor, "PrecisionConfig"],
    torch.Tensor | None,
    torch.Tensor | None,
]:
    """Convert loaded weights into backend-specific kernel format.

    Dispatches on ``mxfp4_backend`` and re-lays-out the GPT-OSS MoE weights,
    weight scales and biases for the selected kernel. Backend-specific
    helper libraries are imported lazily inside each branch.

    Note that several branches are not pure: the AITER branches assign to
    ``.data`` of the tensors passed in (and ``AITER_MXFP4_BF16`` copies into
    ``w13_weight`` in place), the HUMMING branch hands ``layer`` to
    ``prepare_humming_moe_layer``, and the ``AITER_MXFP4_FP8`` / Triton
    branches delete ``layer.w13_weight`` and ``layer.w2_weight``.

    Args:
        mxfp4_backend: Backend whose kernel layout should be produced.
        layer: Module owning the weights. Read for ``w13_input_scale`` /
            ``w2_input_scale`` (AITER_MXFP4_FP8), handed to the HUMMING and
            Marlin preparation helpers, and stripped of its weight
            attributes in the AITER_MXFP4_FP8 / Triton branches.
        w13_weight: Fused gate/up projection weight. Indexed as
            ``(num_experts, 2 * intermediate_size, hidden_size // 2)`` by
            the shape arithmetic in this function.
        w2_weight: Down projection weight.
        w13_weight_scale: Block scales for ``w13_weight``.
        w2_weight_scale: Block scales for ``w2_weight``.
        w13_bias: Optional bias for the fused gate/up projection. Required
            (asserted) by the TRTLLM and FlashInfer CUTLASS branches.
        w2_bias: Optional bias for the down projection. Required (asserted)
            by the TRTLLM and FlashInfer CUTLASS branches.
        _cache_permute_indices: Cache passed through to FlashInfer's
            ``get_w2_permute_indices_with_cache``. Required (asserted) by
            the TRTLLM branch; unused elsewhere.

    Returns:
        ``(w13_weight, w2_weight, w13_scale, w2_scale, w13_bias, w2_bias)``
        in the selected backend's layout. For the AITER_MXFP4_FP8 and Triton
        branches the two scale slots hold ``PrecisionConfig`` objects
        instead of tensors.

    Raises:
        ValueError: If ``mxfp4_backend`` is not handled by any branch, or if
            AITER_MXFP4_FP8 is selected and the layer has no static input
            scales.
        AssertionError: If a branch's required bias / cache argument is
            ``None`` (TRTLLM, FlashInfer CUTLASS).
    """

    # DeepGEMM: weights are returned untouched; only the scales are passed
    # through _upcast_e8m0_to_fp32.
    if mxfp4_backend == Mxfp4MoeBackend.DEEPGEMM_MXFP4:
        from vllm.model_executor.layers.quantization.utils.fp8_utils import (
            _upcast_e8m0_to_fp32,
        )

        return (
            w13_weight.data,
            w2_weight.data,
            _upcast_e8m0_to_fp32(w13_weight_scale.data),
            _upcast_e8m0_to_fp32(w2_weight_scale.data),
            w13_bias,
            w2_bias,
        )

    # Logical dimensions derived from the w13 weight shape.
    # w13 stacks two projections along dim 1, hence the // 2.
    # NOTE(review): the * 2 on the last dim presumably reflects two FP4
    # values packed per stored byte -- confirm against the weight loader.
    num_experts = w13_weight.shape[0]
    intermediate_size = w13_weight.shape[1] // 2
    hidden_size = w13_weight.shape[2] * 2

    sf_block_size = 32  # mxfp4 block size

    # HUMMING: the helper prepares the layer; the result is read back from
    # the layer's attributes rather than from the tensors passed in.
    if mxfp4_backend == Mxfp4MoeBackend.HUMMING:
        from vllm.model_executor.layers.quantization.utils.humming_utils import (
            prepare_humming_moe_layer,
        )

        prepare_humming_moe_layer(layer, {"quant_method": "gpt_oss_mxfp4"})
        return (
            layer.w13_weight,
            layer.w2_weight,
            layer.w13_weight_scale,
            layer.w2_weight_scale,
            getattr(layer, "w13_bias", None),
            getattr(layer, "w2_bias", None),
        )
    # Marlin / Batched Marlin: fully delegated to the Marlin FP4 helper,
    # whose return value is forwarded as-is.
    elif mxfp4_backend in (
        Mxfp4MoeBackend.MARLIN,
        Mxfp4MoeBackend.BATCHED_MARLIN,
    ):
        from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
            prepare_moe_mxfp4_layer_for_marlin,
        )

        return prepare_moe_mxfp4_layer_for_marlin(
            layer,
            w13_weight,
            w2_weight,
            w13_weight_scale,
            w2_weight_scale,
            w13_bias,
            w2_bias,
        )

    # FlashInfer TRTLLM: swap the interleaved w1/w3 rows, then permute every
    # expert's weights, scales and biases with FlashInfer's cached indices.
    elif mxfp4_backend in TRTLLM_BACKENDS:
        # The permutation-index cache is mandatory for this branch.
        assert _cache_permute_indices is not None
        from flashinfer.fp4_quantization import nvfp4_block_scale_interleave
        from flashinfer.fused_moe.core import get_w2_permute_indices_with_cache

        # gemm1_alpha/beta/clamp_limit are created by the expert class
        # (TrtLlmMxfp4ExpertsBase), not on the layer.

        # Work on the underlying tensors from here on.
        w13_weight = w13_weight.data
        w2_weight = w2_weight.data
        w13_weight_scale = w13_weight_scale.data
        w2_weight_scale = w2_weight_scale.data
        # Both biases are mandatory for this branch and are used as float32.
        assert w13_bias is not None and w2_bias is not None
        w13_bias = w13_bias.data.to(torch.float32)
        w2_bias = w2_bias.data.to(torch.float32)

        # Swap w1 and w3 as the definition of swiglu is different in trtllm-gen
        def swap_every_two_rows(x, axis=-1):
            # Split `axis` into (len // 2, 2) pairs, reverse each pair, and
            # restore the original shape.
            shape = x.shape
            if axis < 0:
                # Normalize a negative axis to its positive index.
                axis = len(shape) + axis
            new_shape = list(shape)
            new_shape[axis] = shape[axis] // 2
            new_shape.insert(axis + 1, 2)
            x = x.reshape(*new_shape)
            # Flip the size-2 dimension, i.e. exchange neighbouring entries.
            x = x.flip(axis + 1)
            # Back to the original shape.
            new_shape = list(shape)
            return x.reshape(*new_shape)

        # Weight and scale are swapped along their row dimension (-2), the
        # bias along its last dimension.
        w13_weight_scale = swap_every_two_rows(w13_weight_scale, -2)
        w13_weight = swap_every_two_rows(w13_weight, -2)
        w13_bias = swap_every_two_rows(w13_bias, -1)

        # Shuffle weights and scaling factors for transposed mma output
        gemm1_weights_shuffled = []
        gemm1_scales_shuffled = []
        gemm2_weights_shuffled = []
        gemm2_scales_shuffled = []
        gemm1_bias_shuffled = []
        gemm2_bias_shuffled = []
        epilogue_tile_m = 128
        # Each expert is permuted independently; the per-expert results are
        # stacked back together after the loop.
        for i in range(num_experts):
            # w13 weight
            permute_indices = get_w2_permute_indices_with_cache(
                _cache_permute_indices,
                w13_weight[i].view(torch.uint8),
                epilogue_tile_m,
            )
            # Indices are moved to the weight's device before the row gather.
            gemm1_weights_shuffled.append(
                w13_weight[i]
                .view(torch.uint8)[permute_indices.to(w13_weight.device)]
                .contiguous()
            )
            # w13 scale
            permute_sf_indices = get_w2_permute_indices_with_cache(
                _cache_permute_indices,
                w13_weight_scale[i].view(torch.uint8),
                epilogue_tile_m,
                num_elts_per_sf=16,
            )
            # Permuted scales are additionally block-interleaved.
            gemm1_scales_shuffled.append(
                nvfp4_block_scale_interleave(
                    w13_weight_scale[i]
                    .view(torch.uint8)[permute_sf_indices.to(w13_weight_scale.device)]
                    .contiguous()
                )
            )
            # w13 bias
            # The bias is treated as a single-column matrix for the permute.
            permute_bias_indices = get_w2_permute_indices_with_cache(
                _cache_permute_indices,
                w13_bias[i].clone().reshape(-1, 1),
                epilogue_tile_m,
            )
            gemm1_bias_shuffled.append(
                w13_bias[i]
                .clone()
                .reshape(-1, 1)[permute_bias_indices.to(w13_bias.device)]
                .contiguous()
            )
            # w2 weight
            permute_indices = get_w2_permute_indices_with_cache(
                _cache_permute_indices,
                w2_weight[i].view(torch.uint8),
                epilogue_tile_m,
            )
            gemm2_weights_shuffled.append(
                w2_weight[i]
                .view(torch.uint8)[permute_indices.to(w2_weight.device)]
                .contiguous()
            )
            # w2 scale
            permute_sf_indices = get_w2_permute_indices_with_cache(
                _cache_permute_indices,
                w2_weight_scale[i].view(torch.uint8),
                epilogue_tile_m,
                num_elts_per_sf=16,
            )
            gemm2_scales_shuffled.append(
                nvfp4_block_scale_interleave(
                    w2_weight_scale[i]
                    .view(torch.uint8)[permute_sf_indices.to(w2_weight_scale.device)]
                    .contiguous()
                )
            )
            # w2 bias
            permute_indices = get_w2_permute_indices_with_cache(
                _cache_permute_indices,
                w2_bias[i].clone().reshape(-1, 1),
                epilogue_tile_m,
            )
            gemm2_bias_shuffled.append(
                w2_bias[i]
                .clone()
                .reshape(-1, 1)[permute_indices.to(w2_bias.device)]
                .contiguous()
            )

        # Re-assemble the per-expert pieces. Scales are reshaped to one entry
        # per sf_block_size elements and reinterpreted as float8_e4m3fn.
        w13_weight = torch.stack(gemm1_weights_shuffled)
        w13_weight_scale = (
            torch.stack(gemm1_scales_shuffled)
            .reshape(num_experts, 2 * intermediate_size, hidden_size // sf_block_size)
            .view(torch.float8_e4m3fn)
        )
        w2_weight = torch.stack(gemm2_weights_shuffled)
        w2_weight_scale = (
            torch.stack(gemm2_scales_shuffled)
            .reshape(num_experts, hidden_size, intermediate_size // sf_block_size)
            .view(torch.float8_e4m3fn)
        )
        # Biases go back from single-column form to one row per expert.
        w13_bias = torch.stack(gemm1_bias_shuffled).reshape(num_experts, -1)
        w2_bias = torch.stack(gemm2_bias_shuffled).reshape(num_experts, -1)

        return (
            w13_weight,
            w2_weight,
            w13_weight_scale,
            w2_weight_scale,
            w13_bias,
            w2_bias,
        )

    # FlashInfer CUTLASS: the w13 de-interleave + swap is shared by both
    # variants; the final interleaving differs per variant below.
    elif mxfp4_backend in (
        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16,
        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
    ):
        # De-interleave and swap for w13 weight, bias, and scales
        # Even rows are taken as gate and odd rows as up; they are
        # concatenated as [gate, up], split in half, and re-joined with the
        # halves exchanged.
        w13_w = w13_weight.data
        gate_w, up_w = w13_w[:, ::2, :], w13_w[:, 1::2, :]
        deinterleaved_w13_w = torch.cat([gate_w, up_w], dim=1)
        w1_w, w3_w = torch.chunk(deinterleaved_w13_w, 2, dim=1)
        w13_weight_swapped = torch.cat([w3_w, w1_w], dim=1)

        # Both biases are mandatory for this branch. The w13 bias gets the
        # same de-interleave + swap in float32 and is returned as bfloat16.
        assert w13_bias is not None and w2_bias is not None
        w13_b = w13_bias.data.to(torch.float32)
        gate_b, up_b = w13_b[:, ::2], w13_b[:, 1::2]
        deinterleaved_w13_b = torch.cat([gate_b, up_b], dim=1)
        b1, b3 = torch.chunk(deinterleaved_w13_b, 2, dim=-1)
        w13_bias_swapped = torch.cat([b3, b1], dim=-1).to(torch.bfloat16)

        # Same treatment for the w13 scales.
        w13_s = w13_weight_scale.data
        gate_s, up_s = w13_s[:, ::2, :], w13_s[:, 1::2, :]
        deinterleaved_w13_s = torch.cat([gate_s, up_s], dim=1)
        s1, s3 = torch.chunk(deinterleaved_w13_s, 2, dim=1)
        w13_scale_swapped = torch.cat([s3, s1], dim=1)

        if mxfp4_backend == Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8:
            from flashinfer import block_scale_interleave

            # Scales are block-interleaved as uint8 and reshaped back to
            # their pre-interleave shape.
            orig_shape = w13_scale_swapped.shape
            w13_scale_interleaved = block_scale_interleave(
                w13_scale_swapped.view(torch.uint8)
            ).reshape(orig_shape)

            w2_s = w2_weight_scale.data
            orig_shape = w2_s.shape
            w2_scale_interleaved = block_scale_interleave(
                w2_s.view(torch.uint8)
            ).reshape(orig_shape)

            # w2_weight and w2_bias are returned exactly as passed in.
            return (
                w13_weight_swapped,
                w2_weight,
                w13_scale_interleaved,
                w2_scale_interleaved,
                w13_bias_swapped,
                w2_bias,
            )

        else:
            assert mxfp4_backend == Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16

            from flashinfer.fused_moe import (
                interleave_moe_scales_for_sm90_mixed_gemm,
                interleave_moe_weights_for_sm90_mixed_gemm,
            )

            # Weights and scales are interleaved with FlashInfer's SM90
            # mixed-GEMM helpers; w2_bias is returned exactly as passed in.
            w13_weight_interleaved = interleave_moe_weights_for_sm90_mixed_gemm(
                w13_weight_swapped.contiguous(), "fp4"
            )
            w2_weight_interleaved = interleave_moe_weights_for_sm90_mixed_gemm(
                w2_weight.contiguous(), "fp4"
            )
            w31_scales_interleaved = interleave_moe_scales_for_sm90_mixed_gemm(
                w13_scale_swapped.to(torch.uint8)
            )
            w2_scale_interleaved = interleave_moe_scales_for_sm90_mixed_gemm(
                w2_weight_scale.data.to(torch.uint8)
            )

            return (
                w13_weight_interleaved,
                w2_weight_interleaved,
                w31_scales_interleaved,
                w2_scale_interleaved,
                w13_bias_swapped,
                w2_bias,
            )

    # AITER MXFP4 x MXFP4. This branch rebinds .data on the scale and weight
    # tensors that were passed in.
    elif mxfp4_backend == Mxfp4MoeBackend.AITER_MXFP4_MXFP4:
        from vllm._aiter_ops import rocm_aiter_ops

        # Biases are optional here; when present they are used as float32.
        if w13_bias is not None:
            w13_bias = w13_bias.data.to(torch.float32)
        if w2_bias is not None:
            w2_bias = w2_bias.data.to(torch.float32)

        # e8m0_shuffle on weight scales (GFX950 swizzle layout)
        from aiter.utility.fp4_utils import e8m0_shuffle

        # The shuffle runs on a 2-D view (first two dims flattened); the
        # result is viewed back to 3-D.
        s0, s1, _ = w13_weight_scale.shape
        w13_weight_scale.data = e8m0_shuffle(w13_weight_scale.view(s0 * s1, -1)).view(
            s0, s1, -1
        )

        s0, s1, _ = w2_weight_scale.shape
        w2_weight_scale.data = e8m0_shuffle(w2_weight_scale.view(s0 * s1, -1)).view(
            s0, s1, -1
        )

        # View as native FP4 dtype
        # Skipped when this torch build does not expose float4_e2m1fn_x2.
        fp4_dtype = getattr(torch, "float4_e2m1fn_x2", None)
        if fp4_dtype is not None:
            w13_weight.data = w13_weight.data.view(fp4_dtype)
            w2_weight.data = w2_weight.data.view(fp4_dtype)

        # Shuffle weights for AITER CK kernel
        shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
            w13_weight, w2_weight
        )
        # Tag the results as already shuffled.
        shuffled_w13.is_shuffled = True
        shuffled_w2.is_shuffled = True

        return (
            shuffled_w13,
            shuffled_w2,
            w13_weight_scale,
            w2_weight_scale,
            w13_bias,
            w2_bias,
        )

    # AITER MXFP4 weights with BF16 activations. This branch overwrites
    # w13_weight's storage in place and rebinds .data on the weight and
    # scale tensors that were passed in.
    elif mxfp4_backend == Mxfp4MoeBackend.AITER_MXFP4_BF16:
        from vllm._aiter_ops import rocm_aiter_ops

        # Biases are optional here; when present they are used as float32.
        if w13_bias is not None:
            w13_bias = w13_bias.data.to(torch.float32)
        if w2_bias is not None:
            w2_bias = w2_bias.data.to(torch.float32)

        e, n, k = w13_weight.shape

        # De-interleave w13 rows: gate/up pairs -> contiguous gate, up blocks
        # copy_ writes the permuted bytes back into the existing storage.
        w13_weight.view(torch.uint8).copy_(
            w13_weight.data.view(torch.uint8)
            .view(e, n // 2, 2, k)
            .permute(0, 2, 1, 3)
            .contiguous()
            .view(e, n, k)
        )
        # Same row permutation for the w13 scales (rebinding .data).
        w13_weight_scale.data = (
            w13_weight_scale.data.view(e, n // 2, 2, -1)
            .permute(0, 2, 1, 3)
            .contiguous()
            .view(e, n, -1)
        )

        # View as native FP4 dtype for AITER shuffle
        # Unlike the AITER_MXFP4_MXFP4 branch, the dtype is accessed
        # directly here, with no fallback if torch lacks it.
        w13_weight.data = w13_weight.data.view(torch.float4_e2m1fn_x2)
        w2_weight.data = w2_weight.data.view(torch.float4_e2m1fn_x2)

        # Shuffle weights and scales for AITER CK kernel layout
        # Scales are flattened to 2-D before shuffling. The trailing boolean
        # is True for w13 and False for w2.
        w13_weight.data = rocm_aiter_ops.shuffle_weight_a16w4(w13_weight, 16, True)
        shuffled_w13_scale = rocm_aiter_ops.shuffle_scale_a16w4(
            w13_weight_scale.view(-1, w13_weight_scale.shape[-1]),
            num_experts,
            True,
        )

        w2_weight.data = rocm_aiter_ops.shuffle_weight_a16w4(w2_weight, 16, False)
        shuffled_w2_scale = rocm_aiter_ops.shuffle_scale_a16w4(
            w2_weight_scale.view(-1, w2_weight_scale.shape[-1]),
            num_experts,
            False,
        )

        # Permute bias to match de-interleaved weight layout
        if w13_bias is not None:
            w13_bias = (
                w13_bias.data.view(-1, n // 2, 2)
                .permute(0, 2, 1)
                .contiguous()
                .view(-1, n)
            )

        return (
            w13_weight,
            w2_weight,
            shuffled_w13_scale,
            shuffled_w2_scale,
            w13_bias,
            w2_bias,
        )

    elif mxfp4_backend == Mxfp4MoeBackend.AITER_MXFP4_FP8:
        # W4A8: MXFP4 weights + static FP8 activations (triton kernel)
        from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
        from triton_kernels.numerics import InFlexData

        # Biases are optional here; when present they are used as float32.
        if w13_bias is not None:
            w13_bias = w13_bias.to(torch.float32)
        if w2_bias is not None:
            w2_bias = w2_bias.to(torch.float32)

        # Process static FP8 input scales (reduce to scalar, warn if not uniform)
        w13_input_scale = layer.w13_input_scale
        w2_input_scale = layer.w2_input_scale
        if w13_input_scale is None or w2_input_scale is None:
            raise ValueError(
                "W4A8 (AITER_MXFP4_FP8) requires static input scales, but found "
                "w13_input_scale or w2_input_scale is None."
            )
        if not all_close_1d(w13_input_scale) or not all_close_1d(w2_input_scale):
            logger.warning_once(
                "Found input_scales that are not equal for "
                "fp8 MoE layer. Using the maximum across experts "
                "for each layer."
            )
        # The maximum is taken whether or not the warning fired.
        w13_input_scale = w13_input_scale.max().to(torch.float32)
        w2_input_scale = w2_input_scale.max().to(torch.float32)

        # Swizzle weights for GFX950
        w13_weight, w13_flex, w13_scale = _swizzle_mxfp4(w13_weight, w13_weight_scale)
        w2_weight, w2_flex, w2_scale = _swizzle_mxfp4(w2_weight, w2_weight_scale)

        # Create InFlexData for activation scales
        lhs_data13 = InFlexData(scale=w13_input_scale)
        lhs_data2 = InFlexData(scale=w2_input_scale)

        # Create PrecisionConfig with both weight and activation info
        w13_precision_config = PrecisionConfig(
            weight_scale=w13_scale,
            flex_ctx=FlexCtx(rhs_data=w13_flex, lhs_data=lhs_data13),
        )
        w2_precision_config = PrecisionConfig(
            weight_scale=w2_scale,
            flex_ctx=FlexCtx(rhs_data=w2_flex, lhs_data=lhs_data2),
        )

        # Remove the weight attributes from the layer; the swizzled tensors
        # are handed back to the caller via the return value.
        del layer.w13_weight
        del layer.w2_weight

        return (
            w13_weight,
            w2_weight,
            w13_precision_config,
            w2_precision_config,
            w13_bias,
            w2_bias,
        )

    # Triton backends: same swizzle + PrecisionConfig flow as the
    # AITER_MXFP4_FP8 branch, but without activation (lhs) scale data.
    elif mxfp4_backend in TRITON_BACKENDS:
        from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig

        # Biases are optional here; when present they are used as float32.
        if w13_bias is not None:
            w13_bias = w13_bias.to(torch.float32)
        if w2_bias is not None:
            w2_bias = w2_bias.to(torch.float32)

        w13_weight, w13_flex, w13_scale = _swizzle_mxfp4(
            w13_weight,
            w13_weight_scale,
        )
        w2_weight, w2_flex, w2_scale = _swizzle_mxfp4(
            w2_weight,
            w2_weight_scale,
        )

        w13_precision_config = PrecisionConfig(
            weight_scale=w13_scale, flex_ctx=FlexCtx(rhs_data=w13_flex)
        )
        w2_precision_config = PrecisionConfig(
            weight_scale=w2_scale, flex_ctx=FlexCtx(rhs_data=w2_flex)
        )

        # Remove the weight attributes from the layer; the swizzled tensors
        # are handed back to the caller via the return value.
        del layer.w13_weight
        del layer.w2_weight

        return (
            w13_weight,
            w2_weight,
            w13_precision_config,
            w2_precision_config,
            w13_bias,
            w2_bias,
        )
    elif mxfp4_backend == Mxfp4MoeBackend.XPU:
        # No additional transformation needed for XPU backend
        return (
            w13_weight,
            w2_weight,
            w13_weight_scale,
            w2_weight_scale,
            w13_bias,
            w2_bias,
        )
    # CPU: weights and scales are packed by the CPU MoE helper; biases are
    # optional and used as float32 when present.
    elif mxfp4_backend == Mxfp4MoeBackend.CPU:
        from vllm.model_executor.layers.fused_moe.experts.cpu_moe import (
            prepare_mxfp4_moe_layer_for_cpu,
        )

        packed_w13, packed_w2, packed_w13_scale, packed_w2_scale = (
            prepare_mxfp4_moe_layer_for_cpu(
                w13_weight.data,
                w2_weight.data,
                w13_weight_scale.data,
                w2_weight_scale.data,
            )
        )
        if w13_bias is not None:
            w13_bias = w13_bias.data.to(torch.float32)
        if w2_bias is not None:
            w2_bias = w2_bias.data.to(torch.float32)
        return (
            packed_w13,
            packed_w2,
            packed_w13_scale,
            packed_w2_scale,
            w13_bias,
            w2_bias,
        )
    elif mxfp4_backend == Mxfp4MoeBackend.EMULATION:
        # No additional transformation needed for emulation backend,
        # weights are dequantized on the fly in the experts class.
        return (
            w13_weight,
            w2_weight,
            w13_weight_scale,
            w2_weight_scale,
            w13_bias,
            w2_bias,
        )
    else:
        # No branch above matched this backend.
        raise ValueError(
            f"Unsupported mxfp4_backend: {mxfp4_backend}: "
            f"should be one of: {list(Mxfp4MoeBackend)}."
        )

convert_weight_to_mxfp4_moe_kernel_format ¶

convert_weight_to_mxfp4_moe_kernel_format(
    mxfp4_backend: Mxfp4MoeBackend,
    layer: Module,
    w13_weight: Tensor,
    w2_weight: Tensor,
    w13_weight_scale: Tensor,
    w2_weight_scale: Tensor,
    w13_bias: Tensor | None = None,
    w2_bias: Tensor | None = None,
    _cache_permute_indices: dict[Size, Tensor]
    | None = None,
) -> tuple[
    Tensor,
    Tensor,
    Union[Tensor, PrecisionConfig],
    Union[Tensor, PrecisionConfig],
    Tensor | None,
    Tensor | None,
]

Convert loaded weights into backend-specific kernel format.

Supports DeepGEMM, TRTLLM MXFP8, Triton and Marlin backends.

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp4.py

def convert_weight_to_mxfp4_moe_kernel_format(
    mxfp4_backend: Mxfp4MoeBackend,
    layer: torch.nn.Module,
    w13_weight: torch.Tensor,
    w2_weight: torch.Tensor,
    w13_weight_scale: torch.Tensor,
    w2_weight_scale: torch.Tensor,
    w13_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
    _cache_permute_indices: dict[torch.Size, torch.Tensor] | None = None,
) -> tuple[
    torch.Tensor,
    torch.Tensor,
    Union[torch.Tensor, "PrecisionConfig"],
    Union[torch.Tensor, "PrecisionConfig"],
    torch.Tensor | None,
    torch.Tensor | None,
]:
    """Convert loaded weights into backend-specific kernel format.

    The work done depends on ``mxfp4_backend``:

    * ``DEEPGEMM_MXFP4``: weights are returned unchanged (their ``.data``);
      both scales are passed through ``_upcast_e8m0_to_fp32``.
    * ``HUMMING``: ``prepare_humming_moe_layer`` is invoked on ``layer`` and
      the result is read back from ``layer``'s attributes (the tensor
      arguments of this function are not used).
    * ``MARLIN`` / ``BATCHED_MARLIN``: fully delegated to
      ``prepare_moe_mxfp4_layer_for_marlin``.
    * ``TRTLLM_BACKENDS``: the two halves of w13 (weights, scales, bias) are
      swapped and interleaved, then weights/scales/biases are permuted with
      indices from ``get_w2_permute_indices_with_cache`` and the scales are
      run through ``nvfp4_block_scale_interleave``. Biases are cast to
      float32.
    * ``AITER_MXFP4_BF16``: w13 rows are regrouped (even rows first, then odd
      rows) and weights/scales are shuffled via ``rocm_aiter_ops``. This
      branch mutates ``w13_weight``, ``w2_weight`` and ``w13_weight_scale``
      in place. Biases are cast to float32.
    * ``TRITON_BACKENDS``: weights/scales are swizzled via ``_swizzle_mxfp4``
      and the two scale slots of the result hold ``PrecisionConfig`` objects
      instead of tensors. ``layer.w13_weight`` and ``layer.w2_weight`` are
      deleted from ``layer``. Biases are cast to float32.

    Args:
        mxfp4_backend: Backend whose kernel layout should be produced.
        layer: Module owning the weights; used by the Humming, Marlin and
            Triton branches.
        w13_weight: Fused first-GEMM expert weights. Indexed as a 3-D tensor
            (experts, rows, packed columns) by the code below.
        w2_weight: Second-GEMM expert weights.
        w13_weight_scale: Block scales for ``w13_weight``.
        w2_weight_scale: Block scales for ``w2_weight``.
        w13_bias: Optional bias for the first GEMM.
        w2_bias: Optional bias for the second GEMM.
        _cache_permute_indices: Cache dict handed to
            ``get_w2_permute_indices_with_cache``; must not be ``None`` for
            the TRTLLM backends.

    Returns:
        ``(w13_weight, w2_weight, w13_scale, w2_scale, w13_bias, w2_bias)``
        in the layout expected by the selected backend.

    Raises:
        ValueError: If ``mxfp4_backend`` is not handled by any branch.
        AssertionError: If a TRTLLM backend is selected without
            ``_cache_permute_indices``.
    """

    if mxfp4_backend == Mxfp4MoeBackend.DEEPGEMM_MXFP4:
        # Imported lazily so the dependency is only needed for this backend.
        from vllm.model_executor.layers.quantization.utils.fp8_utils import (
            _upcast_e8m0_to_fp32,
        )

        # Weights stay as uint8 packed FP4 — no layout change needed.
        # Convert E8M0 uint8 scales to float32.
        return (
            w13_weight.data,
            w2_weight.data,
            _upcast_e8m0_to_fp32(w13_weight_scale.data),
            _upcast_e8m0_to_fp32(w2_weight_scale.data),
            w13_bias,
            w2_bias,
        )

    if mxfp4_backend == Mxfp4MoeBackend.HUMMING:
        from vllm.model_executor.layers.quantization.utils.humming_utils import (
            prepare_humming_moe_layer,
        )

        # The helper operates on ``layer`` itself; the converted tensors are
        # then read back from the layer rather than from the arguments.
        prepare_humming_moe_layer(layer, {"quant_method": "mxfp4"})
        return (
            layer.w13_weight,
            layer.w2_weight,
            layer.w13_weight_scale,
            layer.w2_weight_scale,
            # Biases are optional attributes on the layer.
            getattr(layer, "w13_bias", None),
            getattr(layer, "w2_bias", None),
        )

    if mxfp4_backend in (Mxfp4MoeBackend.MARLIN, Mxfp4MoeBackend.BATCHED_MARLIN):
        from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
            prepare_moe_mxfp4_layer_for_marlin,
        )

        # Marlin (standard and batched) shares one preparation routine whose
        # return value is forwarded unchanged.
        return prepare_moe_mxfp4_layer_for_marlin(
            layer,
            w13_weight,
            w2_weight,
            w13_weight_scale,
            w2_weight_scale,
            w13_bias,
            w2_bias,
        )

    # Dimensions derived from w13: dim 1 holds both fused halves (hence // 2).
    # The last dim is doubled — presumably because two FP4 values are packed
    # per byte; TODO confirm against the weight loader.
    num_experts = w13_weight.shape[0]
    intermediate_size = w13_weight.shape[1] // 2
    hidden_size = w13_weight.shape[2] * 2

    sf_block_size = 32  # mxfp4 block size

    if mxfp4_backend in TRTLLM_BACKENDS:
        # The permutation-index cache is mandatory for this path.
        assert _cache_permute_indices is not None
        from flashinfer.fp4_quantization import nvfp4_block_scale_interleave
        from flashinfer.fused_moe.core import get_w2_permute_indices_with_cache

        # Work on the underlying tensors; biases are converted to float32.
        w13_weight = w13_weight.data
        w2_weight = w2_weight.data
        w13_weight_scale = w13_weight_scale.data
        w2_weight_scale = w2_weight_scale.data
        if w13_bias is not None:
            w13_bias = w13_bias.data.to(torch.float32)
        if w2_bias is not None:
            w2_bias = w2_bias.data.to(torch.float32)

        # Swap w1/w3 and interleave to match TRTLLM SwiGLU convention.
        # Standard loading gives contiguous [w1/gate, w3/up].
        # TRTLLM kernel expects interleaved [w3_0, w1_0, w3_1, w1_1, ...].
        w1_weight = w13_weight[:, :intermediate_size, :]
        w3_weight = w13_weight[:, intermediate_size:, :]
        w13_weight = torch.stack([w3_weight, w1_weight], dim=2).reshape(
            w13_weight.shape
        )

        # Apply the same swap + interleave to the w13 scales.
        w1_scale = w13_weight_scale[:, :intermediate_size, :]
        w3_scale = w13_weight_scale[:, intermediate_size:, :]
        w13_weight_scale = torch.stack([w3_scale, w1_scale], dim=2).reshape(
            w13_weight_scale.shape
        )

        # ... and to the w13 bias, when present.
        if w13_bias is not None:
            b1 = w13_bias[:, :intermediate_size]
            b3 = w13_bias[:, intermediate_size:]
            w13_bias = torch.stack([b3, b1], dim=2).reshape(w13_bias.shape)

        # Shuffle weights and scaling factors for transposed mma output.
        # Permute indices depend only on shape (cached by torch.Size),
        # so compute once and apply to all experts via batched indexing.
        epilogue_tile_m = 128

        # w13 weight permute
        # Indices are computed from expert 0 and applied along dim 1 of every
        # expert at once.
        w13_perm = get_w2_permute_indices_with_cache(
            _cache_permute_indices,
            w13_weight[0].view(torch.uint8),
            epilogue_tile_m,
        ).to(w13_weight.device)
        w13_weight = w13_weight.view(torch.uint8)[:, w13_perm].contiguous()

        # w13 scale permute + interleave
        w13_sf_perm = get_w2_permute_indices_with_cache(
            _cache_permute_indices,
            w13_weight_scale[0].view(torch.uint8),
            epilogue_tile_m,
            num_elts_per_sf=16,
        ).to(w13_weight_scale.device)
        w13_s = w13_weight_scale.view(torch.uint8)[:, w13_sf_perm].contiguous()
        E, N_s, K_s = w13_s.shape
        # The interleave helper is fed a 2-D view (experts folded into rows);
        # the result is reshaped back per expert and reinterpreted as
        # float8_e4m3fn.
        w13_weight_scale = (
            nvfp4_block_scale_interleave(w13_s.reshape(E * N_s, K_s))
            .reshape(num_experts, 2 * intermediate_size, hidden_size // sf_block_size)
            .view(torch.float8_e4m3fn)
        )

        # w2 weight permute
        w2_perm = get_w2_permute_indices_with_cache(
            _cache_permute_indices,
            w2_weight[0].view(torch.uint8),
            epilogue_tile_m,
        ).to(w2_weight.device)
        w2_weight = w2_weight.view(torch.uint8)[:, w2_perm].contiguous()

        # w2 scale permute + interleave
        w2_sf_perm = get_w2_permute_indices_with_cache(
            _cache_permute_indices,
            w2_weight_scale[0].view(torch.uint8),
            epilogue_tile_m,
            num_elts_per_sf=16,
        ).to(w2_weight_scale.device)
        w2_s = w2_weight_scale.view(torch.uint8)[:, w2_sf_perm].contiguous()
        E2, N2_s, K2_s = w2_s.shape
        w2_weight_scale = (
            nvfp4_block_scale_interleave(w2_s.reshape(E2 * N2_s, K2_s))
            .reshape(num_experts, hidden_size, intermediate_size // sf_block_size)
            .view(torch.float8_e4m3fn)
        )

        # w13 bias permute
        # The bias is viewed as a column vector per expert so the same
        # permutation helper can be reused, then flattened back.
        if w13_bias is not None:
            w13_b_perm = get_w2_permute_indices_with_cache(
                _cache_permute_indices,
                w13_bias[0].reshape(-1, 1),
                epilogue_tile_m,
            ).to(w13_bias.device)
            w13_bias = w13_bias.reshape(num_experts, -1, 1)[:, w13_b_perm].reshape(
                num_experts, -1
            )

        # w2 bias permute
        if w2_bias is not None:
            w2_b_perm = get_w2_permute_indices_with_cache(
                _cache_permute_indices,
                w2_bias[0].reshape(-1, 1),
                epilogue_tile_m,
            ).to(w2_bias.device)
            w2_bias = w2_bias.reshape(num_experts, -1, 1)[:, w2_b_perm].reshape(
                num_experts, -1
            )

        return (
            w13_weight,
            w2_weight,
            w13_weight_scale,
            w2_weight_scale,
            w13_bias,
            w2_bias,
        )

    elif mxfp4_backend == Mxfp4MoeBackend.AITER_MXFP4_BF16:
        from vllm._aiter_ops import rocm_aiter_ops

        if w13_bias is not None:
            w13_bias = w13_bias.data.to(torch.float32)
        if w2_bias is not None:
            w2_bias = w2_bias.data.to(torch.float32)

        e, n, k = w13_weight.shape

        # Regroup w13 rows in place: viewing dim 1 as (n // 2, 2) and moving
        # the size-2 axis in front places all even-indexed rows first,
        # followed by all odd-indexed rows.
        w13_weight.view(torch.uint8).copy_(
            w13_weight.data.view(torch.uint8)
            .view(e, n // 2, 2, k)
            .permute(0, 2, 1, 3)
            .contiguous()
            .view(e, n, k)
        )
        # Same row regrouping for the w13 scales.
        w13_weight_scale.data = (
            w13_weight_scale.data.view(e, n // 2, 2, -1)
            .permute(0, 2, 1, 3)
            .contiguous()
            .view(e, n, -1)
        )

        # Reinterpret the storage as packed FP4 (two values per element)
        # before handing the weights to the AITER shuffle helpers.
        w13_weight.data = w13_weight.data.view(torch.float4_e2m1fn_x2)
        w2_weight.data = w2_weight.data.view(torch.float4_e2m1fn_x2)

        # NOTE(review): the trailing True/False flag distinguishes the w13
        # call from the w2 call; its exact meaning is defined by
        # rocm_aiter_ops — confirm there.
        w13_weight.data = rocm_aiter_ops.shuffle_weight_a16w4(w13_weight, 16, True)
        shuffled_w13_scale = rocm_aiter_ops.shuffle_scale_a16w4(
            w13_weight_scale.view(-1, w13_weight_scale.shape[-1]),
            num_experts,
            True,
        )

        w2_weight.data = rocm_aiter_ops.shuffle_weight_a16w4(w2_weight, 16, False)
        shuffled_w2_scale = rocm_aiter_ops.shuffle_scale_a16w4(
            w2_weight_scale.view(-1, w2_weight_scale.shape[-1]),
            num_experts,
            False,
        )

        # The w13 bias gets the same even/odd regrouping as the w13 rows.
        if w13_bias is not None:
            w13_bias = (
                w13_bias.data.view(-1, n // 2, 2)
                .permute(0, 2, 1)
                .contiguous()
                .view(-1, n)
            )

        return (
            w13_weight,
            w2_weight,
            shuffled_w13_scale,
            shuffled_w2_scale,
            w13_bias,
            w2_bias,
        )

    elif mxfp4_backend in TRITON_BACKENDS:
        from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig

        # Only the TRITON backend interleaves w13; the other Triton backends
        # just cast the bias.
        if mxfp4_backend == Mxfp4MoeBackend.TRITON:

            # Interleave the two halves of the last dimension:
            # [a_0..a_m, b_0..b_m] -> [a_0, b_0, a_1, b_1, ...].
            def shuffle_weight(w: torch.Tensor) -> torch.Tensor:
                shape = w.shape
                n = shape[-1]
                first = w[..., : n // 2]
                second = w[..., n // 2 :]
                stacked = torch.stack((first, second), dim=-1)
                return stacked.reshape(shape)

            w13_weight = shuffle_weight(w13_weight)
            w13_weight_scale = shuffle_weight(w13_weight_scale)

            if w13_bias is not None:
                w13_bias = shuffle_weight(w13_bias.to(torch.float32))
        else:
            if w13_bias is not None:
                w13_bias = w13_bias.to(torch.float32)

        if w2_bias is not None:
            w2_bias = w2_bias.to(torch.float32)

        # _swizzle_mxfp4 returns (weight, flex data, scale) for each GEMM.
        w13_weight, w13_flex, w13_scale = _swizzle_mxfp4(
            w13_weight,
            w13_weight_scale,
        )
        w2_weight, w2_flex, w2_scale = _swizzle_mxfp4(
            w2_weight,
            w2_weight_scale,
        )

        # The scale and flex data are bundled into PrecisionConfig objects,
        # which take the place of the scale tensors in the return value.
        w13_precision_config = PrecisionConfig(
            weight_scale=w13_scale, flex_ctx=FlexCtx(rhs_data=w13_flex)
        )
        w2_precision_config = PrecisionConfig(
            weight_scale=w2_scale, flex_ctx=FlexCtx(rhs_data=w2_flex)
        )

        # Drop the layer's original weight attributes — presumably to release
        # the unswizzled copies; the swizzled tensors are returned instead.
        del layer.w13_weight
        del layer.w2_weight

        return (
            w13_weight,
            w2_weight,
            w13_precision_config,
            w2_precision_config,
            w13_bias,
            w2_bias,
        )
    else:
        # Backends without a conversion branch in this function end up here.
        raise ValueError(
            f"Unsupported mxfp4_backend for Mxfp4MoEMethod: {mxfp4_backend}. "
            f"Expected TRTLLM, Triton, or AITER backend."
        )

make_mxfp4_moe_kernel ¶

make_mxfp4_moe_kernel(
    moe_quant_config: FusedMoEQuantConfig,
    moe_config: FusedMoEConfig,
    experts_cls: type[FusedMoEExperts],
    mxfp4_backend: Mxfp4MoeBackend,
    routing_tables: tuple[Tensor, Tensor, Tensor]
    | None = None,
    layer: RoutedExperts | None = None,
) -> FusedMoEKernel

Create a FusedMoEKernel for the given MXFP4 backend.

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp4.py

def make_mxfp4_moe_kernel(
    moe_quant_config: FusedMoEQuantConfig,
    moe_config: FusedMoEConfig,
    experts_cls: type[mk.FusedMoEExperts],
    mxfp4_backend: Mxfp4MoeBackend,
    routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
    layer: "RoutedExperts | None" = None,
) -> mk.FusedMoEKernel:
    """Assemble the ``FusedMoEKernel`` used for an MXFP4 backend.

    A prepare/finalize object is created first, then ``experts_cls`` is
    instantiated with the arguments that match its activation format, and
    finally both are wrapped into a ``FusedMoEKernel``.

    Args:
        moe_quant_config: Quantization config passed to prepare/finalize and
            to the experts.
        moe_config: MoE config passed to prepare/finalize and to the experts.
        experts_cls: Experts implementation to instantiate.
        mxfp4_backend: Selected backend; ``HUMMING`` additionally requires
            ``layer`` and the TRTLLM backends never run in place.
        routing_tables: Optional routing tables forwarded to
            ``maybe_make_prepare_finalize``.
        layer: Layer forwarded to the experts for the ``HUMMING`` backend.

    Returns:
        The constructed ``FusedMoEKernel``.
    """
    prepare_finalize = maybe_make_prepare_finalize(
        moe=moe_config,
        quant_config=moe_quant_config,
        routing_tables=routing_tables,
        allow_new_interface=True,
        use_monolithic=issubclass(experts_cls, mk.FusedMoEExpertsMonolithic),
    )
    assert prepare_finalize is not None

    logger.info_once("Using %s", prepare_finalize.__class__.__name__)

    # Only the Humming experts receive the owning layer.
    backend_kwargs = {}
    if mxfp4_backend == Mxfp4MoeBackend.HUMMING:
        assert layer is not None
        backend_kwargs["layer"] = layer

    uses_batched_format = (
        prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts
    )
    if not uses_batched_format:
        experts = experts_cls(
            moe_config=moe_config,
            quant_config=moe_quant_config,
            **backend_kwargs,
        )
    else:
        # Batched experts also need the per-rank token limit and the number
        # of dispatchers.
        tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
        assert tokens_per_rank is not None
        experts = experts_cls(
            moe_config=moe_config,
            quant_config=moe_quant_config,
            max_num_tokens=tokens_per_rank,
            num_dispatchers=prepare_finalize.num_dispatchers(),
            **backend_kwargs,
        )

    # In-place execution is off when disabled in the config or when a TRTLLM
    # backend is used.
    return mk.FusedMoEKernel(
        prepare_finalize,
        experts,
        inplace=not (
            moe_config.disable_inplace or mxfp4_backend in TRTLLM_BACKENDS
        ),
    )

make_mxfp4_moe_quant_config ¶

make_mxfp4_moe_quant_config(
    mxfp4_backend: Mxfp4MoeBackend,
    w1_scale: Union[Tensor, PrecisionConfig],
    w2_scale: Union[Tensor, PrecisionConfig],
    gemm1_alpha: float | None = None,
    gemm1_beta: float | None = None,
    swiglu_limit: float | None = None,
    w1_bias: Tensor | None = None,
    w2_bias: Tensor | None = None,
    a1_scale: Tensor | None = None,
    a2_scale: Tensor | None = None,
    layer: Module | None = None,
) -> FusedMoEQuantConfig | None

Create a FusedMoEQuantConfig for the given MXFP4 backend.

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp4.py

def make_mxfp4_moe_quant_config(
    mxfp4_backend: Mxfp4MoeBackend,
    w1_scale: Union[torch.Tensor, "PrecisionConfig"],
    w2_scale: Union[torch.Tensor, "PrecisionConfig"],
    gemm1_alpha: float | None = None,
    gemm1_beta: float | None = None,
    swiglu_limit: float | None = None,
    w1_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
    a1_scale: torch.Tensor | None = None,
    a2_scale: torch.Tensor | None = None,
    layer: torch.nn.Module | None = None,
) -> FusedMoEQuantConfig | None:
    """Build the ``FusedMoEQuantConfig`` that matches ``mxfp4_backend``.

    Args:
        mxfp4_backend: Backend the config is built for.
        w1_scale: Weight scale (or ``PrecisionConfig``) of the first GEMM.
        w2_scale: Weight scale (or ``PrecisionConfig``) of the second GEMM.
        gemm1_alpha: Forwarded as ``gemm1_alpha`` where the factory takes it.
        gemm1_beta: Forwarded as ``gemm1_beta`` where the factory takes it.
        swiglu_limit: Forwarded as ``gemm1_clamp_limit``.
        w1_bias: Optional bias of the first GEMM.
        w2_bias: Optional bias of the second GEMM.
        a1_scale: Activation scale, used only by ``AITER_MXFP4_FP8``.
        a2_scale: Activation scale, used only by ``AITER_MXFP4_FP8``.
        layer: Layer used only by ``HUMMING``; must be a ``FusedMoE``.

    Returns:
        The quant config produced by the backend-specific factory.
    """

    def _from_weights(factory, **extra):
        # Most factories share the same bias/scale/GEMM1 keyword arguments.
        return factory(
            w1_bias=w1_bias,
            w2_bias=w2_bias,
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            gemm1_alpha=gemm1_alpha,
            gemm1_beta=gemm1_beta,
            gemm1_clamp_limit=swiglu_limit,
            **extra,
        )

    if mxfp4_backend == Mxfp4MoeBackend.DEEPGEMM_MXFP4:
        from vllm.model_executor.layers.quantization.utils.quant_utils import (
            GroupShape,
        )

        # DeepGEMM FP4 quantizes activations to FP8 per token group with
        # block 128, the same as the FP8 DeepGEMM path.
        act_dtype = current_platform.fp8_dtype()
        act_block = GroupShape(128, 128)
        return FusedMoEQuantConfig(
            _a1=FusedMoEQuantDesc(act_dtype, act_block, None, None, None, None),
            _a2=FusedMoEQuantDesc(act_dtype, act_block, None, None, None, None),
            _w1=FusedMoEQuantDesc("mxfp4", None, w1_scale, None, None, w1_bias),
            _w2=FusedMoEQuantDesc("mxfp4", None, w2_scale, None, None, w2_bias),
            gemm1_alpha=gemm1_alpha,
            gemm1_beta=gemm1_beta,
            gemm1_clamp_limit=swiglu_limit,
        )

    if mxfp4_backend == Mxfp4MoeBackend.HUMMING:
        from vllm.model_executor.layers.fused_moe.layer import FusedMoE
        from vllm.model_executor.layers.quantization.utils.humming_utils import (
            get_humming_moe_quant_config,
        )

        assert isinstance(layer, FusedMoE)
        return get_humming_moe_quant_config(
            layer,
            gemm1_alpha=gemm1_alpha,
            gemm1_beta=gemm1_beta,
            gemm1_clamp_limit=swiglu_limit,
        )

    if mxfp4_backend == Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8:
        # The TRTLLM kernel wants non-swizzled mxfp8 activation scales.
        return _from_weights(
            mxfp4_mxfp8_moe_quant_config,
            mx_alignment=256,
            is_scale_swizzled=False,
        )

    if mxfp4_backend == Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8:
        # The CUTLASS kernel wants swizzled mxfp8 activation scales.
        return _from_weights(mxfp4_mxfp8_moe_quant_config, is_scale_swizzled=True)

    if mxfp4_backend == Mxfp4MoeBackend.AITER_MXFP4_FP8:
        # W4A8: MXFP4 weights combined with static FP8 activations.
        return mxfp4_w4a8_moe_quant_config(
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            a1_scale=a1_scale,
            a2_scale=a2_scale,
            w1_bias=w1_bias,
            w2_bias=w2_bias,
            block_shape=None,
            gemm1_clamp_limit=swiglu_limit,
        )

    w4a16_backends = (
        Mxfp4MoeBackend.MARLIN,
        Mxfp4MoeBackend.BATCHED_MARLIN,
        Mxfp4MoeBackend.TRITON,
        Mxfp4MoeBackend.TRITON_UNFUSED,
        Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16,
        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16,
        Mxfp4MoeBackend.AITER_MXFP4_BF16,
        Mxfp4MoeBackend.CPU,
    )
    if mxfp4_backend in w4a16_backends:
        return _from_weights(mxfp4_w4a16_moe_quant_config)

    # AITER_MXFP4_MXFP4 and every backend not matched above use the OCP MX
    # config with mxfp4 as the quantized dtype.
    return _from_weights(ocp_mx_moe_quant_config, quant_dtype="mxfp4")

map_mxfp4_backend ¶

map_mxfp4_backend(
    runner_backend: MoEBackend,
) -> list[Mxfp4MoeBackend]

Map a moe_backend string to its candidate Mxfp4MoeBackends.

Vendor families return all activation variants; the caller picks one via activation_key and is_supported_config.

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp4.py

def map_mxfp4_backend(runner_backend: MoEBackend) -> list[Mxfp4MoeBackend]:
    """Translate a ``moe_backend`` name into candidate ``Mxfp4MoeBackend``s.

    Family names (e.g. ``"flashinfer_trtllm"``, ``"aiter"``) yield every
    activation variant of that family; the caller narrows the list down via
    ``activation_key`` and ``is_supported_config``.

    Args:
        runner_backend: The configured ``moe_backend`` name.

    Returns:
        The candidate backends registered for ``runner_backend``.

    Raises:
        ValueError: If ``runner_backend`` has no MXFP4 mapping.
    """
    backend = Mxfp4MoeBackend
    candidates_by_name: dict[str, list[Mxfp4MoeBackend]] = {
        "deep_gemm": [backend.DEEPGEMM_MXFP4],
        "flashinfer_trtllm": [
            backend.FLASHINFER_TRTLLM_MXFP4_BF16,
            backend.FLASHINFER_TRTLLM_MXFP4_MXFP8,
        ],
        "flashinfer_trtllm_afp8": [backend.FLASHINFER_TRTLLM_MXFP4_MXFP8],
        "flashinfer_cutlass": [
            backend.FLASHINFER_CUTLASS_MXFP4_BF16,
            backend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
        ],
        "flashinfer_cutlass_afp8": [backend.FLASHINFER_CUTLASS_MXFP4_MXFP8],
        "triton": [backend.TRITON],
        "triton_unfused": [backend.TRITON_UNFUSED],
        "humming": [backend.HUMMING],
        "marlin": [backend.MARLIN],
        "aiter": [
            backend.AITER_MXFP4_BF16,
            backend.AITER_MXFP4_FP8,
            backend.AITER_MXFP4_MXFP4,
        ],
        "aiter_mxfp4_fp8": [backend.AITER_MXFP4_FP8],
        "aiter_mxfp4_mxfp4": [backend.AITER_MXFP4_MXFP4],
        "xpu": [backend.XPU],
        "cpu": [backend.CPU],
        "emulation": [backend.EMULATION],
    }

    selected = candidates_by_name.get(runner_backend)
    if not selected:
        raise ValueError(
            f"moe_backend='{runner_backend}' is not supported for MXFP4 MoE. "
            f"Expected one of {list(candidates_by_name)}."
        )
    return selected

mxfp4_round_up_hidden_size_and_intermediate_size ¶

mxfp4_round_up_hidden_size_and_intermediate_size(
    backend: Mxfp4MoeBackend,
    hidden_size: int,
    intermediate_size: int,
) -> tuple[int, int]

Round up hidden_size and intermediate_size based on backend requirements.

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp4.py

def mxfp4_round_up_hidden_size_and_intermediate_size(
    backend: Mxfp4MoeBackend, hidden_size: int, intermediate_size: int
) -> tuple[int, int]:
    """Pad ``hidden_size`` and ``intermediate_size`` to the backend's alignment.

    Args:
        backend: Backend whose alignment requirements apply.
        hidden_size: Unpadded hidden size.
        intermediate_size: Unpadded intermediate size.

    Returns:
        ``(hidden_size, intermediate_size)`` after rounding up. For backends
        without a dedicated rule only ``intermediate_size`` is rounded (to a
        multiple of 64) and ``hidden_size`` is returned unchanged.
    """
    # ``hidden_multiple`` of None means "leave hidden_size as it is".
    hidden_multiple: int | None
    if backend == Mxfp4MoeBackend.DEEPGEMM_MXFP4:
        # DeepGEMM requires M/N/K alignment.
        hidden_multiple, intermediate_multiple = 128, 128
    elif backend in (Mxfp4MoeBackend.MARLIN, Mxfp4MoeBackend.BATCHED_MARLIN):
        hidden_multiple = 128 if current_platform.is_xpu() else 256
        intermediate_multiple = 128
    elif backend in TRTLLM_BACKENDS:
        hidden_multiple, intermediate_multiple = 256, 256
    elif backend in (
        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16,
        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
    ):
        hidden_multiple, intermediate_multiple = 128, 128
    elif current_platform.is_rocm():
        hidden_multiple, intermediate_multiple = 256, 256
    elif backend == Mxfp4MoeBackend.CPU:
        # The CPU AMX kernel uses BLOCK_N=32, so align to 32.
        hidden_multiple, intermediate_multiple = 32, 32
    else:
        hidden_multiple, intermediate_multiple = None, 64

    padded_intermediate = round_up(intermediate_size, intermediate_multiple)
    if hidden_multiple is None:
        padded_hidden = hidden_size
    else:
        padded_hidden = round_up(hidden_size, hidden_multiple)
    return padded_hidden, padded_intermediate

select_deepseek_v4_mxfp4_moe_backend ¶

select_deepseek_v4_mxfp4_moe_backend(
    config: FusedMoEConfig,
) -> tuple[Mxfp4MoeBackend, type[FusedMoEExperts] | None]

Select the MXFP4 MoE backend with MXFP8 activation as top priority. Falls back through BF16 and other backends.

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp4.py

def select_deepseek_v4_mxfp4_moe_backend(
    config: FusedMoEConfig,
) -> tuple[Mxfp4MoeBackend, type[mk.FusedMoEExperts] | None]:
    """Choose the MXFP4 MoE backend and experts class for DeepSeek-V4.

    An explicit ``config.moe_backend`` (anything other than ``"auto"``) is
    honored first: each of its mapped variants is tried in order with that
    variant's own activation key. Otherwise a priority list is walked and the
    first backend/kernel pair that supports the config wins.

    Args:
        config: MoE configuration to select a backend for.

    Returns:
        The selected backend and its experts class.

    Raises:
        ValueError: If an explicitly requested backend rejects the config
            (the error of the last variant tried is re-raised).
        NotImplementedError: If no backend in the priority list supports the
            config.
    """
    use_batched = config.moe_parallel_config.use_batched_activation_format
    if use_batched:
        activation_format = mk.FusedMoEActivationFormat.BatchedExperts
    else:
        activation_format = mk.FusedMoEActivationFormat.Standard

    # An explicit moe_backend (e.g. "marlin", "triton_unfused") takes
    # precedence over the automatic priority list.
    runner_backend = config.moe_backend
    if runner_backend != "auto":
        explicit_backends = []
        for mapped in map_mxfp4_backend(runner_backend):
            # With the batched activation format Marlin is swapped for its
            # batched variant.
            if use_batched and mapped == Mxfp4MoeBackend.MARLIN:
                mapped = Mxfp4MoeBackend.BATCHED_MARLIN
            explicit_backends.append(mapped)

        failure: Exception | None = None
        for candidate in explicit_backends:
            try:
                return _return_or_raise(
                    candidate,
                    config,
                    kMxfp4Static,
                    _backend_activation_key(candidate),
                    activation_format,
                )
            except ValueError as exc:
                # Remember the rejection and try the next variant.
                failure = exc
        assert failure is not None
        raise failure

    # DeepSeek-V4 on ROCm is more accurate with the unfused Triton MXFP4 path
    # than with the default AITER path, so Triton-unfused goes first there and
    # AITER stays as the fallback when Triton-unfused rejects the config.
    rocm_deepseek_v4 = (
        current_platform.is_rocm()
        and config.routing_method == RoutingMethodType.DeepseekV4
    )
    if rocm_deepseek_v4:
        ordered_backends = [
            Mxfp4MoeBackend.TRITON_UNFUSED,
            Mxfp4MoeBackend.AITER_MXFP4_BF16,
        ]
    else:
        ordered_backends = _get_priority_backends()

    # Walk the backends in priority order and take the first kernel class
    # that accepts the configuration.
    for backend in ordered_backends:
        backend_act_key = _backend_activation_key(backend)
        for kernel_cls in backend_to_kernel_cls(backend):
            supported, reason = kernel_cls.is_supported_config(
                kernel_cls, config, kMxfp4Static, backend_act_key, activation_format
            )
            if not supported:
                logger.debug_once(_make_log_unsupported(backend, reason), scope="local")
                continue
            logger.info_once(_make_log_backend(backend), scope="local")
            return backend, kernel_cls

    raise NotImplementedError(
        "No MXFP4 MoE backend supports the deployment configuration."
    )

select_mxfp4_moe_backend ¶

select_mxfp4_moe_backend(
    config: FusedMoEConfig,
    activation_key: QuantKey | None = None,
) -> tuple[Mxfp4MoeBackend, type[FusedMoEExperts] | None]

Select the primary MXFP4 MoE backend.

Parameters:

Name	Type	Description	Default
`config`	`FusedMoEConfig`	MoE configuration	required
`activation_key`	`QuantKey \| None`	Optional activation quantization key. If provided, overrides the default activation key for backend selection. Use kFp8StaticTensorSym for W4A8 scheme.	`None`

Note: Shape-specific fallbacks may still occur at runtime.

Source code in vllm/model_executor/layers/fused_moe/oracle/mxfp4.py

def select_mxfp4_moe_backend(
    config: FusedMoEConfig,
    activation_key: QuantKey | None = None,
) -> tuple[Mxfp4MoeBackend, type[mk.FusedMoEExperts] | None]:
    """
    Select the primary MXFP4 MoE backend.

    Selection happens in this order:

    1. An explicit ``config.moe_backend`` (anything other than ``"auto"``).
    2. The ``VLLM_USE_FLASHINFER_MOE_MXFP4_*`` / ``VLLM_MXFP4_USE_MARLIN``
       environment overrides.
    3. The first backend in the activation-filtered priority list that
       reports the configuration as supported.
    4. The XPU / CPU platform backends.

    Args:
        config: MoE configuration
        activation_key: Optional activation quantization key. If provided,
            overrides the default activation key for backend selection.
            Use kFp8StaticTensorSym for W4A8 scheme.

    Returns:
        ``(backend, experts_cls)`` for the chosen backend, or
        ``(Mxfp4MoeBackend.NONE, None)`` when nothing matched and the
        platform is neither CUDA nor ROCm.

    Raises:
        ValueError: If the explicitly requested ``moe_backend`` has no variant
            for the requested activation, if every candidate variant of it
            was rejected with a ``ValueError`` (the last one is re-raised), or
            if ``VLLM_USE_FLASHINFER_MOE_MXFP4_BF16=1`` is set on a device
            that is neither SM90 nor in the SM100 family.
        NotImplementedError: On CUDA/ROCm when no backend supports the
            configuration.

    Note: Shape-specific fallbacks may still occur at runtime.
    """
    requested_activation_key = _resolve_activation_key(activation_key)

    if config.moe_parallel_config.use_batched_activation_format:
        activation_format = mk.FusedMoEActivationFormat.BatchedExperts
    else:
        activation_format = mk.FusedMoEActivationFormat.Standard

    def _activation_for(candidate: Mxfp4MoeBackend) -> QuantKey | None:
        # A caller-requested key wins; otherwise use the backend's own default.
        if requested_activation_key is None:
            return _backend_activation_key(candidate)
        return requested_activation_key

    def _pick(candidate: Mxfp4MoeBackend, key: QuantKey | None):
        # Every forced selection shares the same weight key and format.
        return _return_or_raise(
            candidate, config, kMxfp4Static, key, activation_format
        )

    # ------------------------------------------------------------------
    # 1. Explicit --moe-backend: only variants of that backend are tried.
    # ------------------------------------------------------------------
    moe_backend_name = config.moe_backend
    if moe_backend_name != "auto":
        variants = map_mxfp4_backend(moe_backend_name)
        if activation_format == mk.FusedMoEActivationFormat.BatchedExperts:
            # Swap MARLIN for its batched variant under the batched format.
            swapped = []
            for variant in variants:
                if variant == Mxfp4MoeBackend.MARLIN:
                    swapped.append(Mxfp4MoeBackend.BATCHED_MARLIN)
                else:
                    swapped.append(variant)
            variants = swapped

        matching = _filter_by_activation(variants, requested_activation_key)
        if not matching:
            variant_names = [v.name for v in variants]
            raise ValueError(
                f"moe_backend={moe_backend_name!r} does not support "
                f"activation={requested_activation_key}; supported variants: "
                f"{variant_names}"
            )

        # Try each variant in order; remember the most recent rejection so it
        # can be surfaced if none of them is accepted.
        failure: Exception | None = None
        for choice in matching:
            choice_key = _activation_for(choice)
            try:
                return _pick(choice, choice_key)
            except ValueError as exc:
                failure = exc
        assert failure is not None
        raise failure

    # Priority-ordered candidates for automatic selection.
    auto_candidates = _filter_by_activation(
        _get_priority_backends_for_gpt_oss(), requested_activation_key
    )

    # ------------------------------------------------------------------
    # 2. Environment-variable overrides.
    # ------------------------------------------------------------------
    # FlashInfer MXFP4 BF16: "=1" forces it, "=0" removes it from the
    # automatic candidates.
    if envs.is_set("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16"):
        if envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16:
            if current_platform.is_device_capability(90):
                forced = Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16
            elif current_platform.is_device_capability_family(100):
                forced = Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16
            else:
                raise ValueError(
                    "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16=1 is set but the "
                    "current device capability is not supported. "
                    "Only SM90 (CUTLASS) and SM100+ (TRTLLM) are supported."
                )
            return _pick(forced, None)

        for excluded in (
            Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16,
            Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16,
        ):
            if excluded in auto_candidates:
                auto_candidates.remove(excluded)

    # FlashInfer MXFP4 weights + MXFP8 activations, TRTLLM kernel.
    if (
        envs.is_set("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8")
        and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
    ):
        return _pick(Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8, kMxfp8Dynamic)

    # FlashInfer MXFP4 weights + MXFP8 activations, CUTLASS kernel.
    if (
        envs.is_set("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS")
        and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS
    ):
        return _pick(Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8, kMxfp8Dynamic)

    # Marlin MXFP4.
    if envs.is_set("VLLM_MXFP4_USE_MARLIN") and envs.VLLM_MXFP4_USE_MARLIN:
        return _pick(Mxfp4MoeBackend.MARLIN, None)

    # ------------------------------------------------------------------
    # 3. Automatic selection: first kernel class that accepts the config.
    # ------------------------------------------------------------------
    for backend in auto_candidates:
        backend_key = _activation_for(backend)
        for k_cls in backend_to_kernel_cls(backend):
            supported, reason = k_cls.is_supported_config(
                k_cls, config, kMxfp4Static, backend_key, activation_format
            )
            if not supported:
                logger.debug_once(_make_log_unsupported(backend, reason))
                continue
            logger.info_once(_make_log_backend(backend))
            return backend, k_cls

    # ------------------------------------------------------------------
    # 4. Platform-specific backends (XPU is checked before CPU).
    # ------------------------------------------------------------------
    if current_platform.is_xpu():
        logger.info_once(_make_log_backend(Mxfp4MoeBackend.XPU))
        return _pick(Mxfp4MoeBackend.XPU, None)

    if current_platform.is_cpu():
        logger.info_once(_make_log_backend(Mxfp4MoeBackend.CPU))
        return _pick(Mxfp4MoeBackend.CPU, None)

    # Platforms other than CUDA/ROCm get the NONE sentinel instead of an error.
    if not (current_platform.is_cuda() or current_platform.is_rocm()):
        return Mxfp4MoeBackend.NONE, None

    raise NotImplementedError(
        "No MXFP4 MoE backend supports the deployment configuration. "
        f"weight_key=kMxfp4Static, activation_key={activation_key}. "
        "Native backends require specific hardware. "
        "Set `VLLM_LOGGING_LEVEL=DEBUG` to see detailed unsupported reasons. "
        "To use the emulation backend for research/debugging, pass "
        "--moe-backend emulation."
    )