Skip to content

vllm.model_executor.models.config

HybridAttentionMambaModelConfig

Bases: VerifyAndUpdateConfig

Source code in vllm/model_executor/models/config.py
class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
    @classmethod
    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        """
        Run early setup and validation for hybrid attention/mamba models.

        Aligning the cache block size with mamba page sizes is deferred to
        Platform.update_block_size_for_backend(), which executes once the
        model layers are built and the attention backend is known.

        Args:
            vllm_config: vLLM Config
        """
        cache = vllm_config.cache_config

        # KV-scale calibration must be skipped for hybrid models: the
        # recurrent state is uninitialized during the calibration pass and
        # corrupts the computed scales.
        # See issue: https://github.com/vllm-project/vllm/issues/37554
        if cache.calculate_kv_scales:
            logger.warning(
                "Disabling calculate_kv_scales for hybrid model '%s'. "
                "Hybrid models with recurrent layers (GDN, Mamba, SSM) "
                "produce unreliable KV cache scales during the "
                "calibration pass because recurrent state is "
                "uninitialized. Using default scale of 1.0 instead.",
                vllm_config.model_config.model,
            )
            cache.calculate_kv_scales = False

        # Delegate to the mamba handler (enables FULL_AND_PIECEWISE default).
        MambaModelConfig.verify_and_update_config(vllm_config)

verify_and_update_config classmethod

verify_and_update_config(vllm_config: VllmConfig) -> None

Perform early validation and setup for hybrid attention/mamba models.

Block size alignment with mamba page sizes is handled later by Platform.update_block_size_for_backend(), which runs after model layers are constructed and the attention backend is known.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `vllm_config` | `VllmConfig` | vLLM Config | required |
Source code in vllm/model_executor/models/config.py
@classmethod
def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
    """
    Early validation and setup for hybrid attention/mamba models.

    Block-size alignment with mamba page sizes is not handled here; it is
    done later by Platform.update_block_size_for_backend(), after model
    layers exist and the attention backend has been selected.

    Args:
        vllm_config: vLLM Config
    """
    kv_cfg = vllm_config.cache_config

    if kv_cfg.calculate_kv_scales:
        # Calibration would run over uninitialized recurrent state and
        # yield garbage scales, so fall back to the default scale of 1.0.
        # See issue: https://github.com/vllm-project/vllm/issues/37554
        logger.warning(
            "Disabling calculate_kv_scales for hybrid model '%s'. "
            "Hybrid models with recurrent layers (GDN, Mamba, SSM) "
            "produce unreliable KV cache scales during the "
            "calibration pass because recurrent state is "
            "uninitialized. Using default scale of 1.0 instead.",
            vllm_config.model_config.model,
        )
        kv_cfg.calculate_kv_scales = False

    # Inherit mamba defaults (FULL_AND_PIECEWISE cuda graph mode).
    MambaModelConfig.verify_and_update_config(vllm_config)

LlamaNemotronVLConfig

Bases: VerifyAndUpdateConfig

Config handler for LlamaNemotronVL embedding models.

Source code in vllm/model_executor/models/config.py
class LlamaNemotronVLConfig(VerifyAndUpdateConfig):
    """Config handler for LlamaNemotronVL embedding models."""

    @staticmethod
    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
        from vllm.config.pooler import SequencePoolingType

        hf_config = model_config.hf_config

        # Embedding models attend bidirectionally, so disable causal
        # masking on both the top-level and nested language-model configs.
        hf_config.is_causal = False
        if hasattr(hf_config, "llm_config"):
            hf_config.llm_config.is_causal = False

        # Surface the vision patch size at the top level when available.
        if hasattr(hf_config, "vision_config"):
            hf_config.patch_size = hf_config.vision_config.patch_size

        # Resolve the pooling strategy: prefer the top-level setting, then
        # fall back to the language-model sub-config (default "avg").
        pooling = getattr(hf_config, "pooling", None)
        if pooling is None and hasattr(hf_config, "llm_config"):
            pooling = getattr(hf_config.llm_config, "pooling", "avg")

        # Translate HF pooling names to vLLM sequence pooling types.
        translation: dict[str, SequencePoolingType] = {
            "avg": "MEAN",
            "cls": "CLS",
            "last": "LAST",
        }
        seq_pooling = translation.get(pooling)
        if seq_pooling is None:
            raise ValueError(f"pool_type {pooling!r} not supported")

        model_config.pooler_config.seq_pooling_type = seq_pooling

MambaModelConfig

Bases: VerifyAndUpdateConfig

Source code in vllm/model_executor/models/config.py
class MambaModelConfig(VerifyAndUpdateConfig):
    @classmethod
    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        """
        Configure the mamba cache mode and mamba block size according to
        whether prefix caching is enabled (FULL_AND_PIECEWISE cuda graph
        mode is enabled by default, which is required for good mamba layer
        performance in V1).

        Args:
            vllm_config: vLLM Config
        """
        model_cfg = vllm_config.model_config
        cache_cfg = vllm_config.cache_config

        if not cache_cfg.enable_prefix_caching:
            # No prefix caching: turn the mamba cache off and, unless
            # explicitly set, size the mamba block to the full context.
            if cache_cfg.mamba_cache_mode != "none":
                cache_cfg.mamba_cache_mode = "none"
                logger.warning(
                    "Mamba cache mode is set to 'none' when prefix caching is disabled"
                )
            if cache_cfg.mamba_block_size is None:
                cache_cfg.mamba_block_size = model_cfg.max_model_len
            return

        # Prefix caching path: choose a default cache mode when unset.
        if cache_cfg.mamba_cache_mode == "none":
            cache_cfg.mamba_cache_mode = (
                "all" if model_cfg.supports_mamba_prefix_caching else "align"
            )
            logger.warning(
                "Mamba cache mode is set to '%s' for %s by default "
                "when prefix caching is enabled",
                cache_cfg.mamba_cache_mode,
                model_cfg.architecture,
            )
        # 'all' needs model support; otherwise degrade to 'align'.
        if (
            cache_cfg.mamba_cache_mode == "all"
            and not model_cfg.supports_mamba_prefix_caching
        ):
            cache_cfg.mamba_cache_mode = "align"
            logger.warning(
                "Hybrid or mamba-based model detected without support "
                "for prefix caching with Mamba cache 'all' mode: "
                "falling back to 'align' mode."
            )
        if cache_cfg.mamba_cache_mode == "align":
            assert vllm_config.scheduler_config.enable_chunked_prefill, (
                "Chunked prefill is required for mamba cache mode 'align'."
            )
        logger.info(
            "Warning: Prefix caching in Mamba cache '%s' "
            "mode is currently enabled. "
            "Its support for Mamba layers is experimental. "
            "Please report any issues you may observe.",
            cache_cfg.mamba_cache_mode,
        )
        # Mamba block size defaults to max_model_len (handled above for the
        # no-prefix-caching case); with prefix caching, align it to the
        # attention block size, the basic granularity of prefix caching.
        if cache_cfg.mamba_block_size is None:
            cache_cfg.mamba_block_size = cache_cfg.block_size

verify_and_update_config classmethod

verify_and_update_config(vllm_config: VllmConfig) -> None

Enable FULL_AND_PIECEWISE cuda graph mode by default (required to get good performance for mamba layers in V1).

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `vllm_config` | `VllmConfig` | vLLM Config | required |
Source code in vllm/model_executor/models/config.py
@classmethod
def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
    """
    Set up mamba caching defaults (FULL_AND_PIECEWISE cuda graph mode is
    enabled by default, required to get good performance for mamba layers
    in V1): pick a mamba cache mode and a mamba block size consistent with
    the prefix-caching setting.

    Args:
        vllm_config: vLLM Config
    """
    mconf = vllm_config.model_config
    cconf = vllm_config.cache_config

    if cconf.enable_prefix_caching:
        if cconf.mamba_cache_mode == "none":
            # Pick the strongest mode the model supports by default.
            cconf.mamba_cache_mode = (
                "all" if mconf.supports_mamba_prefix_caching else "align"
            )
            logger.warning(
                "Mamba cache mode is set to '%s' for %s by default "
                "when prefix caching is enabled",
                cconf.mamba_cache_mode,
                mconf.architecture,
            )
        unsupported_all = (
            cconf.mamba_cache_mode == "all"
            and not mconf.supports_mamba_prefix_caching
        )
        if unsupported_all:
            # 'all' was requested but the model cannot honor it.
            cconf.mamba_cache_mode = "align"
            logger.warning(
                "Hybrid or mamba-based model detected without support "
                "for prefix caching with Mamba cache 'all' mode: "
                "falling back to 'align' mode."
            )
        if cconf.mamba_cache_mode == "align":
            assert vllm_config.scheduler_config.enable_chunked_prefill, (
                "Chunked prefill is required for mamba cache mode 'align'."
            )
        logger.info(
            "Warning: Prefix caching in Mamba cache '%s' "
            "mode is currently enabled. "
            "Its support for Mamba layers is experimental. "
            "Please report any issues you may observe.",
            cconf.mamba_cache_mode,
        )
        # With prefix caching on, align the mamba block size to the
        # attention block size — the basic prefix-caching granularity.
        if cconf.mamba_block_size is None:
            cconf.mamba_block_size = cconf.block_size
    else:
        # Prefix caching disabled: mamba cache is forced off and the
        # block size defaults to the full context length.
        if cconf.mamba_cache_mode != "none":
            cconf.mamba_cache_mode = "none"
            logger.warning(
                "Mamba cache mode is set to 'none' when prefix caching is disabled"
            )
        if cconf.mamba_block_size is None:
            cconf.mamba_block_size = mconf.max_model_len

NemotronHForCausalLMConfig

Bases: VerifyAndUpdateConfig

Source code in vllm/model_executor/models/config.py
class NemotronHForCausalLMConfig(VerifyAndUpdateConfig):
    @staticmethod
    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
        """Resolve an 'auto' mamba_ssm_cache_dtype for NemotronH models.

        When the cache dtype is 'auto' (or not explicitly set), it is taken
        from the HF config's mamba_ssm_cache_dtype field, defaulting to
        float16 when that field is absent.
        """
        cache_config = vllm_config.cache_config
        if cache_config.mamba_ssm_cache_dtype != "auto":
            # User already picked an explicit dtype; leave it alone.
            return
        resolved = getattr(
            vllm_config.model_config.hf_config, "mamba_ssm_cache_dtype", "float16"
        )
        logger.info(
            "Updating mamba_ssm_cache_dtype to '%s' for NemotronH model",
            resolved,
        )
        cache_config.mamba_ssm_cache_dtype = resolved

verify_and_update_config staticmethod

verify_and_update_config(vllm_config: VllmConfig) -> None

Update mamba_ssm_cache_dtype for NemotronH models when set to 'auto' (or not explicitly set), to the value specified in the HF config, or to float16 if not specified.

Source code in vllm/model_executor/models/config.py
@staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
    """Update mamba_ssm_cache_dtype for NemotronH models when set to 'auto'
    (or not explicitly set), to the value specified in the HF config, or to
    float16 if not specified.
    """
    cache_config = vllm_config.cache_config
    if cache_config.mamba_ssm_cache_dtype == "auto":
        hf_config = vllm_config.model_config.hf_config
        mamba_ssm_cache_dtype = getattr(
            hf_config, "mamba_ssm_cache_dtype", "float16"
        )
        logger.info(
            "Updating mamba_ssm_cache_dtype to '%s' for NemotronH model",
            mamba_ssm_cache_dtype,
        )
        cache_config.mamba_ssm_cache_dtype = mamba_ssm_cache_dtype

Qwen3_5ForConditionalGenerationConfig

Bases: VerifyAndUpdateConfig

Source code in vllm/model_executor/models/config.py
class Qwen3_5ForConditionalGenerationConfig(VerifyAndUpdateConfig):
    @staticmethod
    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
        """Sync mamba_ssm_cache_dtype with a Qwen3.5 HF config.

        When the cache dtype is 'auto' (or not explicitly set), adopt the
        HF config's mamba_ssm_dtype field if present. When the user picked
        an explicit dtype that disagrees with the HF config, keep the
        user's choice but warn about the mismatch.
        """
        cache_config = vllm_config.cache_config
        hf_dtype = getattr(
            vllm_config.model_config.hf_text_config, "mamba_ssm_dtype", None
        )
        if hf_dtype is None:
            # HF config states no preference; nothing to sync against.
            return
        if cache_config.mamba_ssm_cache_dtype == "auto":
            cache_config.mamba_ssm_cache_dtype = hf_dtype
        elif cache_config.mamba_ssm_cache_dtype != hf_dtype:
            logger.warning(
                "Qwen3.5 model specifies mamba_ssm_dtype='%s' in its config, "
                "but --mamba-ssm-cache-dtype='%s' was passed. "
                "Using the user-specified value.",
                hf_dtype,
                cache_config.mamba_ssm_cache_dtype,
            )

verify_and_update_config staticmethod

verify_and_update_config(vllm_config: VllmConfig) -> None

Update mamba_ssm_cache_dtype for Qwen3.5 models when set to 'auto' (or not explicitly set), to the value specified in the HF config's mamba_ssm_dtype field. Warn if the user explicitly overrides it to a different value.

Source code in vllm/model_executor/models/config.py
@staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
    """Update mamba_ssm_cache_dtype for Qwen3.5 models when set to 'auto'
    (or not explicitly set), to the value specified in the HF config's
    mamba_ssm_dtype field. Warn if the user explicitly overrides it to a
    different value.
    """
    cache_config = vllm_config.cache_config
    hf_text_config = vllm_config.model_config.hf_text_config
    mamba_ssm_dtype = getattr(hf_text_config, "mamba_ssm_dtype", None)
    if cache_config.mamba_ssm_cache_dtype == "auto":
        if mamba_ssm_dtype is not None:
            cache_config.mamba_ssm_cache_dtype = mamba_ssm_dtype
    elif (
        mamba_ssm_dtype is not None
        and cache_config.mamba_ssm_cache_dtype != mamba_ssm_dtype
    ):
        logger.warning(
            "Qwen3.5 model specifies mamba_ssm_dtype='%s' in its config, "
            "but --mamba-ssm-cache-dtype='%s' was passed. "
            "Using the user-specified value.",
            mamba_ssm_dtype,
            cache_config.mamba_ssm_cache_dtype,
        )