# Source code for ralph.mcp.multimodal.capabilities

"""Multimodal capability detection and delivery policy.

This module is the single source of truth for provider/model identity,
capability detection, and delivery policy decisions. All runtime layers that
need to determine whether a modality can be delivered must derive their answer
from this module rather than re-declaring provider knowledge elsewhere.
"""

from __future__ import annotations

from dataclasses import dataclass

from ralph.mcp.multimodal._capability_verdict import CapabilityVerdict
from ralph.mcp.multimodal._delivery_mode import DeliveryMode
from ralph.mcp.multimodal._multimodal_model_identity import MultimodalModelIdentity
from ralph.mcp.multimodal.artifacts import SUPPORTED_MODALITIES

# ---------------------------------------------------------------------------
# Per-provider inline-image support
# ---------------------------------------------------------------------------


def _claude_supports_inline_image(model_id: str | None) -> bool:
    """Report whether a Claude model accepts inline images.

    Every Claude model is treated as vision-capable, so the answer is always
    ``True`` regardless of ``model_id``.
    """
    del model_id  # unused: the verdict does not vary by model
    return True


def _openai_supports_inline_image(model_id: str | None) -> bool:
    """Report whether an OpenAI model accepts inline images.

    A missing ``model_id`` is answered optimistically with ``True``. Otherwise
    the id must begin with one of the known vision-capable family prefixes;
    the comparison is case-sensitive.
    """
    vision_families = ("gpt-4o", "gpt-4-vision", "gpt-4-turbo", "gpt-4", "o1", "o3")
    # str.startswith accepts a tuple and matches if any prefix fits.
    return model_id is None or model_id.startswith(vision_families)


def _gemini_supports_inline_image(model_id: str | None) -> bool:
    """Report whether a Gemini model accepts inline images.

    Every Gemini model is treated as vision-capable, so the answer is always
    ``True`` regardless of ``model_id``.
    """
    del model_id  # unused: the verdict does not vary by model
    return True


def _inline_image_reason(provider: str, model_id: str | None) -> str | None:
    """Explain why inline image delivery is available, if it is.

    ``provider`` is compared exactly as given (no case folding happens here).
    Returns a human-readable reason when the provider's inline-image check
    passes for ``model_id``; returns ``None`` when the check fails or when the
    provider has no inline-image rule at all.
    """
    if provider in {"claude", "anthropic"}:
        supported = _claude_supports_inline_image(model_id)
        message = "Claude supports inline image delivery"
    elif provider in {"openai", "codex"}:
        supported = _openai_supports_inline_image(model_id)
        message = "OpenAI model supports inline image delivery"
    elif provider == "gemini":
        supported = _gemini_supports_inline_image(model_id)
        message = "Gemini supports inline image delivery"
    else:
        # No rule for this provider, so no per-provider check is consulted.
        return None
    return message if supported else None


# Typed-block support, keyed first by provider and then by modality.
# _TYPED_BLOCK_SUPPORT[provider][modality] is the block_type string used for
# TYPED_BLOCK delivery; a missing provider or modality means no typed block.
_PDF_AND_DOCUMENT_BLOCKS: dict[str, str] = {"pdf": "pdf", "document": "document"}

# Each provider gets its own dict (unpacked copy) so the entries never alias.
_TYPED_BLOCK_SUPPORT: dict[str, dict[str, str]] = {
    "claude": {**_PDF_AND_DOCUMENT_BLOCKS},
    "anthropic": {**_PDF_AND_DOCUMENT_BLOCKS},
    "gemini": {**_PDF_AND_DOCUMENT_BLOCKS, "audio": "audio", "video": "video"},
}


# ---------------------------------------------------------------------------
# Per-provider non-image modality support matrix
# ---------------------------------------------------------------------------

# For each known provider, the modalities that Ralph's managed MCP runtime
# path explicitly cannot deliver. A provider absent from this table has
# nothing blocked here and falls through to the safe resource_reference
# default.
#
# UNSUPPORTED: Ralph cannot deliver the modality through its managed path for
# this provider because the model API does not accept it.
# RESOURCE_REFERENCE: the agent can fetch the bytes via resources/read and try
# to relay them to the model in a provider-appropriate form.
_AUDIO_VIDEO: frozenset[str] = frozenset(("audio", "video"))
_AUDIO_VIDEO_PDF_DOCUMENT: frozenset[str] = _AUDIO_VIDEO | frozenset(("pdf", "document"))

_PROVIDER_UNSUPPORTED_MODALITIES: dict[str, frozenset[str]] = {
    # Claude/Anthropic: the API takes no audio or video input. Images and PDFs
    # are deliverable (inline or via document blocks), and documents (.docx,
    # .pptx, .xlsx) go through document blocks on models that support them.
    "claude": _AUDIO_VIDEO,
    "anthropic": _AUDIO_VIDEO,
    # OpenAI/Codex: the chat completion API does not take PDFs, documents,
    # audio, or video as raw bytes through Ralph's managed MCP path; only
    # images work (on vision-capable models). These are marked UNSUPPORTED so
    # the agent gets an explicit failure rather than a resource_reference the
    # model cannot process.
    "openai": _AUDIO_VIDEO_PDF_DOCUMENT,
    "codex": _AUDIO_VIDEO_PDF_DOCUMENT,
    # Gemini: audio, video, PDFs, and documents are all supported natively, so
    # nothing is blocked.
    "gemini": frozenset(),
}

# Human-readable explanation attached to UNSUPPORTED verdicts, per provider.
# Every message shares one template and differs only in the display name.
# A provider without an entry here gets a generic message supplied by
# get_delivery_mode().
_PROVIDER_UNSUPPORTED_REASON: dict[str, str] = {
    provider: f"{display_name} does not accept this modality via Ralph's managed MCP path"
    for provider, display_name in (
        ("claude", "Claude"),
        ("anthropic", "Anthropic"),
        ("openai", "OpenAI"),
        ("codex", "Codex"),
    )
}



def get_delivery_mode(
    identity: MultimodalModelIdentity,
    modality: str,
) -> CapabilityVerdict:
    """Determine how to deliver a modality for the given model identity.

    Returns a CapabilityVerdict whose delivery mode is one of:

    - INLINE_IMAGE: provider accepts inline base64 image data.
    - TYPED_BLOCK: provider accepts a named typed block (pdf, document, audio,
      video); the verdict's ``block_type`` names the block.
    - RESOURCE_REFERENCE_REPLAY: the multimodal surface stays visible via a
      resource reference replay handle. Used for unknown providers, for images
      when a known provider/model has no inline image support, and for any
      other modality that a known provider neither rejects nor has a typed
      block for.
    - UNSUPPORTED: the modality is not in SUPPORTED_MODALITIES, or the provider
      cannot accept it via Ralph's managed path.

    The checks run in this order and the first match wins: unknown modality,
    unknown provider, image handling, explicit per-provider rejection, typed
    block lookup. Provider tables are consulted with the lower-cased provider
    name, while the returned verdict carries ``identity.provider`` unchanged.

    Unknown providers default to RESOURCE_REFERENCE_REPLAY (safe, keeps the
    multimodal surface available without false typed-delivery promises).
    """
    if modality not in SUPPORTED_MODALITIES:
        return CapabilityVerdict(
            modality=modality,
            delivery=DeliveryMode.UNSUPPORTED,
            provider=identity.provider,
            model_id=identity.model_id,
            reason=f"unknown modality '{modality}'",
        )

    if not identity.is_known():
        return CapabilityVerdict(
            modality=modality,
            delivery=DeliveryMode.RESOURCE_REFERENCE_REPLAY,
            provider=identity.provider,
            model_id=identity.model_id,
            reason="unknown provider — defaulting to resource_reference_replay delivery",
        )

    # All provider tables are keyed by lower-case provider names.
    provider_key = identity.provider.lower()

    if modality == "image":
        # Images never consult the unsupported/typed-block tables: they are
        # either inlined or handed over as a replayable resource reference.
        inline_reason = _inline_image_reason(provider_key, identity.model_id)
        if inline_reason:
            delivery = DeliveryMode.INLINE_IMAGE
            reason = inline_reason
        else:
            delivery = DeliveryMode.RESOURCE_REFERENCE_REPLAY
            reason = "provider does not support inline image delivery"
        return CapabilityVerdict(
            modality=modality,
            delivery=delivery,
            provider=identity.provider,
            model_id=identity.model_id,
            reason=reason,
        )

    # Explicit rejection takes precedence over any typed-block entry.
    if modality in _PROVIDER_UNSUPPORTED_MODALITIES.get(provider_key, frozenset()):
        base_reason = _PROVIDER_UNSUPPORTED_REASON.get(
            provider_key,
            f"provider '{identity.provider}' does not support '{modality}'",
        )
        return CapabilityVerdict(
            modality=modality,
            delivery=DeliveryMode.UNSUPPORTED,
            provider=identity.provider,
            model_id=identity.model_id,
            reason=f"{base_reason} (modality: {modality})",
        )

    # Remaining known-provider modalities: typed block when one is declared,
    # otherwise fall back to resource_reference_replay.
    block_type: str | None = _TYPED_BLOCK_SUPPORT.get(provider_key, {}).get(modality)
    if block_type:
        delivery = DeliveryMode.TYPED_BLOCK
        reason = (
            f"'{modality}' delivered as typed block '{block_type}' "
            f"for provider '{identity.provider}'"
        )
    else:
        delivery = DeliveryMode.RESOURCE_REFERENCE_REPLAY
        reason = f"'{modality}' as resource_reference_replay for provider '{identity.provider}'"
    return CapabilityVerdict(
        modality=modality,
        delivery=delivery,
        provider=identity.provider,
        model_id=identity.model_id,
        reason=reason,
        block_type=block_type,
    )




@dataclass
class ResolvedCapabilityProfile:
    """Pre-computed capability verdicts for a resolved model identity.

    This is the runtime-owned contract for multimodal delivery decisions.
    Downstream layers consume this profile from the session rather than
    re-calling get_delivery_mode() at each use site.

    Fields:

    - ``identity``: the model identity the verdicts were computed for.
    - ``verdicts``: mapping of modality name to its CapabilityVerdict.
    """

    identity: MultimodalModelIdentity
    verdicts: dict[str, CapabilityVerdict]

    def verdict_for(self, modality: str) -> CapabilityVerdict:
        """Return the stored verdict for ``modality``.

        A modality missing from ``verdicts`` is resolved on demand with
        get_delivery_mode(); the freshly computed verdict is returned but not
        stored back into the profile.
        """
        if modality in self.verdicts:
            return self.verdicts[modality]
        return get_delivery_mode(self.identity, modality)

    def to_payload(self) -> dict[str, object]:
        """Serialize to a JSON-compatible dict for session payload persistence.

        The identity's provider, model_id and transport are written once at
        the top level. Each verdict contributes only its delivery value,
        reason and block_type, keyed by modality under ``"verdicts"``.
        """
        serialized_verdicts = {
            modality: {
                "delivery": verdict.delivery.value,
                "reason": verdict.reason,
                "block_type": verdict.block_type,
            }
            for modality, verdict in self.verdicts.items()
        }
        return {
            "provider": self.identity.provider,
            "model_id": self.identity.model_id,
            "transport": self.identity.transport,
            "verdicts": serialized_verdicts,
        }




# Placeholder identity whose provider is the literal string "unknown", the
# same value profile_from_payload() substitutes when a payload has no provider.
# NOTE(review): presumably identity.is_known() is False for it — confirm in
# MultimodalModelIdentity.
UNKNOWN_IDENTITY = MultimodalModelIdentity(provider="unknown")



def resolve_capability_profile(identity: MultimodalModelIdentity) -> ResolvedCapabilityProfile:
    """Build a pre-computed capability profile for all supported modalities.

    get_delivery_mode() is evaluated once for every entry in
    SUPPORTED_MODALITIES, and the results are stored keyed by modality
    alongside ``identity``.
    """
    verdicts = {
        modality: get_delivery_mode(identity, modality) for modality in SUPPORTED_MODALITIES
    }
    return ResolvedCapabilityProfile(identity=identity, verdicts=verdicts)




def profile_from_payload(raw: dict[str, object]) -> ResolvedCapabilityProfile:
    """Rehydrate a ResolvedCapabilityProfile from a serialized session payload dict.

    The payload layout matches ResolvedCapabilityProfile.to_payload():
    top-level ``provider``, ``model_id`` and ``transport`` plus a ``verdicts``
    mapping of modality to ``{"delivery", "reason", "block_type"}``.

    Rehydration is deliberately lenient:

    - A missing or null ``provider`` becomes ``"unknown"``; a missing or null
      ``reason`` becomes ``""``. Null values are never stringified to "None".
    - If ``verdicts`` is not a dict, the whole profile is recomputed with
      resolve_capability_profile().
    - Verdict entries that are not dicts are skipped.
    - An unrecognised ``delivery`` value falls back to
      RESOURCE_REFERENCE_REPLAY.
    - Any modality in SUPPORTED_MODALITIES that the payload does not cover is
      filled in with get_delivery_mode().
    """
    provider_raw = raw.get("provider")
    model_id_raw = raw.get("model_id")
    transport_raw = raw.get("transport")
    # Treat an explicit null exactly like a missing key; str(None) would
    # otherwise yield the bogus provider name "None".
    provider = str(provider_raw) if provider_raw is not None else "unknown"
    model_id = str(model_id_raw) if model_id_raw is not None else None
    transport = str(transport_raw) if transport_raw is not None else None
    identity = MultimodalModelIdentity(
        provider=provider,
        model_id=model_id,
        transport=transport,
    )

    raw_verdicts = raw.get("verdicts")
    if not isinstance(raw_verdicts, dict):
        # Nothing usable was persisted, so derive every verdict from scratch.
        return resolve_capability_profile(identity)

    verdicts: dict[str, CapabilityVerdict] = {}
    for modality, entry in raw_verdicts.items():
        if not isinstance(entry, dict):
            continue
        try:
            delivery = DeliveryMode(str(entry.get("delivery", "")))
        except ValueError:
            # Best-effort: an unrecognised mode degrades to the safe default.
            delivery = DeliveryMode.RESOURCE_REFERENCE_REPLAY
        reason_raw = entry.get("reason")
        block_type_raw = entry.get("block_type")
        verdicts[modality] = CapabilityVerdict(
            modality=modality,
            delivery=delivery,
            provider=provider,
            model_id=model_id,
            reason=str(reason_raw) if reason_raw is not None else "",
            block_type=str(block_type_raw) if block_type_raw is not None else None,
        )

    # Backfill modalities the payload did not mention (or whose entries were
    # skipped) so the profile covers every supported modality.
    for modality in SUPPORTED_MODALITIES:
        if modality not in verdicts:
            verdicts[modality] = get_delivery_mode(identity, modality)
    return ResolvedCapabilityProfile(identity=identity, verdicts=verdicts)



# Public API of this module. CapabilityVerdict, DeliveryMode and
# MultimodalModelIdentity are imported from sibling modules of the
# ralph.mcp.multimodal package and re-exported here.
__all__ = [
    "UNKNOWN_IDENTITY",
    "CapabilityVerdict",
    "DeliveryMode",
    "MultimodalModelIdentity",
    "ResolvedCapabilityProfile",
    "get_delivery_mode",
    "profile_from_payload",
    "resolve_capability_profile",
]