Source code for ralph.mcp.multimodal.capabilities

"""Multimodal capability detection and delivery policy.

This module is the single source of truth for provider/model identity,
capability detection, and delivery policy decisions. All runtime layers that
need to determine whether a modality can be delivered must derive their answer
from this module rather than re-declaring provider knowledge elsewhere.
"""

from __future__ import annotations

from dataclasses import dataclass

from ralph.mcp.multimodal._capability_verdict import CapabilityVerdict
from ralph.mcp.multimodal._delivery_mode import DeliveryMode
from ralph.mcp.multimodal._multimodal_model_identity import MultimodalModelIdentity
from ralph.mcp.multimodal.artifacts import SUPPORTED_MODALITIES

# ---------------------------------------------------------------------------
# Per-provider inline-image support
# ---------------------------------------------------------------------------


def _claude_supports_inline_image(model_id: str | None) -> bool:
    return True  # All current Claude models support vision


def _openai_supports_inline_image(model_id: str | None) -> bool:
    if model_id is None:
        return True
    vision_capable_prefixes = ("gpt-4o", "gpt-4-vision", "gpt-4-turbo", "gpt-4", "o1", "o3")
    return any(model_id.startswith(prefix) for prefix in vision_capable_prefixes)


def _gemini_supports_inline_image(model_id: str | None) -> bool:
    return True  # All current Gemini models support vision


def _inline_image_reason(provider: str, model_id: str | None) -> str | None:
    """Explain why inline image delivery is available, if it is.

    ``provider`` is compared against lowercase provider names, so callers
    should pass it already lowercased. Returns a human-readable reason when
    the provider (and, for OpenAI/Codex, the model) supports inline images,
    and ``None`` for unrecognised providers or non-vision models.
    """
    if provider == "claude" or provider == "anthropic":
        if _claude_supports_inline_image(model_id):
            return "Claude supports inline image delivery"
    elif provider == "openai" or provider == "codex":
        if _openai_supports_inline_image(model_id):
            return "OpenAI model supports inline image delivery"
    elif provider == "gemini":
        if _gemini_supports_inline_image(model_id):
            return "Gemini supports inline image delivery"
    return None


# Typed-block support per provider and modality.
# Nested mapping: provider -> {modality -> block_type string} used for
# TYPED_BLOCK delivery. Keys are lowercase provider names. A provider or
# modality that is absent from this table has no typed block, and
# get_delivery_mode() falls back to RESOURCE_REFERENCE_REPLAY for it.
_TYPED_BLOCK_SUPPORT: dict[str, dict[str, str]] = {
    "claude": {"pdf": "pdf", "document": "document"},
    "anthropic": {"pdf": "pdf", "document": "document"},
    "gemini": {"pdf": "pdf", "document": "document", "audio": "audio", "video": "video"},
}


# ---------------------------------------------------------------------------
# Per-provider non-image modality support matrix
# ---------------------------------------------------------------------------

# Modalities explicitly unsupported for each known provider via Ralph's
# managed MCP runtime path. Keys are lowercase provider names. Providers not
# listed here are treated as having no unsupported modalities:
# get_delivery_mode() continues to the typed-block lookup and, failing that,
# to the resource_reference_replay default.
#
# UNSUPPORTED means Ralph cannot deliver the modality through its managed
# path for this provider — the model API simply does not accept it.
# RESOURCE_REFERENCE means the agent can retrieve the bytes via resources/read
# and attempt to relay them to the model in a provider-appropriate form.
#
# The "image" modality is decided before this table is consulted, so listing
# it here would have no effect.
_PROVIDER_UNSUPPORTED_MODALITIES: dict[str, frozenset[str]] = {
    # Claude/Anthropic does not accept audio or video input via its API.
    # Images and PDFs are deliverable (inline or via document blocks).
    # Documents (.docx, .pptx, .xlsx) are accepted via document blocks on
    # models that support them.
    "claude": frozenset({"audio", "video"}),
    "anthropic": frozenset({"audio", "video"}),
    # OpenAI chat completion API does not accept PDFs, documents, audio, or
    # video as raw bytes through Ralph's managed MCP path. Only images are
    # supported (for vision-capable models). Marking pdf/document/audio/video
    # as UNSUPPORTED so the agent receives an explicit failure instead of a
    # resource_reference that the model cannot process.
    "openai": frozenset({"audio", "video", "pdf", "document"}),
    "codex": frozenset({"audio", "video", "pdf", "document"}),
    # Gemini supports audio, video, PDFs, and documents natively;
    # no modalities are unsupported.
    "gemini": frozenset(),
}

# Base explanation per provider for an UNSUPPORTED verdict, keyed by lowercase
# provider name. get_delivery_mode() appends " (modality: <name>)" to this
# text; a provider with no entry here gets a generic message built there.
_PROVIDER_UNSUPPORTED_REASON: dict[str, str] = {
    "claude": "Claude does not accept this modality via Ralph's managed MCP path",
    "anthropic": "Anthropic does not accept this modality via Ralph's managed MCP path",
    "openai": "OpenAI does not accept this modality via Ralph's managed MCP path",
    "codex": "Codex does not accept this modality via Ralph's managed MCP path",
}


def get_delivery_mode(
    identity: MultimodalModelIdentity,
    modality: str,
) -> CapabilityVerdict:
    """Decide how ``modality`` should be delivered to the model in ``identity``.

    The checks run in this order and the first match wins:

    1. A modality outside ``SUPPORTED_MODALITIES`` is UNSUPPORTED.
    2. An identity for which ``is_known()`` is false gets
       RESOURCE_REFERENCE_REPLAY, which keeps the multimodal surface
       available without promising typed delivery.
    3. ``"image"`` is INLINE_IMAGE when the provider/model accepts inline
       images, otherwise RESOURCE_REFERENCE_REPLAY.
    4. A modality the provider is listed as not accepting is UNSUPPORTED.
    5. Anything else is TYPED_BLOCK when the provider has a named block type
       for the modality, otherwise RESOURCE_REFERENCE_REPLAY.

    Provider lookups use the lowercased provider name; the verdict itself
    carries ``identity.provider`` unchanged.

    Args:
        identity: Provider/model identity the decision is made for.
        modality: Modality name to be delivered.

    Returns:
        A ``CapabilityVerdict`` carrying the delivery mode, a human-readable
        reason and, for the typed-block stage, the block type (or ``None``).
    """

    def _verdict(mode: DeliveryMode, why: str, **extra: str | None) -> CapabilityVerdict:
        # Every verdict shares the same identity fields; only mode, reason and
        # (for the typed-block stage) block_type vary.
        return CapabilityVerdict(
            modality=modality,
            delivery=mode,
            provider=identity.provider,
            model_id=identity.model_id,
            reason=why,
            **extra,
        )

    if modality not in SUPPORTED_MODALITIES:
        return _verdict(DeliveryMode.UNSUPPORTED, f"unknown modality '{modality}'")

    if not identity.is_known():
        return _verdict(
            DeliveryMode.RESOURCE_REFERENCE_REPLAY,
            "unknown provider — defaulting to resource_reference_replay delivery",
        )

    provider_key = identity.provider.lower()

    if modality == "image":
        inline_reason = _inline_image_reason(provider_key, identity.model_id)
        if inline_reason:
            return _verdict(DeliveryMode.INLINE_IMAGE, inline_reason)
        return _verdict(
            DeliveryMode.RESOURCE_REFERENCE_REPLAY,
            "provider does not support inline image delivery",
        )

    # Explicit per-provider refusals take precedence over typed-block support.
    if modality in _PROVIDER_UNSUPPORTED_MODALITIES.get(provider_key, frozenset()):
        base_reason = _PROVIDER_UNSUPPORTED_REASON.get(
            provider_key,
            f"provider '{identity.provider}' does not support '{modality}'",
        )
        return _verdict(
            DeliveryMode.UNSUPPORTED,
            f"{base_reason} (modality: {modality})",
        )

    # Remaining known-provider modalities: typed block if one is declared,
    # resource_reference_replay otherwise.
    block_type: str | None = _TYPED_BLOCK_SUPPORT.get(provider_key, {}).get(modality)
    if block_type:
        return _verdict(
            DeliveryMode.TYPED_BLOCK,
            f"'{modality}' delivered as typed block '{block_type}' for provider '{identity.provider}'",
            block_type=block_type,
        )
    return _verdict(
        DeliveryMode.RESOURCE_REFERENCE_REPLAY,
        f"'{modality}' as resource_reference_replay for provider '{identity.provider}'",
        block_type=block_type,
    )
@dataclass
class ResolvedCapabilityProfile:
    """Capability verdicts computed up front for one resolved model identity.

    This is the runtime-owned contract for multimodal delivery decisions:
    downstream layers read the profile stored on the session instead of
    calling get_delivery_mode() again at every use site.
    """

    # Identity the verdicts were computed for.
    identity: MultimodalModelIdentity
    # Verdicts keyed by modality name.
    verdicts: dict[str, CapabilityVerdict]

    def verdict_for(self, modality: str) -> CapabilityVerdict:
        """Look up the stored verdict for ``modality``.

        A modality with no stored verdict is evaluated on demand via
        get_delivery_mode(); the result is returned but not cached.
        """
        if modality not in self.verdicts:
            return get_delivery_mode(self.identity, modality)
        return self.verdicts[modality]

    def to_payload(self) -> dict[str, object]:
        """Build a JSON-compatible dict for persisting in the session payload.

        The identity fields are stored at the top level; each verdict is
        reduced to its delivery value, reason and block type under
        ``"verdicts"``, keyed by modality.
        """
        payload: dict[str, object] = {
            "provider": self.identity.provider,
            "model_id": self.identity.model_id,
            "transport": self.identity.transport,
        }
        serialized: dict[str, dict[str, object]] = {}
        for name, verdict in self.verdicts.items():
            serialized[name] = {
                "delivery": verdict.delivery.value,
                "reason": verdict.reason,
                "block_type": verdict.block_type,
            }
        payload["verdicts"] = serialized
        return payload
# Shared identity whose provider is the literal "unknown" — the same value
# profile_from_payload() substitutes when a payload carries no provider.
# NOTE(review): presumably MultimodalModelIdentity.is_known() is false for
# this value — confirm against that class.
UNKNOWN_IDENTITY = MultimodalModelIdentity(provider="unknown")
def resolve_capability_profile(identity: MultimodalModelIdentity) -> ResolvedCapabilityProfile:
    """Evaluate every supported modality for ``identity`` and bundle the results.

    Returns a ResolvedCapabilityProfile holding one verdict from
    get_delivery_mode() per entry in ``SUPPORTED_MODALITIES``.
    """
    computed: dict[str, CapabilityVerdict] = {}
    for name in SUPPORTED_MODALITIES:
        computed[name] = get_delivery_mode(identity, name)
    return ResolvedCapabilityProfile(identity=identity, verdicts=computed)
def profile_from_payload(raw: dict[str, object]) -> ResolvedCapabilityProfile:
    """Rehydrate a ResolvedCapabilityProfile from a serialized session payload dict.

    The payload is expected to have the shape produced by
    ``ResolvedCapabilityProfile.to_payload()``, but malformed input is
    tolerated rather than rejected:

    - A missing or null ``provider`` becomes ``"unknown"``.
    - Missing or null ``model_id`` / ``transport`` become ``None``.
    - If ``verdicts`` is not a dict, the whole profile is recomputed from the
      identity.
    - Verdict entries that are not dicts are skipped.
    - An unrecognised ``delivery`` value falls back to
      RESOURCE_REFERENCE_REPLAY.
    - A missing or null ``reason`` becomes the empty string.
    - Any supported modality without a stored verdict is computed fresh, so
      the returned profile covers every entry in ``SUPPORTED_MODALITIES``.

    Args:
        raw: Deserialized payload dict.

    Returns:
        The reconstructed profile.
    """
    provider_raw = raw.get("provider")
    # Treat an explicit null like a missing key instead of producing the
    # literal string "None".
    provider = str(provider_raw) if provider_raw is not None else "unknown"
    model_id_raw = raw.get("model_id")
    model_id = str(model_id_raw) if model_id_raw is not None else None
    transport_raw = raw.get("transport")
    transport = str(transport_raw) if transport_raw is not None else None
    identity = MultimodalModelIdentity(
        provider=provider,
        model_id=model_id,
        transport=transport,
    )

    raw_verdicts = raw.get("verdicts")
    if not isinstance(raw_verdicts, dict):
        return resolve_capability_profile(identity)

    verdicts: dict[str, CapabilityVerdict] = {}
    for modality, entry in raw_verdicts.items():
        if not isinstance(entry, dict):
            continue
        delivery_raw = entry.get("delivery", "")
        try:
            delivery = DeliveryMode(str(delivery_raw))
        except ValueError:
            # Unknown delivery string (e.g. written by a different version):
            # degrade to the replay handle rather than failing the session.
            delivery = DeliveryMode.RESOURCE_REFERENCE_REPLAY
        reason_raw = entry.get("reason")
        block_type_raw = entry.get("block_type")
        verdicts[modality] = CapabilityVerdict(
            modality=modality,
            delivery=delivery,
            provider=provider,
            model_id=model_id,
            reason=str(reason_raw) if reason_raw is not None else "",
            block_type=str(block_type_raw) if block_type_raw is not None else None,
        )

    # Backfill modalities the payload did not cover (or whose entry was skipped).
    for modality in SUPPORTED_MODALITIES:
        if modality not in verdicts:
            verdicts[modality] = get_delivery_mode(identity, modality)
    return ResolvedCapabilityProfile(identity=identity, verdicts=verdicts)
# Public API of this module: the names defined here plus the verdict, delivery
# mode and identity types imported at the top of the file, which are
# re-exported so callers can import them from this module.
__all__ = [ "UNKNOWN_IDENTITY", "CapabilityVerdict", "DeliveryMode", "MultimodalModelIdentity", "ResolvedCapabilityProfile", "get_delivery_mode", "profile_from_payload", "resolve_capability_profile", ]