Source code for ralph.mcp.webvisit.extractor
"""HTML text extraction for the visit_url tool.
Uses readability-lxml for main-content isolation and selectolax for fast
plain-text rendering. Both dependencies are optional ([web-visit] extras).
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import TYPE_CHECKING, cast
from urllib.parse import urljoin, urlparse
if TYPE_CHECKING:
    # Structural stand-ins for the optional third-party classes. They exist
    # only for static analysis; nothing in this branch is defined at runtime.
    from typing import Protocol

    class _HTMLNodeProtocol(Protocol):
        """Shape of an HTML node as used by this module."""

        def text(self, *, separator: str = ..., strip: bool = ...) -> str: ...

        def decompose(self) -> None: ...

        @property
        def attributes(self) -> dict[str, str | None]: ...

    class _HTMLParserProtocol(Protocol):
        """Shape of the HTML parser class as used by this module."""

        def __init__(self, html: str) -> None: ...

        def css(self, _selector: str) -> list[_HTMLNodeProtocol]: ...

        @property
        def root(self) -> _HTMLNodeProtocol | None: ...

    class _ReadabilityDocumentProtocol(Protocol):
        """Shape of the readability document class as used by this module."""

        def __init__(self, input: str) -> None: ...

        def summary(self, html_partial: bool = ...) -> str: ...

        def title(self) -> str: ...
_MAX_LINKS = 100

# Handles to the optional third-party classes. The two imports share one
# ``try``, so either both handles are populated or both remain ``None``.
_ReadabilityDocument: type[_ReadabilityDocumentProtocol] | None = None
_HTMLParser: type[_HTMLParserProtocol] | None = None

try:
    from readability import Document as _RawDocument
    from selectolax.parser import HTMLParser as _RawHTMLParser
except ImportError:
    # Extras are not installed; keep both handles as None.
    pass
else:
    _ReadabilityDocument = cast("type[_ReadabilityDocumentProtocol]", _RawDocument)
    _HTMLParser = cast("type[_HTMLParserProtocol]", _RawHTMLParser)
def _require_deps() -> tuple[type[_ReadabilityDocumentProtocol], type[_HTMLParserProtocol]]:
    """Return the readability document and HTML parser classes.

    Returns:
        A ``(document_class, parser_class)`` pair taken from the module-level
        optional-import handles.

    Raises:
        ImportError: If either handle is ``None``, i.e. the optional
            dependencies could not be imported.
    """
    document_cls, parser_cls = _ReadabilityDocument, _HTMLParser
    if document_cls is not None and parser_cls is not None:
        return document_cls, parser_cls
    raise ImportError(
        "Web visit text extraction requires optional dependencies. "
        "Install them with: pip install ralph-workflow[web-visit]"
    )
def _collapse_whitespace(text: str) -> str:
    """Trim every line and squeeze runs of blank lines down to one.

    Each line is stripped of leading/trailing whitespace; whitespace inside a
    line is left alone. Leading blank lines are dropped, consecutive blank
    lines become a single blank line, and the joined result is stripped.
    """
    kept: list[str] = []
    # Start as "blank" so blank lines before the first content are discarded.
    previous_blank = True
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped and previous_blank:
            continue
        kept.append(stripped)
        previous_blank = not stripped
    return "\n".join(kept).strip()
def _extract_links(html: str, *, base_url: str | None) -> tuple[str, ...]:
    """Collect unique absolute http(s) link targets from *html*.

    Args:
        html: Markup to scan for ``a[href]`` elements.
        base_url: URL against which hrefs are resolved. When falsy, hrefs are
            used as written, so relative ones fail the scheme check and are
            dropped.

    Returns:
        Links in document order, de-duplicated and capped at ``_MAX_LINKS``.
        An empty tuple when the HTML parser dependency is unavailable.
    """
    if _HTMLParser is None:
        return ()

    seen: set[str] = set()
    links: list[str] = []
    for node in _HTMLParser(html).css("a[href]"):
        href = (node.attributes.get("href") or "").strip()
        if not href or href.startswith(("#", "javascript:", "mailto:", "tel:")):
            continue
        try:
            absolute = urljoin(base_url, href) if base_url else href
            scheme = urlparse(absolute).scheme
        except ValueError:
            # urllib rejects some malformed URLs (e.g. unbalanced IPv6
            # brackets); one bad href must not abort the whole extraction.
            continue
        if scheme not in {"http", "https"} or absolute in seen:
            continue
        seen.add(absolute)
        links.append(absolute)
        if len(links) >= _MAX_LINKS:
            break
    return tuple(links)
# Names exported by ``from <module> import *``.
# NOTE(review): neither name is defined in the portion of the file visible
# here — confirm both are defined elsewhere in this module.
__all__ = ["ExtractedPage", "extract_readable"]