ingestion.url_ingestor

chunktuner.ingestion.url_ingestor

Fetch remote URLs into Document objects.

URLIngestor

Fetch HTTP(S) resources into a single Document (HTML preprocessed to text).

ingest_url

ingest_url(url, *, timeout=30.0)

GET url and wrap the response body in a Document: HTML responses are preprocessed and tagged html; every other response is kept verbatim and tagged markdown.

Source code in src/chunktuner/ingestion/url_ingestor.py
def ingest_url(self, url: str, *, timeout: float = 30.0) -> Document:
    """GET ``url`` and wrap the response body in a ``Document``.

    Responses whose ``Content-Type`` contains ``html`` are passed through
    ``preprocess(raw, "html")`` and tagged ``"html"``; every other response
    body is kept verbatim and tagged ``"markdown"``.

    Redirects are followed manually so that the scheme / SSRF checks applied
    to ``url`` are also applied to every redirect target before it is
    requested.

    Args:
        url: Absolute ``http`` or ``https`` URL to fetch.
        timeout: Timeout passed to ``httpx.Client``.

    Returns:
        A ``Document`` with a fresh UUID, the (possibly preprocessed) body,
        ``source_url`` set to the originally requested ``url`` and the
        normalised ``Content-Type`` header under
        ``metadata["content_type_header"]``.

    Raises:
        ValueError: If ``url`` or any redirect target uses a scheme other
            than http/https, or its host is rejected by ``_is_private_ip``.
        httpx.TooManyRedirects: If more than 20 redirects are encountered.
        httpx.HTTPStatusError: If the final response is not a success.
    """

    def _require_public_http(scheme: str, host: str) -> None:
        # Shared guard for the initial URL and for each redirect hop.
        if scheme not in ("http", "https"):
            raise ValueError(f"Unsupported URL scheme: {scheme!r}")
        if _is_private_ip(host):
            raise ValueError(
                f"SSRF guard: {host!r} resolves to a private/loopback address. "
                "Only public URLs are permitted."
            )

    parsed = urlparse(url)
    _require_public_http(parsed.scheme, parsed.hostname or "")

    # Same cap httpx applies by default when it follows redirects itself.
    max_redirects = 20

    # Automatic redirects are disabled on purpose: with follow_redirects=True
    # a public host could bounce the request to an internal address without
    # that target ever being validated.
    with httpx.Client(follow_redirects=False, timeout=timeout) as client:
        request = client.build_request("GET", url)
        for _ in range(max_redirects + 1):
            resp = client.send(request)
            # httpx pre-builds the follow-up request for redirect responses
            # that carry a Location header; None means this is the final hop.
            next_request = resp.next_request
            if next_request is None:
                break
            _require_public_http(next_request.url.scheme, next_request.url.host or "")
            request = next_request
        else:
            raise httpx.TooManyRedirects(
                "Exceeded maximum allowed redirects.", request=request
            )
        resp.raise_for_status()
    # NOTE(review): the host check and the actual connection resolve DNS
    # separately, so DNS rebinding is presumably still possible - confirm
    # against _is_private_ip and harden at the transport level if needed.

    # Drop parameters such as "; charset=utf-8" and normalise case.
    ctype = (resp.headers.get("content-type") or "").split(";")[0].strip().lower()
    raw = resp.text
    if "html" in ctype:
        body = preprocess(raw, "html")
        content_type = "html"
    else:
        body = raw
        content_type = "markdown"
    return Document(
        id=str(uuid.uuid4()),
        content=body,
        content_type=content_type,
        source_url=url,
        metadata={"content_type_header": ctype},
    )