chunking.agentic

chunktuner.chunking.agentic

LLM-proposed chunk boundaries (expensive; opt-in via strategy selection).

AgenticStrategy

AgenticStrategy(encoding_name='cl100k_base')

LLM proposes character-offset spans (Python string indices, not UTF-8 byte offsets); validates offsets against doc.content.

Source code in src/chunktuner/chunking/agentic.py
def __init__(self, encoding_name: str = "cl100k_base"):
    """Initialize the strategy with a tiktoken encoding.

    The encoding is used later to count tokens per chunk and to
    fuzzy-match LLM-proposed chunk text against the sliced content.

    Args:
        encoding_name: tiktoken encoding name (default ``cl100k_base``).
    """
    self._enc = tiktoken.get_encoding(encoding_name)

chunk

chunk(doc, config)

Call LiteLLM JSON mode to obtain start_offset / end_offset chunk list.

Source code in src/chunktuner/chunking/agentic.py
def chunk(self, doc: Document, config: ChunkConfig) -> list[Chunk]:
    """Call LiteLLM JSON mode to obtain ``start_offset`` / ``end_offset`` chunk list.

    The document text (truncated to ``MAX_CHARS`` when longer) is sent to the
    model with a JSON-mode prompt asking for character-offset spans (Python
    ``str`` indices).  Returned offsets are clamped into range; items whose
    stated text diverges too far from the actual slice are rejected.  If no
    usable chunks result — including when the model emits unparsable JSON —
    the method falls back to ``RecursiveCharacterStrategy`` over the full doc.

    Args:
        doc: Source document; its content type is validated up front.
        config: ``params`` may supply ``model`` (default ``gpt-4o-mini``) and
            ``max_propositions`` (default 40).

    Returns:
        A list of offset-validated ``Chunk`` objects.

    Raises:
        Whatever ``validate_content_type`` / ``validate_chunk_offsets`` raise,
        plus network/provider errors propagated from ``litellm.completion``.
    """
    validate_content_type(self.name, self.supported_content_types, doc.content_type)
    import litellm

    model = str(config.params.get("model", "gpt-4o-mini"))
    max_props = int(config.params.get("max_propositions", 40))
    content = doc.content
    truncated = len(content) > MAX_CHARS
    if truncated:
        logger.warning(
            "AgenticStrategy: doc %r truncated from %d to %d chars. "
            "Content beyond offset %d will have no chunks.",
            doc.id,
            len(doc.content),
            MAX_CHARS,
            MAX_CHARS,
        )
        content = content[:MAX_CHARS]

    prompt = (
        "Split the following document into coherent RAG chunks. "
        "Return JSON object with key chunks: array of "
        '{"start_offset": int, "end_offset": int} using UTF-16? NO — use character offsets '
        "into the exact input string (Python slicing). Max chunks: "
        f"{max_props}.\n\nDOCUMENT:\n{content}"
    )
    resp = litellm.completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0.1,
    )
    raw = resp.choices[0].message.content or "{}"
    # JSON mode is best-effort with some providers: invalid JSON should send
    # us down the recursive fallback path, not crash the whole pipeline.
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        logger.warning("AgenticStrategy: unparsable JSON from model for doc %r", doc.id)
        data = {}
    # Accept {"chunks": [...]}, a bare list, or a nested {"chunks": {...}}.
    raw_items = data.get("chunks", data) if isinstance(data, dict) else data
    if isinstance(raw_items, dict):
        raw_items = raw_items.get("chunks", [])
    if not isinstance(raw_items, list):
        # Guard: a bare string payload would otherwise be sliced and iterated
        # character-by-character below.
        raw_items = []
    chunks: list[Chunk] = []
    for item in raw_items[:max_props]:
        if not isinstance(item, dict):
            continue
        # Offsets come from an LLM: keys may be missing, None, or
        # non-numeric.  Skip malformed items instead of aborting the doc.
        try:
            raw_a = int(item.get("start_offset", item.get("start", 0)))
            raw_b = int(item.get("end_offset", item.get("end", 0)))
        except (TypeError, ValueError):
            logger.warning("AgenticStrategy: skipping malformed chunk item %r", item)
            continue
        # Clamp into [0, len(content)] and force a non-negative span.
        a = max(0, min(raw_a, len(content)))
        b = max(a, min(raw_b, len(content)))
        if a != raw_a or b != raw_b:
            logger.warning(
                "AgenticStrategy: clamped LLM offsets [%d:%d] → [%d:%d] for doc %r",
                raw_a,
                raw_b,
                a,
                b,
                doc.id,
            )
        piece = content[a:b]
        if not piece.strip():
            continue
        # If the model echoed the chunk text, use it to verify its offsets:
        # reject spans whose actual slice barely overlaps the stated text.
        intended = item.get("text") or item.get("chunk_text") or item.get("content") or ""
        if isinstance(intended, str) and intended.strip():
            if not _fuzzy_token_match(piece, intended, self._enc, threshold=0.5):
                logger.warning("AgenticStrategy: rejecting chunk with poor offset match")
                continue
        chunks.append(
            Chunk.from_document(
                doc,
                id=str(uuid.uuid4()),
                start_offset=a,
                end_offset=b,
                tokens=len(self._enc.encode(piece)),
            )
        )
    if not chunks:
        # Nothing usable from the model: deterministic fallback so callers
        # always get chunks.  Imported lazily to avoid a cycle.
        from chunktuner.chunking.recursive_character import RecursiveCharacterStrategy

        out = RecursiveCharacterStrategy(encoding_name=self._enc.name).chunk(
            doc,
            ChunkConfig(
                name="recursive_character",
                params={"chunk_size_chars": 1200, "chunk_overlap_chars": 100},
            ),
        )
        if truncated:
            for c in out:
                c.metadata["agentic_truncated"] = True
                c.metadata["agentic_truncated_at"] = MAX_CHARS
        return out
    if truncated:
        for c in chunks:
            c.metadata["agentic_truncated"] = True
            c.metadata["agentic_truncated_at"] = MAX_CHARS
    validate_chunk_offsets(doc, chunks)
    return chunks