def chunk(self, doc: Document, config: ChunkConfig) -> list[Chunk]:
"""Call LiteLLM JSON mode to obtain ``start_offset`` / ``end_offset`` chunk list."""
validate_content_type(self.name, self.supported_content_types, doc.content_type)
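# lazy import: litellm is only loaded when this strategy is actually invoked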
import litellm
model = str(config.params.get("model", "gpt-4o-mini"))
max_props = int(config.params.get("max_propositions", 40))
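# documents longer than MAX_CHARS are truncated before prompting; content past the cutoff gets no chunks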
content = doc.content
truncated = len(content) > MAX_CHARS
if truncated:
logger.warning(
"AgenticStrategy: doc %r truncated from %d to %d chars. "
"Content beyond offset %d will have no chunks.",
doc.id,
len(doc.content),
MAX_CHARS,
MAX_CHARS,
)
content = content[:MAX_CHARS]
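# single-shot prompt asking the model for character-offset chunk boundaries as JSON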
prompt = (
"Split the following document into coherent RAG chunks. "
'Return a JSON object with key "chunks": an array of '
'{"start_offset": int, "end_offset": int}. Offsets are character offsets '
"into the exact input string (Python slicing), not UTF-16 code units. "
f"Max chunks: {max_props}.\n\nDOCUMENT:\n{content}"
)
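# JSON mode keeps the reply machine-parseable; the low temperature favours repeatable boundaries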
resp = litellm.completion(
model=model,
messages=[{"role": "user", "content": prompt}],
response_format={"type": "json_object"},
temperature=0.1,
)
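# defensively parse the reply: the model may return {"chunks": [...]} or a bare list of offset dicts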
raw = resp.choices[0].message.content or "{}"
data = json.loads(raw)
raw_items = data.get("chunks", data) if isinstance(data, dict) else data
if isinstance(raw_items, dict):
raw_items = raw_items.get("chunks", [])
if not isinstance(raw_items, list):
raw_items = []
chunks: list[Chunk] = []
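# turn each offset pair into a Chunk, clamping out-of-range values and skipping malformed entries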
for item in raw_items[:max_props]:
if not isinstance(item, dict):
continue
# tolerate alternative key names ("start"/"end") and skip non-numeric offsets
try:
raw_a = int(item.get("start_offset", item.get("start", 0)))
raw_b = int(item.get("end_offset", item.get("end", 0)))
except (TypeError, ValueError):
continue
a = max(0, min(raw_a, len(content)))
b = max(a, min(raw_b, len(content)))
if a != raw_a or b != raw_b:
logger.warning(
"AgenticStrategy: clamped LLM offsets [%d:%d] → [%d:%d] for doc %r",
raw_a,
raw_b,
a,
b,
doc.id,
)
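# slice the chunk text and drop it if it is empty or whitespace-only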
piece = content[a:b]
if not piece.strip():
continue
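# if the model also echoed the chunk text, cross-check it against the sliced text and drop poor matches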
intended = item.get("text") or item.get("chunk_text") or item.get("content") or ""
if isinstance(intended, str) and intended.strip():
if not _fuzzy_token_match(piece, intended, self._enc, threshold=0.5):
logger.warning("AgenticStrategy: rejecting chunk with poor offset match")
continue
chunks.append(
Chunk.from_document(
doc,
id=str(uuid.uuid4()),
start_offset=a,
end_offset=b,
tokens=len(self._enc.encode(piece)),
)
)
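# if the model produced no usable chunks, fall back to a deterministic
# recursive-character split (local import presumably avoids a circular dependency)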
if not chunks:
from chunktuner.chunking.recursive_character import RecursiveCharacterStrategy
out = RecursiveCharacterStrategy(encoding_name=self._enc.name).chunk(
doc,
ChunkConfig(
name="recursive_character",
params={"chunk_size_chars": 1200, "chunk_overlap_chars": 100},
),
)
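# mark chunks from a truncated document so callers can tell the tail was never chunked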
if truncated:
for c in out:
c.metadata["agentic_truncated"] = True
c.metadata["agentic_truncated_at"] = MAX_CHARS
return out
if truncated:
for c in chunks:
c.metadata["agentic_truncated"] = True
c.metadata["agentic_truncated_at"] = MAX_CHARS
validate_chunk_offsets(doc, chunks)
return chunks