def chunk(self, doc: Document, config: ChunkConfig) -> list[Chunk]:
    """Chunk ``doc.content`` with semchunk using ``max_tokens`` / overlap settings.

    Settings read from ``config.params``:

    * ``max_tokens`` -- chunk size handed to semchunk. Defaults to 512 and
      is raised to at least 16.
    * ``overlap_tokens`` -- overlap handed to semchunk. When present and
      not ``None`` it takes precedence over ``similarity_threshold``.
      Negative values are treated as 0 (no overlap).
    * ``similarity_threshold`` -- fallback used only when
      ``overlap_tokens`` is missing or ``None``. A positive value is
      multiplied by ``max_tokens`` (and truncated to an int) to derive the
      overlap; zero, negative, or falsy values mean no overlap.

    Args:
        doc: Document whose ``content`` is split. Its ``content_type`` is
            checked against ``self.supported_content_types`` first.
        config: Chunking configuration; only ``config.params`` is read.

    Returns:
        One ``Chunk`` per piece returned by semchunk, in order, with ids of
        the form ``{doc.id}_sem_{idx}`` and start/end offsets into
        ``doc.content``. An empty list when ``doc.content`` is empty.

    Raises:
        ValueError: If an offset pair returned by semchunk does not slice
            ``doc.content`` to the piece text it was returned with.

    Anything raised by ``validate_content_type``, ``_require_semchunk`` or
    ``validate_chunk_offsets`` propagates unchanged.
    """
    validate_content_type(self.name, self.supported_content_types, doc.content_type)
    semchunk = _require_semchunk()

    params = config.params
    max_tokens = max(16, int(params.get("max_tokens", 512)))

    overlap: int | None
    requested_overlap = params.get("overlap_tokens")
    if requested_overlap is not None:
        # A negative overlap has no sensible meaning, and nothing between
        # here and semchunk validates it, so floor it at zero the same way
        # ``max_tokens`` is floored above instead of forwarding it as-is.
        overlap = max(0, int(requested_overlap))
    else:
        # ``or 0.0`` also covers an explicit ``None`` / empty value.
        threshold = float(params.get("similarity_threshold", 0.0) or 0.0)
        overlap = int(max_tokens * threshold) if threshold > 0 else None

    text = doc.content
    if not text:
        # Nothing to split; still run the offset validator on the empty
        # result so this path goes through the same check as the normal one.
        validate_chunk_offsets(doc, [])
        return []

    # With ``offsets=True`` semchunk returns a (pieces, offsets) pair.
    pieces, offsets = semchunk.chunk(
        text,
        max_tokens,
        self._count,
        offsets=True,
        overlap=overlap,
    )

    chunks: list[Chunk] = []
    # ``strict=True`` makes a length mismatch between the two lists an error
    # rather than silently dropping the trailing entries.
    for idx, (piece, (start, end)) in enumerate(zip(pieces, offsets, strict=True)):
        slice_text = text[start:end]
        # The chunk is built from offsets only, so make sure they really
        # point at the text semchunk says they do.
        if slice_text != piece:
            raise ValueError(
                f"semchunk offset mismatch at [{start}:{end}]: "
                f"expected {piece[:40]!r}, got {slice_text[:40]!r}. "
                "This is a semchunk bug — please report it."
            )
        chunks.append(
            Chunk.from_document(
                doc,
                id=f"{doc.id}_sem_{idx}",
                start_offset=start,
                end_offset=end,
                tokens=self._count(slice_text),
            )
        )

    validate_chunk_offsets(doc, chunks)
    return chunks