def chunk(self, doc: Document, config: ChunkConfig) -> list[Chunk]:
    """Produce overlapping spans by splitting on ``separators`` up to ``chunk_size_chars``.

    Options are read from ``config.params``:

    * ``chunk_size_chars`` (default ``1600``): maximum span length in
      characters; values below 1 are raised to 1.
    * ``chunk_overlap_chars`` (default ``0``): number of characters the next
      span re-uses from the end of the current one; negative values are
      treated as 0.
    * ``separators`` (default ``_DEFAULT_SEPARATORS``): candidate break
      strings passed to ``self._find_break``. A bare string is treated as a
      single separator rather than being split into its characters.

    Args:
        doc: Document to split. ``doc.content`` is the text that is sliced,
            ``doc.content_type`` is checked with ``validate_content_type``
            and ``doc.id`` prefixes every chunk id.
        config: Chunking configuration; only ``config.params`` is read.

    Returns:
        One ``Chunk`` per span, in document order, with id
        ``"{doc.id}_rc_{idx}"``, ``start_offset``/``end_offset`` indexing
        into ``doc.content`` and ``tokens`` set to
        ``len(self._enc.encode(span_text))``. An empty list when
        ``doc.content`` is empty.

    Raises:
        ValueError: If ``chunk_overlap_chars`` is not smaller than the
            (clamped) ``chunk_size_chars``.

    ``validate_content_type`` and ``validate_chunk_offsets`` are called as
    well and may raise on their own; see their definitions.
    """
    validate_content_type(self.name, self.supported_content_types, doc.content_type)

    chunk_size = int(config.params.get("chunk_size_chars", 1600))
    overlap = int(config.params.get("chunk_overlap_chars", 0))

    raw_separators = config.params.get("separators", _DEFAULT_SEPARATORS)
    separators: list[str]
    if isinstance(raw_separators, str):
        # list("\n\n") would yield ["\n", "\n"]; keep a bare string whole.
        separators = [raw_separators] if raw_separators else []
    else:
        separators = list(raw_separators)

    chunk_size = max(1, chunk_size)
    if overlap >= chunk_size:
        raise ValueError(
            f"chunk_overlap_chars ({overlap}) must be < chunk_size_chars ({chunk_size}). "
            "Otherwise the sliding window cannot advance."
        )
    # The guard above already ensures overlap < chunk_size, so only negative
    # values still need clamping.
    overlap = max(0, overlap)

    text = doc.content
    if not text:
        return []

    raw_ranges: list[tuple[int, int]] = []
    n = len(text)
    start = 0
    prev_end = 0  # end offset of the most recently emitted span
    while start < n:
        limit = min(start + chunk_size, n)
        end = self._find_break(text, start, limit, separators)
        if end <= start:
            # No usable break point reported: fall back to a hard cut.
            end = limit
        if end <= prev_end:
            # Stepping back by ``overlap`` landed on the break that closed
            # the previous span, so this span would lie entirely inside it.
            # Emitting it (and then creeping forward one character at a
            # time) would only produce near-duplicate chunks; restart at the
            # previous end instead. ``prev_end > start`` here, so the loop
            # still advances.
            start = prev_end
            continue
        raw_ranges.append((start, end))
        prev_end = end
        if end >= n:
            break
        # Step back by ``overlap`` but always move forward by at least one
        # character.
        start = max(start + 1, end - overlap)

    chunks: list[Chunk] = [
        Chunk.from_document(
            doc,
            id=f"{doc.id}_rc_{idx}",
            start_offset=a,
            end_offset=b,
            tokens=len(self._enc.encode(text[a:b])),
        )
        for idx, (a, b) in enumerate(raw_ranges)
    ]
    validate_chunk_offsets(doc, chunks)
    return chunks