chunking.recursive_character

chunktuner.chunking.recursive_character

Recursive character splitting with overlap (LangChain-style).

RecursiveCharacterStrategy

RecursiveCharacterStrategy(encoding_name='cl100k_base')

Hierarchical character splits (paragraphs, lines, sentences) with overlap.

Source code in src/chunktuner/chunking/recursive_character.py
def __init__(self, encoding_name: str = "cl100k_base"):
    """Load the tiktoken encoding used to count tokens for each produced chunk."""
    encoding = tiktoken.get_encoding(encoding_name)
    self._enc = encoding

chunk

chunk(doc, config)

Produce overlapping spans by splitting on separators up to chunk_size_chars.

Source code in src/chunktuner/chunking/recursive_character.py
def chunk(self, doc: Document, config: ChunkConfig) -> list[Chunk]:
    """Produce overlapping spans by splitting on ``separators`` up to ``chunk_size_chars``."""
    validate_content_type(self.name, self.supported_content_types, doc.content_type)

    # Pull tuning knobs from the config, falling back to sane defaults.
    params = config.params
    size = max(1, int(params.get("chunk_size_chars", 1600)))
    lap = int(params.get("chunk_overlap_chars", 0))
    seps: list[str] = list(params.get("separators", _DEFAULT_SEPARATORS))

    # An overlap as large as the window would stall the sliding window.
    if lap >= size:
        raise ValueError(
            f"chunk_overlap_chars ({lap}) must be < chunk_size_chars ({size}). "
            "Otherwise the sliding window cannot advance."
        )
    # Clamp negative overlap to zero (and keep it below the window size).
    lap = max(0, min(lap, size - 1)) if size > 1 else 0

    body = doc.content
    if not body:
        return []

    # First pass: collect (start, end) character ranges.
    spans: list[tuple[int, int]] = []
    total = len(body)
    cursor = 0
    while cursor < total:
        limit = min(cursor + size, total)
        # Prefer a natural separator boundary; fall back to a hard cut.
        cut = self._find_break(body, cursor, limit, seps)
        if cut <= cursor:
            cut = limit
        spans.append((cursor, cut))
        if cut >= total:
            break
        # Step back by the overlap, but always advance at least one char.
        cursor = max(cursor + 1, cut - lap)

    # Second pass: materialize Chunk objects with token counts.
    encode = self._enc.encode
    chunks: list[Chunk] = [
        Chunk.from_document(
            doc,
            id=f"{doc.id}_rc_{idx}",
            start_offset=lo,
            end_offset=hi,
            tokens=len(encode(body[lo:hi])),
        )
        for idx, (lo, hi) in enumerate(spans)
    ]
    validate_chunk_offsets(doc, chunks)
    return chunks