chunking

chunktuner.chunking

FixedTokenStrategy

FixedTokenStrategy(encoding_name='cl100k_base')

Sliding tiktoken windows with optional overlap (baseline RAG chunker).

Source code in src/chunktuner/chunking/fixed_tokens.py
def __init__(self, encoding_name: str = "cl100k_base"):
    """Remember the encoding name and load the matching tiktoken encoding.

    Args:
        encoding_name: Name passed to ``tiktoken.get_encoding``.
    """
    # The name is stored before the lookup, so it is set even if the lookup raises.
    self._encoding_name = encoding_name
    encoder = tiktoken.get_encoding(encoding_name)
    self._enc = encoder

chunk

chunk(doc, config)

Split doc.content into fixed-size token spans with correct char offsets.

Source code in src/chunktuner/chunking/fixed_tokens.py
def chunk(self, doc: Document, config: ChunkConfig) -> list[Chunk]:
    """Cut ``doc.content`` into fixed-size token windows mapped back to char offsets.

    Reads two optional entries from ``config.params``:

    * ``max_tokens`` (default 512): window size in tokens, floored at 1.
    * ``overlap_tokens`` (default 0): tokens shared by consecutive windows,
      clamped to the range ``[0, max_tokens - 1]``.

    Returns:
        One ``Chunk`` per window, with ids of the form ``"{doc.id}_ft_{n}"``,
        or an empty list when the content encodes to no tokens.
    """
    validate_content_type(self.name, self.supported_content_types, doc.content_type)

    params = config.params
    window = max(1, int(params.get("max_tokens", 512)))
    overlap = int(params.get("overlap_tokens", 0))
    # With window >= 1 this clamp always lands in [0, window - 1], so the
    # stride below is at least 1 and the scan always advances.
    overlap = max(0, min(overlap, window - 1))
    stride = window - overlap

    text = doc.content
    token_ids = self._enc.encode(text)
    total = len(token_ids)
    if total == 0:
        return []

    boundaries = self._token_char_boundaries(text, token_ids)
    result: list[Chunk] = []
    for ordinal, first in enumerate(range(0, total, stride)):
        last = min(first + window, total)
        reaches_end = last >= total
        # The final window runs to the end of the text rather than to a
        # token boundary lookup.
        stop_char = len(text) if reaches_end else boundaries[last]
        result.append(
            Chunk.from_document(
                doc,
                id=f"{doc.id}_ft_{ordinal}",
                start_offset=boundaries[first],
                end_offset=stop_char,
                tokens=last - first,
            )
        )
        if reaches_end:
            # Later start positions would only re-emit a suffix of this window.
            break

    validate_chunk_offsets(doc, result)
    return result

RecursiveCharacterStrategy

RecursiveCharacterStrategy(encoding_name='cl100k_base')

Hierarchical character splits (paragraphs, lines, sentences) with overlap.

Source code in src/chunktuner/chunking/recursive_character.py
def __init__(self, encoding_name: str = "cl100k_base"):
    """Load the tiktoken encoding used to count tokens per chunk.

    Args:
        encoding_name: Name passed to ``tiktoken.get_encoding``.
    """
    encoder = tiktoken.get_encoding(encoding_name)
    self._enc = encoder

chunk

chunk(doc, config)

Produce overlapping spans by splitting on separators up to chunk_size_chars.

Source code in src/chunktuner/chunking/recursive_character.py
def chunk(self, doc: Document, config: ChunkConfig) -> list[Chunk]:
    """Split ``doc.content`` into character spans that end on separator breaks.

    Reads three optional entries from ``config.params``:

    * ``chunk_size_chars`` (default 1600): maximum span length, floored at 1.
    * ``chunk_overlap_chars`` (default 0): characters re-used from the end of
      the previous span; negative values are treated as 0.
    * ``separators`` (default ``_DEFAULT_SEPARATORS``): passed to
      ``self._find_break`` to choose where each span ends.

    Returns:
        One ``Chunk`` per span, with ids of the form ``"{doc.id}_rc_{n}"`` and
        a token count from the strategy's encoding, or an empty list for
        empty content.

    Raises:
        ValueError: If the overlap is not smaller than the chunk size.
    """
    validate_content_type(self.name, self.supported_content_types, doc.content_type)

    params = config.params
    chunk_size = int(params.get("chunk_size_chars", 1600))
    overlap = int(params.get("chunk_overlap_chars", 0))
    separators: list[str] = list(params.get("separators", _DEFAULT_SEPARATORS))

    chunk_size = max(1, chunk_size)
    if overlap >= chunk_size:
        raise ValueError(
            f"chunk_overlap_chars ({overlap}) must be < chunk_size_chars ({chunk_size}). "
            "Otherwise the sliding window cannot advance."
        )
    # The guard above already bounds overlap from above; only negatives remain.
    overlap = max(0, overlap)

    text = doc.content
    if not text:
        return []

    total = len(text)
    spans: list[tuple[int, int]] = []
    cursor = 0
    while cursor < total:
        limit = min(cursor + chunk_size, total)
        stop = self._find_break(text, cursor, limit, separators)
        if stop <= cursor:
            # No usable break point was returned: fall back to a hard cut.
            stop = limit
        spans.append((cursor, stop))
        if stop >= total:
            break
        # Step back by the overlap, but always move forward by at least one char.
        cursor = max(cursor + 1, stop - overlap)

    result: list[Chunk] = []
    for ordinal, (lo, hi) in enumerate(spans):
        token_count = len(self._enc.encode(text[lo:hi]))
        result.append(
            Chunk.from_document(
                doc,
                id=f"{doc.id}_rc_{ordinal}",
                start_offset=lo,
                end_offset=hi,
                tokens=token_count,
            )
        )

    validate_chunk_offsets(doc, result)
    return result

build_default_registry

build_default_registry(encoding='cl100k_base')

Alias for build_full_registry (backward compatible name).

Source code in src/chunktuner/chunking/__init__.py (lines 7–9)
def build_default_registry(encoding: str = "cl100k_base") -> StrategyRegistry:
    """Return the registry built by ``build_full_registry``.

    This name is kept for backward compatibility; it forwards ``encoding``
    unchanged.
    """
    registry = build_full_registry(encoding)
    return registry