chunking.fixed_tokens

chunktuner.chunking.fixed_tokens

Fixed-size token windows with overlap (tiktoken).

FixedTokenStrategy

FixedTokenStrategy(encoding_name='cl100k_base')

Sliding tiktoken windows with optional overlap (baseline RAG chunker).

Source code in src/chunktuner/chunking/fixed_tokens.py
def __init__(self, encoding_name: str = "cl100k_base"):
    """Load the tiktoken encoder for *encoding_name* and remember the name.

    The encoder is resolved eagerly so a bad encoding name fails at
    construction time rather than on the first ``chunk`` call.
    """
    self._enc = tiktoken.get_encoding(encoding_name)
    self._encoding_name = encoding_name

chunk

chunk(doc, config)

Split doc.content into fixed-size token spans with correct char offsets.

Source code in src/chunktuner/chunking/fixed_tokens.py
def chunk(self, doc: Document, config: ChunkConfig) -> list[Chunk]:
    """Cut ``doc.content`` into fixed-size token windows with exact char offsets.

    Reads ``max_tokens`` (default 512) and ``overlap_tokens`` (default 0)
    from ``config.params``. Overlap is clamped so every step still advances
    by at least one token, guaranteeing termination.
    """
    validate_content_type(self.name, self.supported_content_types, doc.content_type)

    window = max(1, int(config.params.get("max_tokens", 512)))
    requested_overlap = int(config.params.get("overlap_tokens", 0))
    # Overlap may cover at most window - 1 tokens; a 1-token window gets none.
    overlap = max(0, min(requested_overlap, window - 1)) if window > 1 else 0
    stride = max(1, window - overlap)

    text = doc.content
    token_ids = self._enc.encode(text)
    if not token_ids:
        return []

    # Char position where each token starts, used to map token spans to text spans.
    boundaries = self._token_char_boundaries(text, token_ids)
    total = len(token_ids)
    out: list[Chunk] = []
    start = 0
    ordinal = 0
    while True:
        stop = min(start + window, total)
        span_start = boundaries[start]
        # The final window ends at the end of the text, not at a token boundary.
        span_end = len(text) if stop == total else boundaries[stop]
        out.append(
            Chunk.from_document(
                doc,
                id=f"{doc.id}_ft_{ordinal}",
                start_offset=span_start,
                end_offset=span_end,
                tokens=stop - start,
            )
        )
        ordinal += 1
        if stop == total:
            # Stop once the window reaches the last token — avoids emitting
            # a duplicate, fully-overlapped trailing chunk.
            break
        start += stride
    validate_chunk_offsets(doc, out)
    return out