chunktuner.chunking.semantic

Semantic-ish chunking via semchunk (optional extra chunktuner[semantic]).
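
semchunk is loaded lazily: the chunk() method below raises through _require_semchunk() if the dependency is absent. Assuming a standard pip setup, the extra installs with:

pip install "chunktuner[semantic]"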

SemanticStrategy

SemanticStrategy(encoding_name='cl100k_base')

Token-budget chunks via optional semchunk (install chunktuner[semantic]).

Source code in src/chunktuner/chunking/semantic.py
def __init__(self, encoding_name: str = "cl100k_base"):
    # Cache the encoding name and the tiktoken encoder it resolves to.
    self._encoding_name = encoding_name
    self._enc = tiktoken.get_encoding(encoding_name)
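
A minimal instantiation sketch. The import path follows the module name at the top of this page; any name accepted by tiktoken.get_encoding works for encoding_name:

from chunktuner.chunking.semantic import SemanticStrategy

strategy = SemanticStrategy()  # default "cl100k_base" encoding
strategy_4o = SemanticStrategy(encoding_name="o200k_base")  # e.g. the GPT-4o encoding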

chunk

chunk(doc, config)

Chunk doc.content with semchunk using max_tokens / overlap settings.

Source code in src/chunktuner/chunking/semantic.py
def chunk(self, doc: Document, config: ChunkConfig) -> list[Chunk]:
    """Chunk ``doc.content`` with semchunk using ``max_tokens`` / overlap settings."""
    validate_content_type(self.name, self.supported_content_types, doc.content_type)
    semchunk = _require_semchunk()
    # Clamp to a floor of 16 so semchunk always has a usable token budget.
    max_tokens = max(16, int(config.params.get("max_tokens", 512)))
    ot = config.params.get("overlap_tokens")
    if ot is not None:
        overlap: int | None = int(ot)
    else:
        # No explicit overlap: derive one from similarity_threshold as a
        # fraction of max_tokens (e.g. 512 * 0.25 -> 128 overlap tokens).
        thr = float(config.params.get("similarity_threshold", 0.0) or 0.0)
        overlap = int(max_tokens * thr) if thr > 0 else None
    text = doc.content
    if not text:
        validate_chunk_offsets(doc, [])
        return []
    # With offsets=True, semchunk returns a (pieces, offsets) pair.
    pieces, offsets = semchunk.chunk(
        text,
        max_tokens,
        self._count,
        offsets=True,
        overlap=overlap,
    )
    chunks: list[Chunk] = []
    for idx, (piece, (a, b)) in enumerate(zip(pieces, offsets, strict=True)):
        slice_text = text[a:b]
        if slice_text != piece:
            raise ValueError(
                f"semchunk offset mismatch at [{a}:{b}]: "
                f"expected {piece[:40]!r}, got {slice_text[:40]!r}. "
                "This is a semchunk bug — please report it."
            )
        chunks.append(
            Chunk.from_document(
                doc,
                id=f"{doc.id}_sem_{idx}",
                start_offset=a,
                end_offset=b,
                tokens=self._count(slice_text),
            )
        )
    validate_chunk_offsets(doc, chunks)
    return chunks
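
A hedged end-to-end sketch. Only doc.id, doc.content, and doc.content_type, plus the config.params mapping, are exercised by the source above; the Document and ChunkConfig constructors and their import path below are assumptions about the wider chunktuner API, not confirmed signatures:

from chunktuner import ChunkConfig, Document  # hypothetical import path

from chunktuner.chunking.semantic import SemanticStrategy

# Keyword names here are illustrative; adjust to the real constructors.
doc = Document(id="doc-1", content="long text ...", content_type="text/plain")
config = ChunkConfig(params={"max_tokens": 256, "overlap_tokens": 32})

chunks = SemanticStrategy().chunk(doc, config)
for c in chunks:
    # Each chunk carries character offsets back into doc.content and a token count.
    print(c.id, c.start_offset, c.end_offset, c.tokens)

Parameter precedence: an explicit overlap_tokens always wins. Without it, overlap is derived as int(max_tokens * similarity_threshold), so max_tokens=512 with similarity_threshold=0.25 yields 128 overlap tokens, and a threshold of 0 disables overlap entirely.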