chunking.markdown_semantic

chunktuner.chunking.markdown_semantic

Markdown: split on headings, then apply semchunk within large sections.

MarkdownSemanticStrategy

MarkdownSemanticStrategy(encoding_name='cl100k_base')

Markdown headings define sections; each section is semchunk-split by token budget.
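
A hypothetical usage sketch (not part of the rendered source). The Document and ChunkConfig constructors, their import path, and the "text/markdown" content type are assumptions; the strategy itself only reads doc.id, doc.content, doc.content_type, and config.params.

from chunktuner.chunking.markdown_semantic import MarkdownSemanticStrategy

# Assumed import path and constructor fields for Document/ChunkConfig;
# chunk() only relies on .id, .content, .content_type, and .params.
from chunktuner.models import Document, ChunkConfig

doc = Document(
    id="guide",
    content="# Intro\n\nSome prose.\n\n## Details\n\nMore prose.",
    content_type="text/markdown",
)
config = ChunkConfig(params={"max_tokens": 256, "overlap_tokens": 32})

strategy = MarkdownSemanticStrategy()
for c in strategy.chunk(doc, config):
    print(c.id, c.start_offset, c.end_offset, c.tokens)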

Source code in src/chunktuner/chunking/markdown_semantic.py
def __init__(self, encoding_name: str = "cl100k_base"):
    self._enc = tiktoken.get_encoding(encoding_name)
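
The _count token counter passed to semchunk below is not rendered on this page; a minimal sketch consistent with the tiktoken encoding stored in __init__ (an assumption, not the repository's implementation):

def _count(self, text: str) -> int:
    # Token counter handed to semchunk: number of tiktoken token ids.
    return len(self._enc.encode(text))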

chunk

chunk(doc, config)

Chunk per heading section, then map semchunk spans back to stable document offsets.

Source code in src/chunktuner/chunking/markdown_semantic.py
def chunk(self, doc: Document, config: ChunkConfig) -> list[Chunk]:
    """Chunk per heading section then merge semchunk spans with stable offsets."""
    validate_content_type(self.name, self.supported_content_types, doc.content_type)
    semchunk = _require_semchunk()
    max_tokens = max(16, int(config.params.get("max_tokens", 512)))
    overlap = int(config.params.get("overlap_tokens", 0))
    text = doc.content
    if not text:
        validate_chunk_offsets(doc, [])
        return []
    chunks: list[Chunk] = []
    idx = 0
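    # Walk heading-delimited sections; semchunk splits each by token budget.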
    for sec_a, sec_b in self._section_spans(text):
        section = text[sec_a:sec_b]
        if not section.strip():
            continue
        out = semchunk.chunk(
            section,
            max_tokens,
            self._count,
            offsets=True,
            overlap=overlap or None,
        )
        pieces, offsets = out
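        # Map semchunk's section-local offsets back to absolute document offsets.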
        for piece, (la, lb) in zip(pieces, offsets, strict=True):
            a = sec_a + la
            b = sec_a + lb
            slice_text = text[a:b]
            if slice_text != piece:
                raise ValueError(
                    f"semchunk offset mismatch at [{a}:{b}]: "
                    f"expected {piece[:40]!r}, got {slice_text[:40]!r}. "
                    "This is a semchunk bug — please report it."
                )
            chunks.append(
                Chunk.from_document(
                    doc,
                    id=f"{doc.id}_mdsem_{idx}",
                    start_offset=a,
                    end_offset=b,
                    tokens=self._count(slice_text),
                    metadata={"section_start": sec_a},
                )
            )
            idx += 1
    validate_chunk_offsets(doc, chunks)
    return chunks
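
The _section_spans helper is also not shown here. A minimal sketch under the assumption that ATX headings (#, ##, ...) open new sections and that the spans tile the entire document, which the offset arithmetic above requires:

import re

_HEADING = re.compile(r"^#{1,6}\s", re.MULTILINE)

def _section_spans(self, text: str) -> list[tuple[int, int]]:
    # One span per heading-delimited section; spans are contiguous and
    # cover all of text, so sec_a + local offset stays a valid index.
    starts = [m.start() for m in _HEADING.finditer(text)]
    if not starts or starts[0] != 0:
        starts.insert(0, 0)  # preamble before the first heading
    ends = starts[1:] + [len(text)]
    return list(zip(starts, ends))

A real implementation would likely also ignore # lines inside fenced code blocks and may handle setext headings; the sketch above does not.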