chunktuner.chunking.structural_semantic

Splits a document into coarse structural regions, then refines each region into token-sized windows (for markdown extracted from PDF/DOCX/PPTX).

StructuralSemanticStrategy

StructuralSemanticStrategy(encoding_name='cl100k_base')

Produces coarse PdfStructuralStrategy regions, then refines each region into fixed-token sub-windows.

Source code in src/chunktuner/chunking/structural_semantic.py
def __init__(self, encoding_name: str = "cl100k_base"):
    self._encoding_name = encoding_name
    # Resolve the tokenizer once; it is reused for all token counting.
    self._enc = tiktoken.get_encoding(encoding_name)
    # The coarse structural pass shares the same encoding.
    self._struct = PdfStructuralStrategy(encoding_name=encoding_name)
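
The constructor resolves the tiktoken encoding once and shares it with the coarse structural pass. A quick sketch of what that encoder provides (standard tiktoken API; the sample string is illustrative):

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
ids = enc.encode("Structural then semantic chunking")  # list of token ids
assert enc.decode(ids) == "Structural then semantic chunking"
print(len(ids))  # token count is what drives window sizing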

chunk

chunk(doc, config)

Map structural regions to absolute-offset sub-chunks using FixedTokenStrategy.
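
A minimal usage sketch: the recognized params (max_tokens, overlap_tokens, max_region_chars) come from the implementation below, while the import path for Document and ChunkConfig and the Document field values are assumptions to adjust for your install.

# Hypothetical import path for the core models; adjust as needed.
from chunktuner.models import Document, ChunkConfig
from chunktuner.chunking.structural_semantic import StructuralSemanticStrategy

markdown_text = "# Report\n\n## Findings\n\nBody text extracted from a PDF..."
doc = Document(
    id="report-1",
    content=markdown_text,
    content_type="markdown",
    path="report.pdf",
    page_number=None,
    metadata={},
)
config = ChunkConfig(
    name="structural_semantic",
    params={"max_tokens": 256, "overlap_tokens": 32, "max_region_chars": 8000},
)
chunks = StructuralSemanticStrategy().chunk(doc, config)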

Source code in src/chunktuner/chunking/structural_semantic.py
def chunk(self, doc: Document, config: ChunkConfig) -> list[Chunk]:
    """Map structural regions to absolute-offset sub-chunks using `FixedTokenStrategy`."""
    from chunktuner.chunking.fixed_tokens import FixedTokenStrategy

    validate_content_type(self.name, self.supported_content_types, doc.content_type)
    # Window parameters: clamp max_tokens to a floor of 32; overlap defaults to 0.
    max_tokens = max(32, int(config.params.get("max_tokens", 512)))
    overlap = int(config.params.get("overlap_tokens", 0))
    # Coarse pass: structural regions over the markdown, capped by character size.
    coarse = dict(config.params)
    coarse.setdefault("max_region_chars", 8000)
    regions = self._struct.chunk(doc, ChunkConfig(name="pdf_structural", params=coarse))
    # Fine pass: one FixedTokenStrategy instance reused across all regions.
    ft = FixedTokenStrategy(encoding_name=self._encoding_name)
    out: list[Chunk] = []
    idx = 0
    for r in regions:
        # Wrap the region text as a standalone document so FixedTokenStrategy
        # can window it; its offsets come back relative to the region.
        subdoc = Document(
            id=doc.id,
            content=r.text,
            content_type="markdown",
            path=doc.path,
            page_number=doc.page_number,
            metadata=dict(doc.metadata),
        )
        subchunks = ft.chunk(
            subdoc,
            ChunkConfig(
                name="fixed_tokens",
                params={"max_tokens": max_tokens, "overlap_tokens": overlap},
            ),
        )
        for sc in subchunks:
            # Shift region-relative offsets back into absolute document offsets.
            abs_start = r.start_offset + sc.start_offset
            abs_end = r.start_offset + sc.end_offset
            out.append(
                Chunk.from_document(
                    doc,
                    id=f"{doc.id}_ss_{idx}",
                    start_offset=abs_start,
                    end_offset=abs_end,
                    tokens=sc.tokens,
                    metadata={**r.metadata, **sc.metadata},
                )
            )
            idx += 1
    validate_chunk_offsets(doc, out)
    return out
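
Every returned chunk carries absolute offsets into the original document, and the final validate_chunk_offsets call enforces that. A quick check of the invariant, continuing from the usage sketch above:

for c in chunks:
    # Offsets are absolute, so slicing the original markdown recovers each window.
    piece = doc.content[c.start_offset:c.end_offset]
    assert 0 <= c.start_offset < c.end_offset <= len(doc.content)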