chunking.pdf_structural

chunktuner.chunking.pdf_structural

Layout-inspired splitting for PDF-derived Markdown; `## Page N` markers are optional.

PdfStructuralStrategy

PdfStructuralStrategy(encoding_name='cl100k_base')

Page or section headings define regions; a region longer than the character cap (`max_region_chars`) is split into consecutive pieces of at most that many characters.

Source code in src/chunktuner/chunking/pdf_structural.py
def __init__(self, encoding_name: str = "cl100k_base"):
    """Look up the tiktoken encoding called ``encoding_name`` and keep it.

    The resolved encoding object is stored on the instance as ``self._enc``.
    """
    encoder = tiktoken.get_encoding(encoding_name)
    self._enc = encoder

chunk

chunk(doc, config)

Emit one chunk per sub-region of at most max_region_chars characters within each structural span.

Source code in src/chunktuner/chunking/pdf_structural.py
def chunk(self, doc: Document, config: ChunkConfig) -> list[Chunk]:
    """Emit one chunk per sub-region up to ``max_region_chars`` within each structural span.

    Every ``(start, end)`` span yielded by ``self._region_spans`` is cut into
    consecutive pieces of at most ``max_region_chars`` characters. The cap is
    read from ``config.params`` and defaults to 4000. Chunk offsets index into
    ``doc.content``, token counts are computed with ``self._enc``, and the
    document's ``page_number`` is copied into each chunk's metadata when it is
    not ``None``.

    Returns:
        The chunks in document order, or an empty list when ``doc.content``
        is empty.

    Raises:
        ValueError: if ``max_region_chars`` is smaller than 1.

    ``validate_content_type`` and ``validate_chunk_offsets`` are also called
    and may raise on invalid input (their behavior is defined elsewhere).
    """
    validate_content_type(self.name, self.supported_content_types, doc.content_type)
    text = doc.content
    if not text:
        return []
    max_chars = int(config.params.get("max_region_chars", 4000))
    if max_chars < 1:
        # With a non-positive cap ``end`` never moves past ``pos``, so the
        # splitting loop below would never terminate.
        raise ValueError(f"max_region_chars must be >= 1, got {max_chars}")
    # The page number does not change between chunks; build it once and hand
    # each chunk its own copy so later mutation of one does not leak to others.
    base_meta: dict = {}
    if doc.page_number is not None:
        base_meta["page_number"] = doc.page_number
    chunks: list[Chunk] = []
    idx = 0  # running chunk index across all regions, used in the chunk id
    for a, b in self._region_spans(text):
        pos = a
        while pos < b:
            end = min(pos + max_chars, b)
            piece = text[pos:end]
            chunks.append(
                Chunk.from_document(
                    doc,
                    id=f"{doc.id}_pdf_{idx}",
                    start_offset=pos,
                    end_offset=end,
                    tokens=len(self._enc.encode(piece)),
                    metadata=dict(base_meta),
                )
            )
            idx += 1
            pos = end
    validate_chunk_offsets(doc, chunks)
    return chunks