chunking.code_ast

chunktuner.chunking.code_ast

AST-aware chunking for Python (tree-sitter is an optional extra).

CodeASTStrategy

CodeASTStrategy(encoding_name='cl100k_base')

Python tree-sitter top-level definitions, with token-capped fallback splits.

Source code in src/chunktuner/chunking/code_ast.py
def __init__(self, encoding_name: str = "cl100k_base"):
    """Initialize the token encoder and, if installed, a tree-sitter Python parser."""
    self._encoding_name = encoding_name
    self._enc = tiktoken.get_encoding(encoding_name)
    # tree-sitter is an optional extra: when the bindings are missing,
    # self._parser stays None and chunk() routes through the fallback.
    self._parser = None
    try:
        from tree_sitter import Language, Parser
        import tree_sitter_python as tsp
        self._parser = Parser(Language(tsp.language()))
    except ImportError:
        pass

chunk

chunk(doc, config)

Emit function/class chunks for Python AST; non-Python falls back to line windows.

Source code in src/chunktuner/chunking/code_ast.py
def chunk(self, doc: Document, config: ChunkConfig) -> list[Chunk]:
    """Emit function/class chunks for Python AST; non-Python falls back to line windows.

    Each top-level ``function_definition``/``class_definition`` node becomes one
    chunk. A definition whose token count exceeds ``max_tokens`` (from
    ``config.params``, default 512) is re-split with the line-window fallback and
    its sub-chunk offsets are rebased onto *doc*. Falls back entirely when no
    parser is available, the language is not Python, or no top-level definitions
    are found. All emitted offsets are validated against *doc* before returning.
    """
    validate_content_type(self.name, self.supported_content_types, doc.content_type)
    max_tokens = int(config.params.get("max_tokens", 512))
    lang = (doc.language or "").lower()
    use_ast = self._parser is not None and lang in ("", "python", "py")
    if not use_ast:
        out = self._fallback().chunk(doc, config)
        validate_chunk_offsets(doc, out)
        return out
    b = doc.content.encode("utf8")
    tree = self._parser.parse(b)
    root = tree.root_node
    targets = ("function_definition", "class_definition")
    chunks: list[Chunk] = []
    idx = 0
    # tree-sitter reports byte offsets; map them to character offsets
    # incrementally (children come in source order), decoding each byte span
    # once instead of re-decoding the whole prefix per node (was O(n^2)).
    prev_b = prev_c = 0
    for child in root.children:
        if child.type not in targets:
            continue
        start_b, end_b = child.start_byte, child.end_byte
        start_c = prev_c + len(b[prev_b:start_b].decode("utf8"))
        end_c = start_c + len(b[start_b:end_b].decode("utf8"))
        prev_b, prev_c = end_b, end_c
        text = doc.content[start_c:end_c]
        # Tokenize once and reuse; the original encoded oversized text twice.
        n_tokens = len(self._enc.encode(text))
        if n_tokens > max_tokens:
            # Definition too large for a single chunk: re-split its text with
            # the line-window fallback, then shift sub-chunk offsets (which are
            # relative to `text`) back into doc coordinates.
            subdoc = Document(
                id=doc.id,
                content=text,
                content_type="code",
                path=doc.path,
                language=doc.language,
                metadata=doc.metadata,
            )
            sub_cfg = ChunkConfig(
                name="code_window", params={"max_tokens": max_tokens, "overlap_lines": 2}
            )
            for c in self._fallback().chunk(subdoc, sub_cfg):
                chunks.append(
                    Chunk.from_document(
                        doc,
                        id=f"{doc.id}_ast_{idx}",
                        start_offset=start_c + c.start_offset,
                        end_offset=start_c + c.end_offset,
                        tokens=c.tokens,
                    )
                )
                idx += 1
            continue
        chunks.append(
            Chunk.from_document(
                doc,
                id=f"{doc.id}_ast_{idx}",
                start_offset=start_c,
                end_offset=end_c,
                tokens=n_tokens,
            )
        )
        idx += 1
    if not chunks:
        # No top-level definitions (e.g. a script of bare statements): fall back.
        out = self._fallback().chunk(doc, config)
        validate_chunk_offsets(doc, out)
        return out
    validate_chunk_offsets(doc, chunks)
    return chunks