def chunk(self, doc: Document, config: ChunkConfig) -> list[Chunk]:
    """Split *doc* into chunks along top-level Python defs/classes.

    Walks the tree-sitter AST and emits one chunk per top-level
    ``function_definition`` / ``class_definition``. A node whose text
    exceeds ``max_tokens`` is re-chunked with the fallback line-window
    chunker and its sub-chunk offsets are translated back into document
    coordinates. Non-Python documents, a missing parser, or a document
    with no top-level defs/classes fall back entirely to line windows.

    Args:
        doc: Source document; ``doc.language`` selects the AST path.
        config: ``params["max_tokens"]`` caps tokens per chunk (default 512).

    Returns:
        Chunks whose character offsets index into ``doc.content``; offsets
        are checked with ``validate_chunk_offsets`` before returning.
    """
    validate_content_type(self.name, self.supported_content_types, doc.content_type)
    max_tokens = int(config.params.get("max_tokens", 512))
    lang = (doc.language or "").lower()
    # Empty language string is treated as Python (optimistic default).
    use_ast = self._parser is not None and lang in ("", "python", "py")
    if not use_ast:
        out = self._fallback().chunk(doc, config)
        validate_chunk_offsets(doc, out)
        return out
    b = doc.content.encode("utf8")
    tree = self._parser.parse(b)
    root = tree.root_node
    targets = ("function_definition", "class_definition")
    chunks: list[Chunk] = []
    idx = 0
    for child in root.children:
        if child.type not in targets:
            continue
        start_b, end_b = child.start_byte, child.end_byte
        # tree-sitter reports byte offsets; convert to character offsets.
        # Node boundaries fall on UTF-8 character boundaries, so decoding
        # the prefix is safe.
        start_c = len(b[:start_b].decode("utf8"))
        end_c = len(b[:end_b].decode("utf8"))
        text = doc.content[start_c:end_c]
        n_tokens = len(self._enc.encode(text))  # encode once, reuse below
        if n_tokens > max_tokens:
            # Oversized node: re-chunk its text with the line-window
            # fallback, then shift sub-chunk offsets back into doc space.
            subdoc = Document(
                id=doc.id,
                content=text,
                content_type="code",
                path=doc.path,
                language=doc.language,
                metadata=doc.metadata,
            )
            sub_config = ChunkConfig(
                name="code_window", params={"max_tokens": max_tokens, "overlap_lines": 2}
            )
            for c in self._fallback().chunk(subdoc, sub_config):
                chunks.append(
                    Chunk.from_document(
                        doc,
                        id=f"{doc.id}_ast_{idx}",
                        start_offset=start_c + c.start_offset,
                        end_offset=start_c + c.end_offset,
                        tokens=c.tokens,
                    )
                )
                # Fix: bump per emitted sub-chunk; previously every
                # sub-chunk of one oversized node shared the same id.
                idx += 1
            continue
        chunks.append(
            Chunk.from_document(
                doc,
                id=f"{doc.id}_ast_{idx}",
                start_offset=start_c,
                end_offset=end_c,
                tokens=n_tokens,
            )
        )
        idx += 1
    if not chunks:
        # No top-level defs/classes (e.g. a flat script): fall back.
        out = self._fallback().chunk(doc, config)
        validate_chunk_offsets(doc, out)
        return out
    validate_chunk_offsets(doc, chunks)
    return chunks