def ingest_repo(self, root: Path) -> list[Document]:
"""Recursively ingest supported extensions under ``root``."""
root = root.resolve()
docs: list[Document] = []
exts = {".py", ".js", ".ts", ".tsx", ".go", ".java", ".rs", ".cpp", ".c", ".md", ".txt"}
for path in sorted(root.rglob("*")):
if not path.is_file():
continue
if path.suffix.lower() not in exts:
continue
if self._skip(path, root):
continue
try:
raw = path.read_text(encoding="utf-8", errors="replace")
except OSError:
continue
detected = detect_content_type(path, raw)
if detected in ("markdown", "text"):
ct: ContentType = "markdown" if detected == "markdown" else "text"
content = preprocess(raw, detected)
else:
ct = "code"
content = raw
lang = _EXT_LANG.get(path.suffix.lower())
docs.append(
Document(
id=str(uuid.uuid4()),
content=content,
content_type=ct,
path=str(path),
language=lang,
metadata={"repo_root": str(root), "rel_path": str(path.relative_to(root))},
)
)
return docs