ingestion.file_ingestor

chunktuner.ingestion.file_ingestor

Load documents from filesystem paths.

FileIngestor

FileIngestor(root=None)

Load Document records from filesystem paths (single file or directory tree).

Source code in src/chunktuner/ingestion/file_ingestor.py
def __init__(self, root: Path | None = None):
    """Create an ingestor, optionally bound to a root path.

    Args:
        root: Optional base path. When given, its resolved form is stored
            as ``self.root``; otherwise ``self.root`` is ``None``.
    """
    if root:
        self.root = root.resolve()
    else:
        self.root = None

ingest_path

ingest_path(path, *, content_type_override=None)

Ingest a single file or expand a directory via ingest_dir.

Source code in src/chunktuner/ingestion/file_ingestor.py
def ingest_path(
    self,
    path: Path,
    *,
    content_type_override: str | None = None,
) -> list[Document]:
    """Return the documents found at ``path``, which may be a file or a directory.

    The path is first passed through ``self._ensure_under_root``. A directory
    is delegated to ``ingest_dir``; anything else is ingested as a single file.

    Args:
        path: File or directory to ingest.
        content_type_override: Forwarded to the single-file ingestion only;
            it is not passed on when ``path`` is a directory.

    Raises:
        FileNotFoundError: If the checked path does not exist.
    """
    target = self._ensure_under_root(path)
    if not target.exists():
        raise FileNotFoundError(target)
    if not target.is_dir():
        # Single file: honour the caller's content-type override.
        return self._ingest_file_multi(
            target,
            content_type_override=content_type_override,
        )
    return self.ingest_dir(target)

ingest_dir

ingest_dir(path, glob='**/*')

Walk `path` with `glob` and ingest every file with a supported extension.

Source code in src/chunktuner/ingestion/file_ingestor.py
def ingest_dir(self, path: Path, glob: str = "**/*") -> list[Document]:
    """Walk ``path`` with ``glob`` and ingest every file with a supported extension.

    Matches are visited in sorted order. Entries that are not regular files,
    or whose lower-cased suffix is not in ``self.SUPPORTED_EXTENSIONS``, are
    ignored. Ingestion is best-effort: a file whose ingestion raises
    ``ImportError``, ``NotImplementedError`` or ``OSError`` is skipped, and
    the skip is logged as a warning rather than dropped silently.

    Args:
        path: Directory to search; it is first passed through
            ``self._ensure_under_root``.
        glob: Pattern handed to ``Path.glob``.

    Returns:
        Documents from every file that was ingested successfully, in sorted
        path order.
    """
    # Function-scope import so the block does not rely on a module-level logger.
    import logging

    log = logging.getLogger(__name__)
    path = self._ensure_under_root(path)
    docs: list[Document] = []
    for p in sorted(path.glob(glob)):
        if not p.is_file():
            continue
        if p.suffix.lower() not in self.SUPPORTED_EXTENSIONS:
            continue
        try:
            docs.extend(self._ingest_file_multi(p, content_type_override=None))
        except (ImportError, NotImplementedError, OSError) as exc:
            # One bad file must not abort the walk, but the skip should be visible.
            log.warning("Skipping %s: %s: %s", p, type(exc).__name__, exc)
    return docs