chunktuner.eval.evaluator

Retrieval-style evaluation over chunking strategies.

Evaluator

Evaluator(
    embedding_fn,
    top_k=5,
    enable_generation_metrics=False,
    llm_client=None,
    *,
    encoding_name="cl100k_base",
    context_budget_tokens=2000,
    batch_size=64,
    llm_answer_model=None,
    ragas_bridge=None,
)

Chunks documents, embeds chunk and query text, computes retrieval metrics per query, and optionally adds RAGAS generation metrics.

Source code in src/chunktuner/eval/evaluator.py
def __init__(
    self,
    embedding_fn: EmbeddingFunction,
    top_k: int = 5,
    enable_generation_metrics: bool = False,
    llm_client: object | None = None,
    *,
    encoding_name: str = "cl100k_base",
    context_budget_tokens: int = 2000,
    batch_size: int = 64,
    llm_answer_model: str | None = None,
    ragas_bridge: object | None = None,
):
    self.embedding_fn = embedding_fn
    self.top_k = top_k
    self.enable_generation_metrics = enable_generation_metrics
    self.llm_client = llm_client
    self._enc = tiktoken.get_encoding(encoding_name)
    self.context_budget_tokens = context_budget_tokens
    self.batch_size = batch_size
    self.llm_answer_model = llm_answer_model or "gpt-4o-mini"
    self.ragas_bridge = ragas_bridge
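
The snippet below is a hedged construction sketch, not taken from the package: HashingEmbeddings is a made-up stand-in for a real embedding provider. It exposes embed_query and profile_name, which the evaluator uses directly; embed_documents is an assumption about what the internal _embed_batched helper calls, since that helper's body is not shown on this page.

import hashlib

import numpy as np

from chunktuner.eval.evaluator import Evaluator


class HashingEmbeddings:
    """Deterministic toy embedder for local experiments (illustrative only)."""

    profile_name = "hashing-demo"

    def _embed(self, text: str) -> list[float]:
        # Seed a small random vector from a stable hash of the text.
        seed = int.from_bytes(hashlib.md5(text.encode()).digest()[:4], "big")
        return np.random.default_rng(seed).standard_normal(128).tolist()

    def embed_query(self, text: str) -> list[float]:
        return self._embed(text)

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        return [self._embed(t) for t in texts]


evaluator = Evaluator(
    embedding_fn=HashingEmbeddings(),
    context_budget_tokens=2000,
    batch_size=64,
)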

evaluate

evaluate(strategy, config, docs, dataset, *, scorer=None)

Evaluate one strategy configuration on a document set and dataset.

Chunks each document, validates offsets, embeds chunks and dataset queries, computes retrieval metrics per query, optionally generation metrics, and assigns score when scorer is provided.

Parameters:

Name      Type                    Description                                   Default
strategy  ChunkingStrategy        Registered chunking implementation.           required
config    ChunkConfig             Strategy name and parameters.                 required
docs      list[Document]          Corpus (ids must match dataset references).   required
dataset   EvalDataset             Queries and gold spans for scoring.           required
scorer    ScoreCalculator | None  If set, used to populate EvalResult.score.    None

Returns:

Type        Description
EvalResult  EvalResult with EvalMetrics and composite score.

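A hedged call-shape sketch follows. It reuses the evaluator constructed in the example above and assumes that strategy, config, docs, and dataset have already been built with the library's own helpers; their constructors are not shown on this page.

result = evaluator.evaluate(
    strategy=strategy,  # registered ChunkingStrategy implementation
    config=config,      # ChunkConfig: strategy name and parameters
    docs=docs,          # list[Document]; ids must match the dataset's document_id references
    dataset=dataset,    # EvalDataset with queries and gold answer spans
    scorer=None,        # None falls back to ScoreCalculator("rag_qa")
)
print(result.score, result.metrics.recall_at_k, result.metrics.token_iou)
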
Source code in src/chunktuner/eval/evaluator.py
def evaluate(
    self,
    strategy: ChunkingStrategy,
    config: ChunkConfig,
    docs: list[Document],
    dataset: EvalDataset,
    *,
    scorer: ScoreCalculator | None = None,
) -> EvalResult:
    """Evaluate one strategy configuration on a document set and dataset.

    Chunks each document, validates offsets, embeds chunks and dataset queries,
    computes retrieval metrics per query, optionally generation metrics, and
    assigns ``score`` when ``scorer`` is provided.

    Args:
        strategy: Registered chunking implementation.
        config: Strategy name and parameters.
        docs: Corpus (ids must match ``dataset`` references).
        dataset: Queries and gold spans for scoring.
        scorer: If set, used to populate `EvalResult.score`.

    Returns:
        `EvalResult` with `EvalMetrics` and composite score.
    """
    docs_by_id = {d.id: d for d in docs}
    all_chunks: list[Chunk] = []
    for d in docs:
        all_chunks.extend(strategy.chunk(d, config))

    self._validate_offsets_sample(all_chunks, docs_by_id)

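    # Embed every chunk text in batches and each query individually, timing the whole pass.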
    t0 = time.perf_counter()
    chunk_vecs = self._embed_batched([c.text for c in all_chunks])
    q_vecs: dict[str, np.ndarray] = {}
    for q in dataset.queries:
        v = np.array(self.embedding_fn.embed_query(q.question), dtype=np.float64)
        q_vecs[q.id] = v
    latency_ms = (time.perf_counter() - t0) * 1000

    by_doc: dict[str, list[tuple[int, Chunk]]] = defaultdict(list)
    for idx, ch in enumerate(all_chunks):
        by_doc[ch.document_id].append((idx, ch))

    tok_lens = [len(self._enc.encode(c.text)) for c in all_chunks]
    avg_chunk_len = float(np.mean(tok_lens)) if all_chunks else 0.0
    chunk_std = float(np.std(tok_lens)) if len(all_chunks) > 1 else 0.0

    recalls_at: dict[int, list[float]] = {1: [], 3: [], 5: []}
    mrrs: list[float] = []
    ndcgs: dict[int, list[float]] = {1: [], 3: [], 5: []}
    precs: list[float] = []
    recalls: list[float] = []
    ious: list[float] = []
    dups: list[float] = []
    toks_per_q: list[float] = []
    topk_by_q: dict[str, list[Chunk]] = {}

    for q in dataset.queries:
        doc = docs_by_id.get(q.document_id)
        if doc is None:
            logger.warning(
                "Query %r references document_id=%r which is not in the ingested corpus. "
                "Skipping. Check that document IDs in the eval dataset match ingested doc IDs.",
                q.id,
                q.document_id,
            )
            continue
        bounds = _token_bounds(self._enc, doc.content)
        gold: set[int] = set()
        for a, b in q.answer_spans:
            gold |= _token_indices_for_span(bounds, a, b)
        if not gold:
            continue

        pairs = by_doc.get(q.document_id, [])
        if not pairs:
            continue
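        # Rank this document's chunks by cosine similarity: L2-normalize the chunk
        # matrix and the query vector, then sort by descending dot product.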
        idxs = [p[0] for p in pairs]
        mat = np.stack([chunk_vecs[i] for i in idxs])
        qv = q_vecs[q.id]
        qn = qv / (np.linalg.norm(qv) + _EPS)
        matn = mat / (np.linalg.norm(mat, axis=1, keepdims=True) + _EPS)
        sims = matn @ qn
        order = np.argsort(-sims)
        ranked_chunks = [pairs[int(j)][1] for j in order]

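        # Budget-aware cutoff from compute_effective_k, clamped to [1, number of ranked chunks].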
        eff_k = compute_effective_k(avg_chunk_len, self.context_budget_tokens)
        k_use = min(max(eff_k, 1), len(ranked_chunks))

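        # Binary relevance per ranked chunk: 1.0 if it overlaps any gold token index.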
        rels = [
            1.0 if gold & _token_indices_for_chunk(bounds, ranked_chunks[j]) else 0.0
            for j in range(len(ranked_chunks))
        ]

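        # Hit-based recall@k and graded nDCG@k at fixed cutoffs of 1, 3, and 5.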
        for kk in (1, 3, 5):
            top = ranked_chunks[: min(kk, len(ranked_chunks))]
            hit = any(gold & _token_indices_for_chunk(bounds, c) for c in top)
            recalls_at[kk].append(1.0 if hit else 0.0)
            ndcgs[kk].append(_ndcg_at_k(rels, kk))

        first_rel = next((i + 1 for i, r in enumerate(rels) if r > 0), None)
        mrrs.append(1.0 / first_rel if first_rel else 0.0)

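        # Token overlap between the gold spans and the union of top-k chunk tokens yields
        # precision, recall, and IoU; duplication and token cost are also tracked per query.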
        topk = ranked_chunks[:k_use]
        ret_union: set[int] = set()
        for c in topk:
            ret_union |= _token_indices_for_chunk(bounds, c)
        inter = gold & ret_union
        precs.append(len(inter) / max(1, len(ret_union)))
        recalls.append(len(inter) / max(1, len(gold)))
        union = gold | ret_union
        ious.append(len(inter) / max(1, len(union)))
        dups.append(_duplication_ratio(topk, bounds))
        toks_per_q.append(float(sum(len(self._enc.encode(c.text)) for c in topk)))
        topk_by_q[q.id] = topk

    def mean(xs: list[float]) -> float:
        return float(sum(xs) / len(xs)) if xs else 0.0

    recall_at_k_dict = {k: mean(v) for k, v in recalls_at.items()}
    ndcg_at_k_dict = {k: mean(v) for k, v in ndcgs.items()}

    metrics = EvalMetrics(
        token_iou=mean(ious),
        token_precision=mean(precs),
        token_recall=mean(recalls),
        recall_at_k=recall_at_k_dict,
        mrr=mean(mrrs),
        ndcg_at_k=ndcg_at_k_dict,
        avg_tokens_per_query=mean(toks_per_q),
        duplication_ratio=mean(dups),
        avg_chunk_length=avg_chunk_len,
        chunk_length_std=chunk_std,
        embedding_latency_ms=latency_ms,
        total_embedding_tokens=sum(len(self._enc.encode(c.text)) for c in all_chunks)
        + sum(len(self._enc.encode(q.question)) for q in dataset.queries),
    )

    if self.enable_generation_metrics and topk_by_q:
        metrics = self._apply_generation_metrics(
            metrics,
            docs_by_id,
            dataset,
            topk_by_q,
        )

    sc = scorer or ScoreCalculator("rag_qa")
    score = sc.score(metrics)
    return EvalResult(
        strategy_name=strategy.name,
        config=config,
        embedding_profile=self.embedding_fn.profile_name,
        metrics=metrics,
        score=score,
    )