Source code for world_models.benchmarks.metrics
from __future__ import annotations
from typing import Iterable, List, Dict
import numpy as np
[docs]
def compute_aggregate_metrics(per_seed_means: Iterable[float]) -> Dict[str, float]:
    """Summarise per-seed scores with mean, median, IQM and the seed count.

    Args:
        per_seed_means: One score per seed. Any iterable is accepted; it is
            consumed once and converted to a float array.

    Returns:
        A dict with the keys ``"mean"``, ``"median"``, ``"iqm"`` and
        ``"num_seeds"``. When no scores are supplied, the three statistics
        are reported as ``0.0`` and ``"num_seeds"`` as ``0``.
    """
    scores = np.array(list(per_seed_means), dtype=float)
    seed_count = int(scores.size)
    if seed_count == 0:
        return {"mean": 0.0, "median": 0.0, "iqm": 0.0, "num_seeds": 0}

    # The interquartile mean is delegated to this module's iqm_of_array.
    return {
        "mean": float(np.mean(scores)),
        "median": float(np.median(scores)),
        "iqm": float(iqm_of_array(scores)),
        "num_seeds": seed_count,
    }
[docs]
def bootstrap_ci(
    values: List[float],
    num_samples: int = 1000,
    alpha: float = 0.05,
    rng: np.random.Generator | None = None,
) -> tuple[float, float]:
    """Compute a simple bootstrap ``1 - alpha`` confidence interval on the mean.

    Args:
        values: Observations to resample. Lists, tuples, 1-D NumPy arrays and
            other iterables of numbers are accepted. ``None`` or an empty
            input yields ``(0.0, 0.0)``.
        num_samples: Number of bootstrap resamples to draw.
        alpha: Two-sided significance level; the interval spans the
            ``100 * alpha / 2`` and ``100 * (1 - alpha / 2)`` percentiles of
            the resampled means.
        rng: Optional ``numpy.random.Generator`` used for resampling. When
            omitted, the global ``np.random`` state is used, so existing
            callers relying on ``np.random.seed`` are unaffected.

    Returns:
        ``(lower, upper)`` bounds of the interval as Python floats.
    """
    if values is None:
        return (0.0, 0.0)

    # Materialise before testing for emptiness: ``not values`` raises for
    # multi-element NumPy arrays and is always truthy for generators.
    vals = np.asarray(list(values), dtype=float)
    n = vals.size
    if n == 0:
        return (0.0, 0.0)

    # Both the ``np.random`` module and ``Generator`` expose a compatible
    # ``choice(a, size=..., replace=...)``.
    sampler = np.random if rng is None else rng
    means = np.empty(num_samples, dtype=float)
    for i in range(num_samples):
        means[i] = sampler.choice(vals, size=n, replace=True).mean()

    lower = float(np.percentile(means, 100 * (alpha / 2)))
    upper = float(np.percentile(means, 100 * (1 - alpha / 2)))
    return lower, upper
[docs]
def iqm_of_array(values: Iterable[float]) -> float:
    """Return the interquartile mean (IQM) of ``values``.

    The IQM here is the average of the entries that fall inside the closed
    interval bounded by the 25th and 75th percentiles. It is a robust
    measure of central tendency used when reporting RL benchmark results.

    Empty input yields ``0.0``. If no entry falls inside the interval, the
    plain mean of all entries is returned instead.
    """
    data = np.array(list(values), dtype=float)
    if data.size == 0:
        return 0.0

    q25, q75 = (float(q) for q in np.percentile(data, [25, 75]))
    inside = data[np.logical_and(data >= q25, data <= q75)]

    # With interpolated percentiles the band can contain no entry at all
    # (e.g. two distinct values); average everything in that case.
    chosen = inside if inside.size else data
    return float(chosen.mean())
[docs]
def bootstrap_iqm_ci(
    values: List[float],
    num_samples: int = 1000,
    alpha: float = 0.05,
    rng: np.random.Generator | None = None,
) -> tuple[float, float]:
    """Bootstrap a confidence interval for the interquartile mean (IQM).

    Args:
        values: Observations to resample. Lists, tuples, 1-D NumPy arrays and
            other iterables of numbers are accepted. ``None`` or an empty
            input yields ``(0.0, 0.0)``.
        num_samples: Number of bootstrap resamples to draw.
        alpha: Two-sided significance level; the interval spans the
            ``100 * alpha / 2`` and ``100 * (1 - alpha / 2)`` percentiles of
            the bootstrap IQM distribution.
        rng: Optional ``numpy.random.Generator`` used for resampling. When
            omitted, the global ``np.random`` state is used, so existing
            callers relying on ``np.random.seed`` are unaffected.

    Returns:
        ``(lower, upper)`` percentiles of the bootstrap IQM distribution as
        Python floats.
    """
    if values is None:
        return (0.0, 0.0)

    # Materialise before testing for emptiness: ``not values`` raises for
    # multi-element NumPy arrays and is always truthy for generators.
    vals = np.asarray(list(values), dtype=float)
    n = vals.size
    if n == 0:
        return (0.0, 0.0)

    # Both the ``np.random`` module and ``Generator`` expose a compatible
    # ``choice(a, size=..., replace=...)``.
    sampler = np.random if rng is None else rng
    iqms = np.empty(num_samples, dtype=float)
    for i in range(num_samples):
        resample = sampler.choice(vals, size=n, replace=True)
        iqms[i] = iqm_of_array(resample)

    lower = float(np.percentile(iqms, 100 * (alpha / 2)))
    upper = float(np.percentile(iqms, 100 * (1 - alpha / 2)))
    return lower, upper