Spaces:

gpu2grid
/

live

Running

App Files Community

github-actions[bot] committed 9 days ago

Commit

e83942e

1 Parent(s): 817f4f7

deploy: sync from GitHub 2026-05-13T14:25:24Z

Browse files

Files changed (32) hide show

generate_heatmap.py +5 -2
openg2g/__init__.py +0 -3
openg2g/clock.py +0 -91
openg2g/common.py +0 -20
openg2g/controller/__init__.py +0 -1
openg2g/controller/base.py +0 -153
openg2g/controller/batch_size_schedule.py +0 -159
openg2g/controller/noop.py +0 -36
openg2g/controller/ofo.py +0 -793
openg2g/controller/tap_schedule.py +0 -70
openg2g/coordinator.py +0 -269
openg2g/datacenter/__init__.py +0 -1
openg2g/datacenter/base.py +0 -183
openg2g/datacenter/command.py +0 -31
openg2g/datacenter/config.py +0 -342
openg2g/datacenter/layout.py +0 -126
openg2g/datacenter/offline.py +0 -320
openg2g/datacenter/online.py +0 -1196
openg2g/datacenter/workloads/__init__.py +0 -4
openg2g/datacenter/workloads/inference.py +0 -1363
openg2g/datacenter/workloads/training.py +0 -200
openg2g/events.py +0 -60
openg2g/grid/__init__.py +0 -0
openg2g/grid/base.py +0 -203
openg2g/grid/command.py +0 -31
openg2g/grid/config.py +0 -92
openg2g/grid/opendss.py +0 -476
openg2g/metrics/__init__.py +0 -0
openg2g/metrics/voltage.py +0 -94
openg2g/utils.py +0 -18
pyproject.toml +8 -6
server.py +34 -90

generate_heatmap.py CHANGED Viewed

@@ -8,6 +8,8 @@ import matplotlib.patches as mpatches
 from matplotlib.lines import Line2D
 import matplotlib.cm as cm
 import matplotlib.colors as mcolors
@@ -196,12 +198,13 @@ def generate_heatmap(voltages, output_path, vmin=0.92, vmax=1.06, map_image="13b
     plt.rcParams['svg.fonttype'] = 'none'
     plt.savefig(output_path, format='svg', bbox_inches='tight', dpi=150, facecolor='white')
     plt.close()
-    print(f"[ok] saved {output_path}")
 if __name__ == "__main__":
     if len(sys.argv) < 15:
-        print("Usage: generate_heatmap.py <output.png> <v1> <v2> ... <v13> [dc_bus_idx]")
         sys.exit(1)
     out      = sys.argv[1]
     volts    = [float(v) for v in sys.argv[2:15]]

 from matplotlib.lines import Line2D
 import matplotlib.cm as cm
 import matplotlib.colors as mcolors
+import logging
+logger = logging.getLogger(__name__)
     plt.rcParams['svg.fonttype'] = 'none'
     plt.savefig(output_path, format='svg', bbox_inches='tight', dpi=150, facecolor='white')
     plt.close()
+    logger.info(f"Saved heatmap: {output_path}")
 if __name__ == "__main__":
     if len(sys.argv) < 15:
+        logger.info(f"Usage: generate_heatmap.py <output.png> <v1> <v2> ... <v13> [dc_bus_idx]")
         sys.exit(1)
     out      = sys.argv[1]
     volts    = [float(v) for v in sys.argv[2:15]]

openg2g/__init__.py DELETED Viewed

@@ -1,3 +0,0 @@
-"""OpenG2G: GPU-to-Grid framework for distribution-level voltage regulation."""
-__version__ = "0.1.0"

openg2g/clock.py DELETED Viewed

@@ -1,91 +0,0 @@
-"""Simulation clock with multi-rate support and optional live-mode wall-clock sync."""
-from __future__ import annotations
-import time
-import warnings
-from dataclasses import dataclass, field
-from fractions import Fraction
-@dataclass
-class SimulationClock:
-    """Integer-tick clock that avoids floating-point drift.
-    Components run at different rates (DC=0.1s, Grid=1.0s, Controller=1.0s or 60s).
-    The coordinator computes `tick_s` as the GCD of all component periods.
-    All time step parameters use `fractions.Fraction` for exact arithmetic.
-    The `time_s` property returns `float` for compatibility with numpy/plotting.
-    In live mode (`live=True`), the clock synchronizes with wall-clock time.
-    If computation falls behind, a warning is issued.
-    Attributes:
-        tick_s: Duration of one tick as a `Fraction` (seconds).
-        live: If `True`, synchronize with wall-clock time.
-    """
-    tick_s: Fraction
-    live: bool = False
-    _step: int = field(default=0, init=False, repr=False)
-    _wall_t0: float | None = field(default=None, init=False, repr=False)
-    def __post_init__(self) -> None:
-        if not isinstance(self.tick_s, Fraction):
-            raise TypeError(f"tick_s must be a Fraction, got {type(self.tick_s).__name__}")
-        if self.tick_s <= 0:
-            raise ValueError(f"tick_s must be positive, got {self.tick_s}")
-    @property
-    def time_s(self) -> float:
-        return float(self._step * self.tick_s)
-    @property
-    def step(self) -> int:
-        return self._step
-    def advance(self) -> float:
-        """Advance one tick.
-        Returns:
-            New simulation time in seconds.
-        """
-        self._step += 1
-        if self.live:
-            if self._wall_t0 is None:
-                self._wall_t0 = time.monotonic()
-            expected_wall = self._wall_t0 + self.time_s
-            now = time.monotonic()
-            if now < expected_wall:
-                time.sleep(expected_wall - now)
-            elif now - expected_wall > float(self.tick_s):
-                lag = now - expected_wall
-                warnings.warn(
-                    f"Clock lag: {lag:.3f}s behind wall time at sim t={self.time_s:.1f}s. "
-                    f"Control loop cannot keep up with real-time.",
-                    stacklevel=2,
-                )
-        return self.time_s
-    def reset(self) -> None:
-        """Reset clock to initial state (tick 0)."""
-        self._step = 0
-        self._wall_t0 = None
-    def is_due(self, period_s: Fraction) -> bool:
-        """Check if an event with the given period should fire on this tick.
-        Returns:
-            `True` if this tick is a multiple of the period.
-        Raises:
-            ValueError: If *period_s* is not an exact multiple of *tick_s*.
-        """
-        if period_s <= 0:
-            raise ValueError(f"period_s must be positive, got {period_s}")
-        ratio = period_s / self.tick_s
-        if ratio.denominator != 1:
-            raise ValueError(f"period_s={period_s} is not an exact multiple of tick_s={self.tick_s}")
-        period_ticks = int(ratio)
-        return self._step % period_ticks == 0

openg2g/common.py DELETED Viewed

@@ -1,20 +0,0 @@
-"""Cross-cutting types shared across component families."""
-from __future__ import annotations
-from dataclasses import dataclass
-@dataclass(frozen=True)
-class ThreePhase:
-    """Three-phase quantity. Access via `.a`, `.b`, `.c`.
-    Attributes:
-        a: Phase A value.
-        b: Phase B value.
-        c: Phase C value.
-    """
-    a: float
-    b: float
-    c: float

openg2g/controller/__init__.py DELETED Viewed

@@ -1 +0,0 @@
-"""Controllers receive datacenter and grid state and produce control actions."""

openg2g/controller/base.py DELETED Viewed

@@ -1,153 +0,0 @@
-"""Abstract base class for controllers."""
-from __future__ import annotations
-from abc import ABC, abstractmethod
-from fractions import Fraction
-from typing import Generic, TypeVar, Union, final, get_args, get_origin
-from openg2g.clock import SimulationClock
-from openg2g.datacenter.base import DatacenterBackend
-from openg2g.datacenter.command import DatacenterCommand
-from openg2g.events import EventEmitter
-from openg2g.grid.base import GridBackend
-from openg2g.grid.command import GridCommand
-DCBackendT = TypeVar("DCBackendT", bound=DatacenterBackend)
-GridBackendT = TypeVar("GridBackendT", bound=GridBackend)
-def _normalize_backend_type_arg(
-    arg: object,
-    *,
-    required_base: type[object],
-) -> tuple[type[object], ...]:
-    if isinstance(arg, type):
-        if issubclass(arg, required_base):
-            return (arg,)
-        raise TypeError(f"Controller generic type {arg!r} is not a subclass of {required_base.__name__}.")
-    origin = get_origin(arg)
-    # Handle parameterized generics like DatacenterBackend[OfflineDatacenterState]
-    if isinstance(origin, type) and issubclass(origin, required_base):
-        return (origin,)
-    if origin is Union:
-        out: list[type[object]] = []
-        for item in get_args(arg):
-            item_type = item if isinstance(item, type) else get_origin(item)
-            if not isinstance(item_type, type) or not issubclass(item_type, required_base):
-                raise TypeError(f"Controller generic type {item!r} is not a subclass of {required_base.__name__}.")
-            out.append(item_type)
-        return tuple(out)
-    raise TypeError(
-        f"Unsupported controller generic type argument: {arg!r}. Use a concrete class (or Union of concrete classes)."
-    )
-class Controller(Generic[DCBackendT, GridBackendT], ABC):
-    """Interface for a control component in the G2G framework.
-    Controllers receive datacenter and grid state and produce control actions.
-    Multiple controllers compose in order within the coordinator.
-    """
-    _dc_types: tuple[type[DatacenterBackend], ...] = (DatacenterBackend,)
-    _grid_types: tuple[type[GridBackend], ...] = (GridBackend,)
-    def __init_subclass__(cls, **kwargs: object) -> None:
-        super().__init_subclass__(**kwargs)
-        dc_types: tuple[type[DatacenterBackend], ...] | None = None
-        grid_types: tuple[type[GridBackend], ...] | None = None
-        for base in getattr(cls, "__orig_bases__", ()):
-            if get_origin(base) is Controller:
-                args = get_args(base)
-                if len(args) != 2:
-                    raise TypeError(
-                        f"{cls.__name__} must specialize Controller with two generic args: "
-                        "Controller[DatacenterType, GridType]."
-                    )
-                dc_raw, grid_raw = args
-                dc_norm = _normalize_backend_type_arg(dc_raw, required_base=DatacenterBackend)
-                grid_norm = _normalize_backend_type_arg(grid_raw, required_base=GridBackend)
-                dc_types = tuple(t for t in dc_norm if issubclass(t, DatacenterBackend))
-                grid_types = tuple(t for t in grid_norm if issubclass(t, GridBackend))
-                break
-        if dc_types is None or grid_types is None:
-            inherited = [b for b in cls.__bases__ if issubclass(b, Controller)]
-            inherited = [b for b in inherited if b is not Controller]
-            if inherited:
-                parent = inherited[0]
-                cls._dc_types = parent.compatible_datacenter_types()
-                cls._grid_types = parent.compatible_grid_types()
-                return
-            raise TypeError(
-                f"{cls.__name__} must explicitly specialize Controller generics as "
-                "Controller[DatacenterType, GridType]."
-            )
-        cls._dc_types = dc_types
-        cls._grid_types = grid_types
-    @final
-    @classmethod
-    def compatible_datacenter_types(cls) -> tuple[type[DatacenterBackend], ...]:
-        return cls._dc_types
-    @final
-    @classmethod
-    def compatible_grid_types(cls) -> tuple[type[GridBackend], ...]:
-        return cls._grid_types
-    @final
-    @classmethod
-    def compatibility_signature(cls) -> str:
-        dc = " | ".join(t.__name__ for t in cls.compatible_datacenter_types())
-        grid = " | ".join(t.__name__ for t in cls.compatible_grid_types())
-        return f"Controller[{dc}, {grid}]"
-    @property
-    @abstractmethod
-    def dt_s(self) -> Fraction:
-        """Control interval as a Fraction (seconds)."""
-    @abstractmethod
-    def reset(self) -> None:
-        """Reset simulation state to initial conditions.
-        Called by the coordinator before each [`start`][..start]. Must
-        clear all simulation state: dual variables, counters, cached
-        matrices. Configuration (dt_s, fits, step sizes) is not
-        affected.
-        Abstract so every implementation explicitly enumerates its state.
-        A forgotten field is a bug -- not clearing it silently corrupts
-        the second run.
-        """
-    def start(self) -> None:
-        """Acquire per-run resources.
-        Called after [`reset`][..reset], before the simulation loop.
-        No-op by default because most controllers have no resources to
-        acquire.
-        """
-    def stop(self) -> None:
-        """Release per-run resources. Simulation state is preserved.
-        Called after the simulation loop in LIFO order. No-op by default.
-        """
-    @abstractmethod
-    def step(
-        self,
-        clock: SimulationClock,
-        datacenter: DCBackendT,
-        grid: GridBackendT,
-        events: EventEmitter,
-    ) -> list[DatacenterCommand | GridCommand]:
-        """Compute control commands for this step. Return an empty list for no-op."""

openg2g/controller/batch_size_schedule.py DELETED Viewed

@@ -1,159 +0,0 @@
-"""Batch size schedule controller: applies pre-defined batch size changes at specified times."""
-from __future__ import annotations
-from collections.abc import Iterator
-from dataclasses import dataclass
-from fractions import Fraction
-from openg2g.clock import SimulationClock
-from openg2g.controller.base import Controller
-from openg2g.datacenter.base import DatacenterBackend
-from openg2g.datacenter.command import DatacenterCommand, SetBatchSize
-from openg2g.events import EventEmitter
-from openg2g.grid.base import GridBackend
-from openg2g.grid.command import GridCommand
-@dataclass(frozen=True)
-class BatchSizeChange:
-    """A batch size change event, optionally with gradual ramp-up.
-    Attributes:
-        batch_size: Target batch size (max_num_seqs).
-        ramp_up_rate: Requests/second ramp-up rate. 0 means immediate.
-    """
-    batch_size: int
-    ramp_up_rate: float = 0.0
-    def __post_init__(self) -> None:
-        if self.batch_size <= 0:
-            raise ValueError(f"batch_size must be positive, got {self.batch_size}.")
-        if self.ramp_up_rate < 0:
-            raise ValueError(f"ramp_up_rate must be >= 0, got {self.ramp_up_rate}.")
-    def at(self, t: float) -> BatchSizeSchedule:
-        """Schedule this change at time *t* seconds.
-        Returns:
-            A single-entry [`BatchSizeSchedule`][...BatchSizeSchedule].
-        """
-        return BatchSizeSchedule(((t, self),))
-class BatchSizeSchedule:
-    """Ordered sequence of batch size changes, built with `|` operator.
-    Example:
-    ```python
-    schedule = (
-        BatchSizeChange(48).at(40)
-        | BatchSizeChange(32).at(60)
-        | BatchSizeChange(48, ramp_up_rate=4).at(280)
-    )
-    ```
-    Raises:
-        ValueError: If two entries share the same timestamp.
-    """
-    __slots__ = ("_entries",)
-    def __init__(self, entries: tuple[tuple[float, BatchSizeChange], ...]) -> None:
-        self._entries = tuple(sorted(entries, key=lambda e: e[0]))
-        times = [t for t, _ in self._entries]
-        if len(times) != len(set(times)):
-            seen: set[float] = set()
-            dupes = sorted({t for t in times if t in seen or seen.add(t)})
-            raise ValueError(f"BatchSizeSchedule has duplicate timestamps: {dupes}")
-    def __or__(self, other: BatchSizeSchedule) -> BatchSizeSchedule:
-        return BatchSizeSchedule(self._entries + other._entries)
-    def __iter__(self) -> Iterator[tuple[float, BatchSizeChange]]:
-        return iter(self._entries)
-    def __len__(self) -> int:
-        return len(self._entries)
-    def __bool__(self) -> bool:
-        return bool(self._entries)
-    def __repr__(self) -> str:
-        parts: list[str] = []
-        for t, c in self._entries:
-            ramp = f", ramp_up_rate={c.ramp_up_rate}" if c.ramp_up_rate > 0 else ""
-            parts.append(f"BatchSizeChange({c.batch_size}{ramp}).at(t={t})")
-        return " | ".join(parts)
-class BatchSizeScheduleController(Controller[DatacenterBackend, GridBackend]):
-    """Applies pre-defined batch size changes at scheduled times.
-    Walks each model's schedule and emits
-    [`SetBatchSize`][openg2g.datacenter.command.SetBatchSize] commands when the
-    simulation clock reaches the scheduled time.
-    Args:
-        schedules: Per-model batch size schedules, keyed by model label.
-        dt_s: How often the controller checks the schedule (seconds).
-    """
-    def __init__(
-        self,
-        *,
-        schedules: dict[str, BatchSizeSchedule],
-        dt_s: Fraction = Fraction(1),
-    ) -> None:
-        self._dt_s = dt_s
-        self._schedules = dict(schedules)
-        self._indices: dict[str, int] = {label: 0 for label in schedules}
-    def reset(self) -> None:
-        self._indices = {label: 0 for label in self._schedules}
-    @property
-    def dt_s(self) -> Fraction:
-        return self._dt_s
-    def step(
-        self,
-        clock: SimulationClock,
-        datacenter: DatacenterBackend,
-        grid: GridBackend,
-        events: EventEmitter,
-    ) -> list[DatacenterCommand | GridCommand]:
-        t_now = clock.time_s
-        batch_changes: dict[str, int] = {}
-        ramp_rates: dict[str, float] = {}
-        for label, schedule in self._schedules.items():
-            entries = list(schedule)
-            idx = self._indices[label]
-            while idx < len(entries):
-                t_ev, change = entries[idx]
-                if float(t_ev) <= t_now + 1e-12:
-                    batch_changes[label] = change.batch_size
-                    if change.ramp_up_rate > 0:
-                        ramp_rates[label] = change.ramp_up_rate
-                    idx += 1
-                else:
-                    break
-            self._indices[label] = idx
-        if batch_changes:
-            events.emit(
-                "controller.batch_schedule.fired",
-                {"batch_size_by_model": batch_changes},
-            )
-            return [
-                SetBatchSize(
-                    batch_size_by_model=batch_changes,
-                    ramp_up_rate_by_model=ramp_rates,
-                )
-            ]
-        return []

openg2g/controller/noop.py DELETED Viewed

@@ -1,36 +0,0 @@
-"""No-op controller that does nothing."""
-from __future__ import annotations
-from fractions import Fraction
-from openg2g.clock import SimulationClock
-from openg2g.controller.base import Controller
-from openg2g.datacenter.base import DatacenterBackend
-from openg2g.datacenter.command import DatacenterCommand
-from openg2g.events import EventEmitter
-from openg2g.grid.base import GridBackend
-from openg2g.grid.command import GridCommand
-class NoopController(Controller[DatacenterBackend, GridBackend]):
-    """Controller that always returns an empty action."""
-    def __init__(self, dt_s: Fraction = Fraction(1)) -> None:
-        self._dt_s = dt_s
-    @property
-    def dt_s(self) -> Fraction:
-        return self._dt_s
-    def reset(self) -> None:
-        pass
-    def step(
-        self,
-        clock: SimulationClock,
-        datacenter: DatacenterBackend,
-        grid: GridBackend,
-        events: EventEmitter,
-    ) -> list[DatacenterCommand | GridCommand]:
-        return []

openg2g/controller/ofo.py DELETED Viewed

@@ -1,793 +0,0 @@
-"""Online Feedback Optimization (OFO) batch-size controller.
-Implements the primal-dual algorithm for joint voltage regulation and
-latency management via GPU batch size control.
-"""
-from __future__ import annotations
-import bisect
-import logging
-import math
-from fractions import Fraction
-from pathlib import Path
-from typing import Any
-import numpy as np
-import pandas as pd
-from mlenergy_data.modeling import LogisticModel
-from mlenergy_data.records import LLMRuns
-from pydantic import BaseModel, ConfigDict
-from openg2g.clock import SimulationClock
-from openg2g.controller.base import Controller
-from openg2g.datacenter.base import LLMBatchSizeControlledDatacenter, LLMDatacenterState
-from openg2g.datacenter.command import DatacenterCommand, SetBatchSize
-from openg2g.datacenter.config import InferenceModelSpec
-from openg2g.events import EventEmitter
-from openg2g.grid.command import GridCommand
-from openg2g.grid.opendss import OpenDSSGrid
-logger = logging.getLogger(__name__)
-class OFOConfig(BaseModel):
-    """Online Feedback Optimization tuning parameters.
-    Attributes:
-        primal_step_size: Primal descent step size ρ_x (Eq. 8).
-        w_throughput: Throughput weight in primal gradient.
-        w_switch: Switching cost regularizer weight γ (Eq. 4a).
-        voltage_gradient_scale: Scaling factor k_v for voltage dual term
-            in the primal gradient.
-        v_min: Lower voltage bound (pu).
-        v_max: Upper voltage bound (pu).
-        voltage_dual_step_size: Voltage dual ascent step size ρ_v (Eqs. 5-6).
-        latency_dual_step_size: Latency dual ascent step size ρ_l (Eq. 7).
-        sensitivity_update_interval: Steps between H-matrix re-estimation
-            (0 = only once at init).
-        sensitivity_perturbation_kw: Perturbation magnitude (kW) for
-            finite-difference sensitivity estimation.
-    """
-    model_config = ConfigDict(frozen=True)
-    # Primal
-    primal_step_size: float = 0.05
-    w_throughput: float = 0.1
-    w_switch: float = 0.0
-    voltage_gradient_scale: float = 1e6
-    # Dual
-    v_min: float = 0.95
-    v_max: float = 1.05
-    voltage_dual_step_size: float = 0.5
-    latency_dual_step_size: float = 1.0
-    # Sensitivity
-    sensitivity_update_interval: int = 0
-    sensitivity_perturbation_kw: float = 100.0
-class LogisticModelStore:
-    """Per-model logistic models for power, latency, and throughput.
-    Used by
-    [`OFOBatchSizeController`][openg2g.controller.ofo.OFOBatchSizeController]
-    to compute gradients of the Lagrangian with respect to batch size.
-    Attributes:
-        COL_MODEL_LABEL: Column name for model label in the CSV.
-        COL_METRIC: Column name for metric type in the CSV.
-    """
-    COL_MODEL_LABEL = "model_label"
-    COL_METRIC = "metric"
-    def __init__(
-        self,
-        power: dict[str, LogisticModel],
-        latency: dict[str, LogisticModel],
-        throughput: dict[str, LogisticModel],
-    ) -> None:
-        self._power = dict(power)
-        self._latency = dict(latency)
-        self._throughput = dict(throughput)
-        self._by_batch: dict[str, dict[int, list[tuple[float, float, float]]]] | None = None
-    def power(self, model: str) -> LogisticModel:
-        """Return the power logistic model for a model label."""
-        return self._power[model]
-    def latency(self, model: str) -> LogisticModel:
-        """Return the latency logistic model for a model label."""
-        return self._latency[model]
-    def throughput(self, model: str) -> LogisticModel:
-        """Return the throughput logistic model for a model label."""
-        return self._throughput[model]
-    @property
-    def power_fits(self) -> dict[str, LogisticModel]:
-        return dict(self._power)
-    @property
-    def latency_fits(self) -> dict[str, LogisticModel]:
-        return dict(self._latency)
-    @property
-    def throughput_fits(self) -> dict[str, LogisticModel]:
-        return dict(self._throughput)
-    @classmethod
-    def generate(
-        cls,
-        models: tuple[InferenceModelSpec, ...],
-        data_sources: dict[str, Any],
-        *,
-        runs: Any = None,
-        mlenergy_data_dir: Path | None = None,
-    ) -> LogisticModelStore:
-        """Generate logistic fits from ML.ENERGY benchmark data.
-        Args:
-            models: Model specifications.
-            data_sources: Per-model `MLEnergySource` instances, keyed by
-                `model_label`.
-            runs: Pre-loaded `LLMRuns` object. If `None`, loads from
-                `mlenergy_data_dir` or the HuggingFace Hub.
-            mlenergy_data_dir: Path to compiled mlenergy-data directory.
-                Ignored if `runs` is provided.
-        Returns:
-            A new `LogisticModelStore` with fitted logistic models.
-        """
-        if runs is None:
-            unique_tasks = {src.task for src in data_sources.values()}
-            if mlenergy_data_dir:
-                runs = LLMRuns.from_directory(str(mlenergy_data_dir), stable_only=False).task(*unique_tasks)
-            else:
-                runs = LLMRuns.from_hf(stable_only=False).task(*unique_tasks)
-        if not runs:
-            raise ValueError("No runs found for the specified tasks")
-        subsets_by_label: dict[str, Any] = {}
-        for ms in models:
-            src = data_sources.get(ms.model_label)
-            if src is None:
-                raise ValueError(f"No data source for model {ms.model_label!r}")
-            model_id = ms.model_id
-            if not model_id:
-                raise ValueError(f"model_id is required for data generation (model={ms.model_label!r})")
-            subset = (
-                runs.model_id(model_id).gpu_model(src.gpu).num_gpus(ms.gpus_per_replica).max_num_seqs(*src.batch_sizes)
-            )
-            if not subset:
-                raise ValueError(
-                    f"Config matched zero runs for logistic fits: model_id={model_id!r}, "
-                    f"gpu={src.gpu!r}, num_gpus={ms.gpus_per_replica}, "
-                    f"batch_sizes={src.batch_sizes}"
-                )
-            subsets_by_label[ms.model_label] = subset
-        all_by_batch: dict[str, dict[int, list[tuple[float, float, float]]]] = {}
-        power: dict[str, LogisticModel] = {}
-        latency: dict[str, LogisticModel] = {}
-        throughput: dict[str, LogisticModel] = {}
-        for model_label, group in subsets_by_label.items():
-            exclude = set(data_sources[model_label].fit_exclude_batch_sizes)
-            by_batch: dict[int, list[tuple[float, float, float]]] = {}
-            for r in group:
-                if r.max_num_seqs in exclude:
-                    continue
-                by_batch.setdefault(r.max_num_seqs, []).append(
-                    (r.avg_power_watts, r.mean_itl_ms / 1000.0, r.output_throughput_tokens_per_sec)
-                )
-            all_by_batch[model_label] = by_batch
-            batches = sorted(by_batch.keys())
-            if not batches:
-                continue
-            x = np.log2(np.array(batches, dtype=float).clip(min=1))
-            for _metric_name, idx, target in [
-                ("power", 0, power),
-                ("latency", 1, latency),
-                ("throughput", 2, throughput),
-            ]:
-                y = np.array([float(np.median([t[idx] for t in by_batch[b]])) for b in batches])
-                fit = LogisticModel.fit(x, y)
-                target[model_label] = fit
-        if not power and not latency and not throughput:
-            raise ValueError("No logistic fit rows produced")
-        store = cls(power=power, latency=latency, throughput=throughput)
-        store._by_batch = all_by_batch
-        return store
-    def save(self, csv_path: Path, *, plot: bool = False) -> None:
-        """Save logistic fits to a CSV.
-        Args:
-            csv_path: Output CSV path.
-            plot: If `True`, also write a logistic fits plot to the
-                same directory.
-        """
-        csv_path = Path(csv_path)
-        csv_path.parent.mkdir(parents=True, exist_ok=True)
-        rows: list[dict[str, Any]] = []
-        for metric_name, fits in [("power", self._power), ("latency", self._latency), ("throughput", self._throughput)]:
-            for label in sorted(fits):
-                model = fits[label]
-                rows.append(
-                    {
-                        self.COL_MODEL_LABEL: label,
-                        self.COL_METRIC: metric_name,
-                        "L": model.L,
-                        "x0": model.x0,
-                        "k": model.k,
-                        "b0": model.b0,
-                    }
-                )
-        pd.DataFrame(rows).to_csv(csv_path, index=False)
-        by_batch = getattr(self, "_by_batch", None)
-        if plot and by_batch is not None:
-            model_labels = sorted(self._power.keys())
-            _plot_logistic_fits(
-                by_batch,
-                self._power,
-                self._latency,
-                self._throughput,
-                model_labels,
-                csv_path.parent,
-            )
-    @classmethod
-    def load(cls, csv_path: Path | str) -> LogisticModelStore:
-        """Load power, latency, and throughput fits from a merged CSV.
-        Expected columns: `model_label`, `metric`, plus the logistic
-        model parameter columns (`L`, `x0`, `k`, `b0`).
-        The `metric` column must contain `power`, `latency`, or
-        `throughput` (case-insensitive).
-        Args:
-            csv_path: Path to the logistic fits CSV.
-        """
-        csv_path = Path(csv_path)
-        df = pd.read_csv(csv_path)
-        required_cols = [cls.COL_MODEL_LABEL, cls.COL_METRIC]
-        missing = [c for c in required_cols if c not in df.columns]
-        if missing:
-            raise ValueError(f"{csv_path} missing columns: {missing}. Got: {list(df.columns)}")
-        power: dict[str, LogisticModel] = {}
-        latency: dict[str, LogisticModel] = {}
-        throughput: dict[str, LogisticModel] = {}
-        targets = {"power": power, "latency": latency, "throughput": throughput}
-        for row in df.to_dict(orient="records"):
-            metric = str(row[cls.COL_METRIC]).strip().lower()
-            if metric in targets:
-                targets[metric][str(row[cls.COL_MODEL_LABEL])] = LogisticModel.from_dict(row)
-        if not power and not latency and not throughput:
-            raise ValueError(f"No logistic model rows loaded from {csv_path}")
-        return cls(power=power, latency=latency, throughput=throughput)
-    @classmethod
-    def ensure(
-        cls,
-        csv_path: Path,
-        models: tuple[InferenceModelSpec, ...] | None = None,
-        data_sources: dict[str, Any] | None = None,
-        *,
-        mlenergy_data_dir: Path | None = None,
-        plot: bool = False,
-    ) -> LogisticModelStore:
-        """Load from `csv_path`, generating first if needed.
-        Args:
-            csv_path: Path to the logistic fits CSV.
-            models: Model specifications. Required when no cached file exists.
-            data_sources: Per-model `MLEnergySource` instances, keyed by
-                `model_label`. Required when no cached file exists.
-            mlenergy_data_dir: Path to compiled mlenergy-data directory.
-            plot: If `True`, generate a logistic fits plot on generation.
-        """
-        csv_path = Path(csv_path)
-        if not csv_path.exists():
-            if models is None or data_sources is None:
-                raise ValueError("models and data_sources required for LogisticModelStore generation (no cached data)")
-            logger.info("Generating logistic fits to %s ...", csv_path)
-            cls.generate(models, data_sources, mlenergy_data_dir=mlenergy_data_dir).save(csv_path, plot=plot)
-        return cls.load(csv_path)
-class VoltageDualVariables:
-    """Per-bus-phase dual variables for the voltage box constraints.
-    Two non-negative vectors are kept, one for undervoltage and one for
-    overvoltage. Each call to `update` performs one projected gradient
-    ascent step:
-        dual_undervoltage  <- max(dual_undervoltage + rho_v * (v_min - v), 0)
-        dual_overvoltage   <- max(dual_overvoltage  + rho_v * (v - v_max), 0)
-    Args:
-        n_bus_phases: Length of the voltage vector (number of bus-phase pairs).
-        config: OFO configuration; `v_min`, `v_max` and
-            `voltage_dual_step_size` are read on every update.
-    """
-    def __init__(self, n_bus_phases: int, config: OFOConfig) -> None:
-        size = int(n_bus_phases)
-        self.config = config
-        # Undervoltage dual (lower-bound multiplier, G2G paper Eq. 5).
-        self.dual_undervoltage = np.zeros(size, dtype=float)
-        # Overvoltage dual (upper-bound multiplier, G2G paper Eq. 6).
-        self.dual_overvoltage = np.zeros(size, dtype=float)
-    def update(self, observed_voltages: np.ndarray) -> None:
-        """Take one projected ascent step on both dual vectors.
-        Args:
-            observed_voltages: Observed voltage magnitudes (pu); flattened
-                to shape `(n_bus_phases,)` before use.
-        Raises:
-            ValueError: If the flattened voltage vector and the duals
-                differ in length.
-        """
-        voltages = np.asarray(observed_voltages, float).reshape(-1)
-        n_observed = voltages.shape[0]
-        n_duals = self.dual_undervoltage.shape[0]
-        if n_observed != n_duals:
-            raise ValueError(f"observed_voltages has len {n_observed} but duals have len {n_duals}")
-        lower = float(self.config.v_min)
-        upper = float(self.config.v_max)
-        step = float(self.config.voltage_dual_step_size)
-        # Ascent step followed by projection onto the non-negative orthant.
-        raised_under = self.dual_undervoltage + step * (lower - voltages)
-        self.dual_undervoltage = np.maximum(raised_under, 0.0)
-        raised_over = self.dual_overvoltage + step * (voltages - upper)
-        self.dual_overvoltage = np.maximum(raised_over, 0.0)
-    def dual_difference(self) -> np.ndarray:
-        """Return overvoltage dual minus undervoltage dual (eta, Appendix B)."""
-        return self.dual_overvoltage - self.dual_undervoltage
-class PrimalBatchOptimizer:
-    """Primal batch-size optimizer operating in log2 space.
-    Maintains continuous state `x_i = log2(batch_i)` per model and applies
-    a gradient descent step using voltage duals, latency duals, and fitted
-    power/latency/throughput curves. The continuous state is clamped to the
-    range spanned by the feasible batch sizes and then snapped to the
-    nearest feasible size for the returned command.
-    Args:
-        models: Model specifications for each served model.
-        feasible_batch_sizes: Allowed batch sizes (union across all models).
-        power_fits: Per-model logistic fit for power vs log2(batch_size).
-        latency_fits: Per-model logistic fit for latency vs log2(batch_size).
-        throughput_fits: Per-model logistic fit for throughput vs
-            log2(batch_size).
-        config: OFO configuration (step size, throughput/switch weights,
-            voltage gradient scale).
-    Raises:
-        ValueError: If `feasible_batch_sizes` is empty.
-    """
-    def __init__(
-        self,
-        *,
-        models: list[InferenceModelSpec],
-        feasible_batch_sizes: list[int],
-        power_fits: dict[str, LogisticModel],
-        latency_fits: dict[str, LogisticModel],
-        throughput_fits: dict[str, LogisticModel],
-        config: OFOConfig,
-    ) -> None:
-        self.models = list(models)
-        # Deduplicate and sort: `_discretize_batch` relies on sorted order for bisect.
-        self.feasible_batch_sizes = sorted({int(b) for b in feasible_batch_sizes})
-        if not self.feasible_batch_sizes:
-            raise ValueError("feasible_batch_sizes cannot be empty.")
-        self.power_fits = power_fits
-        self.latency_fits = latency_fits
-        self.throughput_fits = throughput_fits
-        self.config = config
-        # Bounds of the continuous log2 state.
-        self.log_batch_size_min = math.log2(min(self.feasible_batch_sizes))
-        self.log_batch_size_max = math.log2(max(self.feasible_batch_sizes))
-        # Every model starts at the largest feasible batch size until
-        # `init_from_batches` overrides it.
-        self.log_batch_size_by_model: dict[str, float] = {
-            ms.model_label: float(self.log_batch_size_max) for ms in self.models
-        }
-        self.prev_log_batch_size_by_model: dict[str, float] = dict(self.log_batch_size_by_model)
-        # Per-model throughput normalization: r_i(x_max) for a single replica
-        self.throughput_max_by_model: dict[str, float] = {}
-        b_max = int(max(self.feasible_batch_sizes))
-        for ms in self.models:
-            label = ms.model_label
-            # Best-effort: a missing or failing fit degrades to NaN and is
-            # replaced by 1.0 below, so the later division is always safe.
-            try:
-                th_max = float(self.throughput_fits[label].eval(b_max))
-            except Exception:
-                th_max = float("nan")
-            if (not np.isfinite(th_max)) or (th_max <= 0.0):
-                th_max = 1.0
-            self.throughput_max_by_model[label] = th_max
-    def _clamp_log_batch_size(self, log_batch_size: float) -> float:
-        """Clamp a log2 batch size into `[log_batch_size_min, log_batch_size_max]`."""
-        return float(min(max(float(log_batch_size), self.log_batch_size_min), self.log_batch_size_max))
-    def _discretize_batch(self, log_batch_size: float) -> int:
-        """Return the feasible batch size closest to `2 ** log_batch_size`.
-        Distance is measured in linear batch-size units. On an exact tie the
-        smaller neighbor wins, because it is listed first in `candidates`.
-        """
-        b_cont = 2.0 ** float(log_batch_size)
-        idx = bisect.bisect_left(self.feasible_batch_sizes, b_cont)
-        # Neighbors on either side of the insertion point (at most two).
-        candidates = []
-        if idx > 0:
-            candidates.append(self.feasible_batch_sizes[idx - 1])
-        if idx < len(self.feasible_batch_sizes):
-            candidates.append(self.feasible_batch_sizes[idx])
-        return int(min(candidates, key=lambda bb: abs(bb - b_cont)))
-    def init_from_batches(self, batch_init: dict[str, int]) -> None:
-        """Initialize log-batch-size state from discrete batch sizes.
-        Models missing from `batch_init` start at the largest feasible batch
-        size. Both the current and the previous state are set, so the
-        switching term is zero on the first step.
-        """
-        for ms in self.models:
-            label = ms.model_label
-            b = int(batch_init.get(label, max(self.feasible_batch_sizes)))
-            # max(b, 1) guards log2 against zero or negative input.
-            log_batch_size = math.log2(max(b, 1))
-            log_batch_size = self._clamp_log_batch_size(log_batch_size)
-            self.log_batch_size_by_model[label] = float(log_batch_size)
-            self.prev_log_batch_size_by_model[label] = float(log_batch_size)
-    def step(
-        self,
-        *,
-        voltage_dual_diff: np.ndarray,
-        sensitivity_matrix: np.ndarray,
-        phase_share_by_model: dict[str, np.ndarray],
-        latency_dual_by_model: dict[str, float] | None = None,
-        replica_count_by_model: dict[str, float] | None = None,
-    ) -> dict[str, int]:
-        """Primal gradient descent step.
-        Updates the stored continuous state of every model in place and
-        returns the discretized result.
-        Args:
-            voltage_dual_diff: Voltage dual difference vector
-                (η = λ̄ − λ), shape `(n_bus_phases,)`.
-            sensitivity_matrix: Voltage sensitivity matrix (H = dv/dp),
-                shape `(n_bus_phases, 3)`.
-            phase_share_by_model: Per-model normalized phase share vectors,
-                shape `(3,)` each. A missing model gets an equal split.
-            latency_dual_by_model: Per-model latency dual variables (μ_i).
-                Missing, negative or non-finite values are treated as 0.
-            replica_count_by_model: Per-model active replica counts (w_i).
-                Missing, negative or non-finite values are treated as 0.
-        Returns:
-            Next batch sizes per model.
-        """
-        voltage_dual_diff = np.asarray(voltage_dual_diff, float).reshape(-1)
-        sensitivity_matrix = np.asarray(sensitivity_matrix, float)
-        # Copy the optional dicts so the caller's objects are never aliased.
-        latency_dual_by_model = {} if latency_dual_by_model is None else dict(latency_dual_by_model)
-        replica_count_by_model = {} if replica_count_by_model is None else dict(replica_count_by_model)
-        step_size = float(self.config.primal_step_size)  # ρ_x
-        w_throughput = float(self.config.w_throughput)
-        w_switch = float(self.config.w_switch)
-        voltage_gradient_scale = float(self.config.voltage_gradient_scale)
-        batch_next: dict[str, int] = {}
-        for ms in self.models:
-            label = ms.model_label
-            log_batch_size = float(self.log_batch_size_by_model[label])
-            prev_log_batch_size = float(self.prev_log_batch_size_by_model.get(label, log_batch_size))
-            replica_count = float(replica_count_by_model.get(label, 0.0))  # w_i
-            if (not np.isfinite(replica_count)) or (replica_count < 0.0):
-                replica_count = 0.0
-            phase_share = np.asarray(  # e_i (phase-allocation weight, p.7)
-                phase_share_by_model.get(label, np.array([1 / 3, 1 / 3, 1 / 3], dtype=float)),
-                float,
-            ).reshape(3)
-            # Renormalize so the shares sum to 1; fall back to an equal split
-            # when the sum is unusable.
-            s = float(np.sum(phase_share))
-            if (not np.isfinite(s)) or s <= 0.0:
-                phase_share = np.array([1 / 3, 1 / 3, 1 / 3], dtype=float)
-            else:
-                phase_share = phase_share / s
-            weighted_sensitivity = sensitivity_matrix @ phase_share  # H @ e_i
-            voltage_gradient = float(voltage_dual_diff @ weighted_sensitivity)
-            # Single-replica derivatives of the fitted curves at the current x.
-            dPdx_1 = float(self.power_fits[label].deriv_wrt_x(log_batch_size))
-            dLdx_1 = float(self.latency_fits[label].deriv_wrt_x(log_batch_size))
-            dThdx_1 = float(self.throughput_fits[label].deriv_wrt_x(log_batch_size))
-            # Presumably converts W to kW (the fits plot labels power in W) — TODO confirm units.
-            dPdx_1_kw = dPdx_1 / 1000.0
-            th_max = float(self.throughput_max_by_model.get(label, 1.0))
-            if (not np.isfinite(th_max)) or (th_max <= 0.0):
-                th_max = 1.0
-            dThdx_norm_1 = dThdx_1 / th_max
-            # Power and throughput scale with the replica count; latency does not.
-            dPdx = replica_count * dPdx_1_kw
-            dThdx = replica_count * dThdx_norm_1
-            dLdx = dLdx_1
-            latency_dual = float(latency_dual_by_model.get(label, 0.0))  # μ_i
-            if (not np.isfinite(latency_dual)) or (latency_dual < 0.0):
-                latency_dual = 0.0
-            # Gradient of the Lagrangian w.r.t. x_i = log2(batch_i).
-            # G2G paper Eq. 18: nabla_x L = -dR/dx (throughput)
-            #                              + 2*gamma*(x - x_prev) (switching)
-            #                              + eta^T H e_i dP/dx (voltage dual)
-            #                              + mu_i * dL/dx (latency dual)
-            # Implementation extensions: wT scaling on throughput,
-            #                            k_v scaling on voltage term
-            # NOTE(review): the switching term below uses w_switch with no
-            # explicit factor of 2 — presumably w_switch absorbs 2*gamma; confirm.
-            grad = 0.0
-            grad -= w_throughput * dThdx
-            grad += voltage_gradient_scale * voltage_gradient * dPdx
-            grad += latency_dual * dLdx
-            grad += w_switch * (log_batch_size - prev_log_batch_size)
-            new_log_batch_size = self._clamp_log_batch_size(log_batch_size - step_size * grad)
-            # "Previous" becomes the pre-update value, so the next step's
-            # switching term penalizes the size of the move made here.
-            self.prev_log_batch_size_by_model[label] = log_batch_size
-            self.log_batch_size_by_model[label] = new_log_batch_size
-            batch_next[label] = self._discretize_batch(new_log_batch_size)
-        return batch_next
-class OFOBatchSizeController(Controller[LLMBatchSizeControlledDatacenter[LLMDatacenterState], OpenDSSGrid]):
-    """Online Feedback Optimization controller for batch-size regulation.
-    Reads grid voltage and datacenter state, updates voltage and latency
-    duals, runs the primal batch-size optimizer, and returns new batch
-    sizes. Latency dual updates use [`dc_state.observed_itl_s_by_model`
-    ][openg2g.datacenter.base.LLMDatacenterState.observed_itl_s_by_model].
-    Args:
-        inference_models: Model specifications served in the datacenter.
-        models: Per-model logistic models for power, latency, and
-            throughput used in gradient computation.
-        config: Unified OFO tuning parameters. Defaults to `OFOConfig()`.
-        dt_s: Control interval (seconds).
-    Raises:
-        ValueError: If `inference_models` is empty, contains duplicate
-            labels, or a model lacks a power/latency/throughput fit.
-    """
-    def __init__(
-        self,
-        inference_models: tuple[InferenceModelSpec, ...],
-        *,
-        models: LogisticModelStore,
-        config: OFOConfig | None = None,
-        dt_s: Fraction = Fraction(1),
-    ) -> None:
-        if config is None:
-            config = OFOConfig()
-        if not inference_models:
-            raise ValueError("inference_models must not be empty.")
-        labels = [ms.model_label for ms in inference_models]
-        if len(labels) != len(set(labels)):
-            raise ValueError(f"Duplicate model labels: {labels}")
-        model_specs = list(inference_models)
-        # Fail fast: every served model needs all three fits in the store.
-        for ms in model_specs:
-            label = ms.model_label
-            for metric_name, accessor in [
-                ("power", models.power),
-                ("latency", models.latency),
-                ("throughput", models.throughput),
-            ]:
-                try:
-                    accessor(label)
-                except KeyError:
-                    raise ValueError(f"LogisticModelStore missing {metric_name} model for {label!r}.") from None
-        self._dt_s = dt_s
-        self._models = model_specs
-        self._config = config
-        self._itl_deadline_by_model = {ms.model_label: ms.itl_deadline_s for ms in model_specs}
-        # Created lazily in `step`, once the grid's voltage vector length is known.
-        self._voltage_dual: VoltageDualVariables | None = None
-        self._latency_dual_by_model: dict[str, float] = {ms.model_label: 0.0 for ms in model_specs}
-        # The optimizer works on the union of all models' feasible batch sizes.
-        # NOTE(review): discretization may therefore return a size that is in the
-        # union but not in a particular model's own set — confirm this is acceptable.
-        all_bs: set[int] = set()
-        for ms in model_specs:
-            all_bs.update(ms.feasible_batch_sizes)
-        feasible_batch_sizes = sorted(all_bs)
-        self._optimizer = PrimalBatchOptimizer(
-            models=model_specs,
-            feasible_batch_sizes=feasible_batch_sizes,
-            power_fits=models.power_fits,
-            latency_fits=models.latency_fits,
-            throughput_fits=models.throughput_fits,
-            config=config,
-        )
-        self._optimizer.init_from_batches({ms.model_label: ms.initial_batch_size for ms in model_specs})
-        # Estimated on the first `step` call and refreshed per the config interval.
-        self._sensitivity_matrix: np.ndarray | None = None
-        self._control_step_count: int = 0
-        logger.info(
-            "OFOBatchSizeController: %d models, dt=%s s, feasible_batches=%s",
-            len(model_specs),
-            dt_s,
-            feasible_batch_sizes,
-        )
-    def reset(self) -> None:
-        """Drop duals and the sensitivity estimate; restore initial batch sizes."""
-        self._voltage_dual = None
-        self._latency_dual_by_model = {ms.model_label: 0.0 for ms in self._models}
-        self._optimizer.init_from_batches({ms.model_label: ms.initial_batch_size for ms in self._models})
-        self._sensitivity_matrix = None
-        self._control_step_count = 0
-    @property
-    def dt_s(self) -> Fraction:
-        """Control interval in seconds."""
-        return self._dt_s
-    def step(
-        self,
-        clock: SimulationClock,
-        datacenter: LLMBatchSizeControlledDatacenter[LLMDatacenterState],
-        grid: OpenDSSGrid,
-        events: EventEmitter,
-    ) -> list[DatacenterCommand | GridCommand]:
-        """Run one OFO iteration and return a single `SetBatchSize` command.
-        Args:
-            clock: Simulation clock; only `time_s` is read, for logging.
-            datacenter: Source of `state` and `phase_share_by_model`.
-            grid: Source of voltages and the sensitivity estimate.
-            events: Receives one `"controller.ofo.step"` event per call.
-        Raises:
-            RuntimeError: If the datacenter state lacks replica counts or
-                observed ITL for any served model.
-        """
-        if self._voltage_dual is None:
-            self._voltage_dual = VoltageDualVariables(len(grid.v_index), self._config)
-        # 1. Re-estimate sensitivity if needed
-        # Always on the first call; afterwards every `sensitivity_update_interval`
-        # steps, or never again when the interval is <= 0.
-        if self._sensitivity_matrix is None or (
-            self._config.sensitivity_update_interval > 0
-            and self._control_step_count % self._config.sensitivity_update_interval == 0
-        ):
-            # The second return value of estimate_sensitivity is discarded.
-            self._sensitivity_matrix, _ = grid.estimate_sensitivity(self._config.sensitivity_perturbation_kw)
-        # 2. Update voltage duals from grid state
-        observed_voltages = grid.voltages_vector()
-        self._voltage_dual.update(observed_voltages)
-        voltage_dual_diff = self._voltage_dual.dual_difference()  # η = λ̄ − λ
-        # 3. Read observed latency from datacenter and update latency duals
-        dc_state = datacenter.state
-        missing_replicas = [
-            ms.model_label for ms in self._models if ms.model_label not in dc_state.active_replicas_by_model
-        ]
-        if missing_replicas:
-            miss = ", ".join(sorted(missing_replicas))
-            raise RuntimeError(
-                f"OFOBatchSizeController requires active_replicas_by_model for all models. Missing: {miss}."
-            )
-        missing_itl = [ms.model_label for ms in self._models if ms.model_label not in dc_state.observed_itl_s_by_model]
-        if missing_itl:
-            miss = ", ".join(sorted(missing_itl))
-            raise RuntimeError(
-                f"OFOBatchSizeController requires observed_itl_s_by_model for all models. Missing: {miss}."
-            )
-        for ms in self._models:
-            label = ms.model_label
-            num_replicas = max(int(dc_state.active_replicas_by_model[label]), 0)
-            observed_itl = float(dc_state.observed_itl_s_by_model[label])
-            if num_replicas <= 0:
-                logger.debug("Model %s has 0 replicas, skipping latency dual update", label)
-                # Forcing NaN routes this model into the hold branch below.
-                observed_itl = float("nan")
-            deadline = float(self._itl_deadline_by_model[label])
-            if np.isfinite(observed_itl):
-                # Projected ascent: the dual grows while ITL exceeds the deadline.
-                self._latency_dual_by_model[label] = max(
-                    self._latency_dual_by_model[label]
-                    + self._config.latency_dual_step_size * (observed_itl - deadline),
-                    0.0,
-                )
-            else:
-                # No usable observation: hold the dual, still clamped at zero.
-                self._latency_dual_by_model[label] = max(self._latency_dual_by_model[label], 0.0)
-        # 4. Compute replica counts
-        replica_count_by_model: dict[str, float] = {}
-        for ms in self._models:
-            label = ms.model_label
-            replica_count_by_model[label] = float(dc_state.active_replicas_by_model[label])
-        # 5. Primal update -> next batch sizes
-        batch_next = self._optimizer.step(
-            voltage_dual_diff=voltage_dual_diff,
-            sensitivity_matrix=self._sensitivity_matrix,
-            phase_share_by_model=datacenter.phase_share_by_model,
-            latency_dual_by_model=self._latency_dual_by_model,
-            replica_count_by_model=replica_count_by_model,
-        )
-        # Incremented before logging, so the logged step number is 1-based.
-        self._control_step_count += 1
-        logger.info(
-            "OFO step %d (t=%.1f s): batch=%s",
-            self._control_step_count,
-            clock.time_s,
-            batch_next,
-        )
-        events.emit(
-            "controller.ofo.step",
-            {
-                "batch_size_by_model": batch_next,
-                # Copy so later dual updates do not mutate the emitted payload.
-                "latency_dual_by_model": dict(self._latency_dual_by_model),
-            },
-        )
-        return [SetBatchSize(batch_size_by_model=batch_next)]
-def _plot_logistic_fits(
-    by_batch: dict[str, dict[int, list[tuple[float, float, float]]]],
-    power: dict[str, LogisticModel],
-    latency_fits: dict[str, LogisticModel],
-    throughput_fits: dict[str, LogisticModel],
-    model_labels: list[str],
-    out_dir: Path,
-) -> None:
-    """Plot 3x1 stacked logistic fits: power, latency, throughput.
-    Each panel shows, per model, scatter dots for the median of the
-    measured samples at every batch size and a smooth curve evaluated from
-    the fitted `LogisticModel`. The x axis is log2(batch size), shared
-    across panels. Saves to `out_dir / "logistic_fits.png"`, creating
-    parent directories as needed.
-    Args:
-        by_batch: Measured samples keyed by model label, then batch size.
-            Each sample is indexed 0/1/2 for power/latency/throughput.
-        power: Per-model power fits.
-        latency_fits: Per-model latency fits.
-        throughput_fits: Per-model throughput fits.
-        model_labels: Models to draw, in legend order.
-        out_dir: Directory that receives the PNG.
-    """
-    import matplotlib.pyplot as plt
-    metric_specs: list[tuple[str, int, dict[str, LogisticModel], str, str]] = [
-        ("power", 0, power, "W", "(a) Average GPU power consumption vs batch size"),
-        ("latency", 1, latency_fits, "s/token", "(b) Average inter-token latency vs batch size"),
-        ("throughput", 2, throughput_fits, "tokens/s", "(c) Average token throughput vs batch size"),
-    ]
-    # The measured x positions depend only on the batch sizes, not on the
-    # metric, so compute them once rather than once per panel.
-    measured: dict[str, tuple[list[int], np.ndarray]] = {}
-    for label in model_labels:
-        batches = sorted(by_batch.get(label, {}))
-        if batches:
-            measured[label] = (batches, np.log2(np.array(batches, dtype=float).clip(min=1)))
-    # Common x grid for the fitted curves; None when no model has any data.
-    xs: np.ndarray | None = None
-    if measured:
-        x_lo = min(float(np.min(x)) for _, x in measured.values())
-        x_hi = max(float(np.max(x)) for _, x in measured.values())
-        xs = np.linspace(x_lo, x_hi, 400)
-    fig, axes = plt.subplots(3, 1, figsize=(6.45, 5.2), dpi=300, sharex=True)
-    # pyplot keeps every open figure alive, so close it even when plotting
-    # or saving raises.
-    try:
-        for ax_idx, (ax, (_metric_name, val_idx, fits, ylabel, title)) in enumerate(
-            zip(axes, metric_specs, strict=True)
-        ):
-            if xs is not None:
-                for label in model_labels:
-                    if label not in measured or label not in fits:
-                        continue
-                    batches, x = measured[label]
-                    samples = by_batch[label]
-                    y = np.array([float(np.median([t[val_idx] for t in samples[b]])) for b in batches])
-                    fit = fits[label]
-                    ys_fit = np.array([fit.eval_x(float(xi)) for xi in xs])
-                    (line,) = ax.plot(xs, ys_fit, lw=1.8, label=label, zorder=2)
-                    # Reuse the curve's color so dots and line read as one series.
-                    ax.scatter(x, y, s=16.0, color=line.get_color(), zorder=3)
-            ax.set_title(title, fontsize=12, loc="center")
-            ax.set_ylabel(ylabel, fontsize=10)
-            ax.grid(True, alpha=0.25)
-            if xs is None:
-                # Empty panel: title, label and grid only.
-                continue
-            ax.tick_params(axis="both", labelsize=10)
-            if ax_idx == 2:
-                ax.legend(frameon=True, fontsize=9, loc="best")
-        axes[-1].set_xlabel(r"$\log_2(\mathrm{batch\ size})$", fontsize=10)
-        fig.tight_layout(pad=0.35, h_pad=0.6)
-        save_path = out_dir / "logistic_fits.png"
-        save_path.parent.mkdir(parents=True, exist_ok=True)
-        fig.savefig(save_path, bbox_inches="tight", pad_inches=0.02)
-    finally:
-        plt.close(fig)
-    logger.info("Saved logistic fits plot to %s", save_path)

openg2g/controller/tap_schedule.py DELETED Viewed

@@ -1,70 +0,0 @@
-"""Tap schedule controller: applies pre-defined regulator tap changes at specified times."""
-from __future__ import annotations
-from fractions import Fraction
-from openg2g.clock import SimulationClock
-from openg2g.controller.base import Controller
-from openg2g.datacenter.base import DatacenterBackend
-from openg2g.datacenter.command import DatacenterCommand
-from openg2g.events import EventEmitter
-from openg2g.grid.base import GridBackend
-from openg2g.grid.command import GridCommand, SetTaps
-from openg2g.grid.config import TapPosition, TapSchedule
-class TapScheduleController(Controller[DatacenterBackend, GridBackend]):
-    """Controller that replays a fixed list of regulator tap changes.
-    Args:
-        schedule: Tap schedule built via
-            [`TapPosition(...).at(t=...) | ...`][openg2g.grid.config.TapSchedule].
-        dt_s: Interval at which the schedule is polled (seconds).
-    """
-    def __init__(self, *, schedule: TapSchedule, dt_s: Fraction = Fraction(1)) -> None:
-        self._dt_s = dt_s
-        self._entries = list(schedule)
-        # Index of the next schedule entry that has not fired yet.
-        self._idx = 0
-    def reset(self) -> None:
-        """Rewind to the first schedule entry."""
-        self._idx = 0
-    @property
-    def dt_s(self) -> Fraction:
-        """Polling interval in seconds."""
-        return self._dt_s
-    def step(
-        self,
-        clock: SimulationClock,
-        datacenter: DatacenterBackend,
-        grid: GridBackend,
-        events: EventEmitter,
-    ) -> list[DatacenterCommand | GridCommand]:
-        """Fire every entry that is due and merge them into one command.
-        When several entries are due at once, a later entry overrides an
-        earlier one per phase; phases that no due entry sets stay `None`.
-        Returns:
-            A single-element list with a `SetTaps` command, or an empty
-            list when nothing is due or no phase was set.
-        """
-        now = clock.time_s
-        merged: dict[str, float | None] = {"a": None, "b": None, "c": None}
-        fired = False
-        while self._idx < len(self._entries):
-            due_at, position = self._entries[self._idx]
-            # Small tolerance so entries landing on a tick boundary still fire.
-            if float(due_at) > now + 1e-12:
-                break
-            for phase in ("a", "b", "c"):
-                value = getattr(position, phase)
-                if value is not None:
-                    merged[phase] = value
-            fired = True
-            self._idx += 1
-        if not fired or all(value is None for value in merged.values()):
-            return []
-        tap = TapPosition(a=merged["a"], b=merged["b"], c=merged["c"])
-        events.emit("controller.tap_schedule.fired", {"tap_position": tap})
-        return [SetTaps(tap_position=tap)]

openg2g/coordinator.py DELETED Viewed

@@ -1,269 +0,0 @@
-"""Central coordinator: multi-rate simulation loop."""
-from __future__ import annotations
-import logging
-import warnings
-from collections.abc import Sequence
-from dataclasses import dataclass, field
-from fractions import Fraction
-from typing import Any, Generic
-from openg2g.clock import SimulationClock
-from openg2g.common import ThreePhase
-from openg2g.controller.base import Controller
-from openg2g.datacenter.base import DatacenterBackend, DCStateT
-from openg2g.datacenter.command import DatacenterCommand
-from openg2g.events import EventEmitter, SimEvent
-from openg2g.grid.base import GridBackend, GridStateT, PhaseVoltages
-from openg2g.grid.command import GridCommand
-# Module-level logger, named after this module via `__name__`.
-logger = logging.getLogger(__name__)
-@dataclass
-class SimulationLog(Generic[DCStateT, GridStateT]):
-    """Container for everything recorded during one coordinator run.
-    Generic over the datacenter and grid state types. When constructed
-    via [`Coordinator.run`][..Coordinator.run], the type parameters are
-    inferred from the backends, giving typed access to backend-specific
-    state fields.
-    Attributes:
-        dc_states: Datacenter states, one per datacenter step.
-        grid_states: Grid states, one per grid step.
-        commands: Every command returned by a controller.
-        time_s: Simulation time of each grid step (seconds).
-        voltage_a_pu: Phase A voltage at the DC bus per grid step (pu).
-        voltage_b_pu: Phase B voltage at the DC bus per grid step (pu).
-        voltage_c_pu: Phase C voltage at the DC bus per grid step (pu).
-        events: Clock-stamped events emitted by any component.
-    """
-    dc_states: list[DCStateT] = field(default_factory=list)
-    grid_states: list[GridStateT] = field(default_factory=list)
-    commands: list[DatacenterCommand | GridCommand] = field(default_factory=list)
-    time_s: list[float] = field(default_factory=list)
-    voltage_a_pu: list[float] = field(default_factory=list)
-    voltage_b_pu: list[float] = field(default_factory=list)
-    voltage_c_pu: list[float] = field(default_factory=list)
-    events: list[SimEvent] = field(default_factory=list)
-    def record_datacenter(self, state: DCStateT) -> None:
-        """Store one datacenter state."""
-        self.dc_states.append(state)
-    def record_grid(self, state: GridStateT, *, dc_bus: str) -> None:
-        """Store one grid state together with the voltages at `dc_bus`.
-        If `dc_bus` is absent from `state.voltages`, NaN is recorded for
-        all three phases so the voltage lists stay aligned with `time_s`.
-        """
-        self.grid_states.append(state)
-        self.time_s.append(state.time_s)
-        if dc_bus in state.voltages:
-            bus_voltages = state.voltages[dc_bus]
-        else:
-            nan = float("nan")
-            bus_voltages = PhaseVoltages(a=nan, b=nan, c=nan)
-        self.voltage_a_pu.append(bus_voltages.a)
-        self.voltage_b_pu.append(bus_voltages.b)
-        self.voltage_c_pu.append(bus_voltages.c)
-    def record_commands(self, commands: list[DatacenterCommand | GridCommand]) -> None:
-        """Store the commands a controller issued during a tick."""
-        self.commands.extend(commands)
-    def emit(self, event: SimEvent) -> None:
-        """Event sink: store an event emitted by a component."""
-        self.events.append(event)
-def _gcd_fraction(a: Fraction, b: Fraction) -> Fraction:
-    """Return the greatest common divisor of two Fractions (Euclidean algorithm).
-    Signs are ignored. The result is the largest value that divides both
-    inputs an integer number of times; if one input is zero the other's
-    absolute value is returned.
-    """
-    dividend = abs(a)
-    divisor = abs(b)
-    while divisor != 0:
-        remainder = dividend % divisor
-        dividend, divisor = divisor, remainder
-    return dividend
-class Coordinator(Generic[DCStateT, GridStateT]):
-    """Multi-rate simulation coordinator.
-    Orchestrates datacenter, grid, and controller components at their
-    respective rates.  The base tick is the GCD of all component
-    periods.
-    Generic over datacenter and grid state types. The type parameters
-    are inferred from the backends and propagated to
-    [`SimulationLog`][..SimulationLog].
-    Args:
-        datacenter: Datacenter backend (offline or online).
-        grid: Grid simulator backend.
-        controllers: List of controllers, applied in order each tick.
-        total_duration_s: Total simulation duration (integer seconds).
-        dc_bus: Bus name for DC voltage logging.
-        live: If True, synchronize with wall-clock time.
-    """
-    def __init__(
-        self,
-        datacenter: DatacenterBackend[DCStateT],
-        grid: GridBackend[GridStateT],
-        controllers: Sequence[Controller[Any, Any]],
-        total_duration_s: int,
-        dc_bus: str,
-        live: bool = False,
-    ) -> None:
-        self.datacenter = datacenter
-        self.grid = grid
-        self.controllers = list(controllers)
-        self.total_duration_s = int(total_duration_s)
-        self.dc_bus = str(dc_bus)
-        # Compute tick as GCD of all component periods.
-        # Use the stored list from here on, so the argument is iterated only once.
-        periods = [datacenter.dt_s, grid.dt_s] + [c.dt_s for c in self.controllers]
-        tick = periods[0]
-        for p in periods[1:]:
-            tick = _gcd_fraction(tick, p)
-        logger.info("Coordinator will run with tick %f s", float(tick))
-        # Warn about potentially problematic dt configurations
-        if grid.dt_s < datacenter.dt_s:
-            warnings.warn(
-                f"dt_grid ({grid.dt_s}) < dt_dc ({datacenter.dt_s}): "
-                f"grid steps between DC steps will reuse the most recent DC power.",
-                stacklevel=2,
-            )
-        for ctrl in self.controllers:
-            if ctrl.dt_s < grid.dt_s:
-                warnings.warn(
-                    f"Controller {ctrl.__class__.__name__} dt_s ({ctrl.dt_s}) "
-                    f"< dt_grid ({grid.dt_s}): controller may read stale voltages.",
-                    stacklevel=2,
-                )
-        n_ticks_estimate = Fraction(self.total_duration_s) / tick
-        if n_ticks_estimate > 10_000_000:
-            warnings.warn(
-                f"Simulation will run {int(n_ticks_estimate)} ticks. This may be slow. Consider coarser time steps.",
-                stacklevel=2,
-            )
-        self.clock = SimulationClock(tick_s=tick, live=live)
-    def reset(self) -> None:
-        """Reset coordinator and all sub-components for a fresh run."""
-        self.clock.reset()
-        self.datacenter.do_reset()
-        self.grid.do_reset()
-        for ctrl in self.controllers:
-            ctrl.reset()
-    def start(self) -> None:
-        """Acquire resources on all sub-components."""
-        self.datacenter.start()
-        self.grid.start()
-        for ctrl in self.controllers:
-            ctrl.start()
-    def stop(self) -> None:
-        """Release resources on all sub-components (LIFO order)."""
-        for ctrl in reversed(self.controllers):
-            ctrl.stop()
-        self.grid.stop()
-        self.datacenter.stop()
-    def _validate_controller_compatibility(self) -> None:
-        """Check each controller against the configured backends.
-        Raises:
-            TypeError: If the datacenter or grid is not an instance of the
-                types a controller declares as compatible.
-        """
-        for ctrl in self.controllers:
-            sig = ctrl.__class__.compatibility_signature()
-            dc_types = ctrl.compatible_datacenter_types()
-            try:
-                dc_ok = isinstance(self.datacenter, dc_types)
-            except TypeError:
-                # The declared types are not usable with isinstance: skip this
-                # controller. NOTE(review): this also skips its grid check —
-                # confirm that is intended.
-                continue
-            if not dc_ok:
-                expected = " | ".join(t.__name__ for t in dc_types)
-                got = type(self.datacenter).__name__
-                raise TypeError(f"{ctrl.__class__.__name__} ({sig}) requires datacenter type {expected}, got {got}.")
-            grid_types = ctrl.compatible_grid_types()
-            try:
-                grid_ok = isinstance(self.grid, grid_types)
-            except TypeError:
-                continue
-            if not grid_ok:
-                expected = " | ".join(t.__name__ for t in grid_types)
-                got = type(self.grid).__name__
-                raise TypeError(f"{ctrl.__class__.__name__} ({sig}) requires grid type {expected}, got {got}.")
-    def run(self) -> SimulationLog[DCStateT, GridStateT]:
-        """Run the full simulation and return the log.
-        Each tick runs, in order: the datacenter step, the grid step and
-        every controller, each only when due at its own period. Controller
-        commands are applied to the backends immediately.
-        Raises:
-            TypeError: If a controller is incompatible with a backend.
-            ValueError: If `total_duration_s` is not an exact multiple of
-                the tick, or a controller returns an unsupported command.
-        """
-        log: SimulationLog[DCStateT, GridStateT] = SimulationLog()
-        dc_events = EventEmitter(self.clock, log, "datacenter")
-        grid_events = EventEmitter(self.clock, log, "grid")
-        controller_events = EventEmitter(self.clock, log, "controller")
-        self._validate_controller_compatibility()
-        # Validate the duration BEFORE acquiring resources: a failure after
-        # start() but outside the try/finally below would never reach stop().
-        # (Assumes clock.tick_s is not changed by reset() — it is fixed at construction.)
-        ratio = Fraction(self.total_duration_s) / self.clock.tick_s
-        if ratio.denominator != 1:
-            raise ValueError(
-                f"total_duration_s ({self.total_duration_s}) is not an exact multiple of tick_s ({self.clock.tick_s})"
-            )
-        n_ticks = int(ratio)
-        self.reset()
-        self.start()
-        # Datacenter power samples accumulated since the last grid step.
-        dc_buffer: list[ThreePhase] = []
-        logger.info(
-            "Starting simulation: %d s, tick=%s s, %d ticks, dt_dc=%s s, dt_grid=%s s, %d controller(s)",
-            self.total_duration_s,
-            self.clock.tick_s,
-            n_ticks,
-            self.datacenter.dt_s,
-            self.grid.dt_s,
-            len(self.controllers),
-        )
-        try:
-            for _ in range(n_ticks):
-                # 1. Datacenter step (if due)
-                if self.clock.is_due(self.datacenter.dt_s):
-                    dc_state = self.datacenter.do_step(self.clock, dc_events)
-                    dc_buffer.append(dc_state.power_w)
-                    log.record_datacenter(dc_state)
-                # 2. Grid step (if due). Pass full sub-trace since last grid step.
-                if self.clock.is_due(self.grid.dt_s):
-                    # Pass a copy: the buffer is cleared right after the call.
-                    grid_state = self.grid.do_step(self.clock, list(dc_buffer), grid_events)
-                    dc_buffer.clear()
-                    log.record_grid(grid_state, dc_bus=self.dc_bus)
-                # 3. Controllers (if due). In order, actions applied immediately.
-                for ctrl in self.controllers:
-                    if self.clock.is_due(ctrl.dt_s):
-                        commands = ctrl.step(self.clock, self.datacenter, self.grid, controller_events)
-                        for command in commands:
-                            if isinstance(command, DatacenterCommand):
-                                self.datacenter.apply_control(command, dc_events)
-                            elif isinstance(command, GridCommand):
-                                self.grid.apply_control(command, grid_events)
-                            else:
-                                raise ValueError(f"Unsupported command type: {type(command).__name__}")
-                        log.record_commands(commands)
-                self.clock.advance()
-        finally:
-            self.stop()
-        logger.info(
-            "Simulation complete: %d grid steps, %d DC steps, %d commands",
-            len(log.grid_states),
-            len(log.dc_states),
-            len(log.commands),
-        )
-        return log

openg2g/datacenter/__init__.py DELETED Viewed

@@ -1 +0,0 @@
-"""Datacenter backends for openg2g."""

openg2g/datacenter/base.py DELETED Viewed

@@ -1,183 +0,0 @@
-"""Abstract base class for datacenter backends and base state types."""
-from __future__ import annotations
-from abc import ABC, abstractmethod
-from dataclasses import dataclass, field
-from fractions import Fraction
-from typing import Generic, TypeVar, final
-import numpy as np
-from openg2g.clock import SimulationClock
-from openg2g.common import ThreePhase
-from openg2g.datacenter.command import DatacenterCommand
-from openg2g.events import EventEmitter
-@dataclass(frozen=True)
-class DatacenterState:
-    """State emitted by a datacenter backend each timestep.
-    Contains only universally applicable fields. LLM-inference-specific
-    fields (batch sizes, replicas, latency) live on child classes like
-    [`LLMDatacenterState`][..LLMDatacenterState].
-    Attributes:
-        time_s: Simulation time in seconds.
-        power_w: Three-phase power in watts.
-    """
-    time_s: float
-    power_w: ThreePhase
-@dataclass(frozen=True)
-class LLMDatacenterState(DatacenterState):
-    """State from a datacenter serving LLM workloads.
-    Extends [`DatacenterState`][..DatacenterState] with per-model batch
-    size, replica count, and observed inter-token latency fields used
-    by LLM controllers.
-    Attributes:
-        batch_size_by_model: Current batch size per model label.
-        active_replicas_by_model: Number of active replicas per model.
-        observed_itl_s_by_model: Observed average inter-token latency
-            (seconds) per model. `NaN` if unavailable.
-    """
-    batch_size_by_model: dict[str, int] = field(default_factory=dict)
-    active_replicas_by_model: dict[str, int] = field(default_factory=dict)
-    observed_itl_s_by_model: dict[str, float] = field(default_factory=dict)
-DCStateT = TypeVar("DCStateT", bound=DatacenterState)
-class DatacenterBackend(Generic[DCStateT], ABC):
-    """Interface for datacenter power simulation backends."""
-    _INIT_SENTINEL = object()
-    def __init__(self) -> None:
-        self._state: DCStateT | None = None
-        self._history: list[DCStateT] = []
-        self._dc_base_init = DatacenterBackend._INIT_SENTINEL
-    def _check_base_init(self) -> None:
-        if getattr(self, "_dc_base_init", None) is not DatacenterBackend._INIT_SENTINEL:
-            raise TypeError(f"{type(self).__name__}.__init__ must call super().__init__() ")
-    @property
-    @abstractmethod
-    def dt_s(self) -> Fraction:
-        """Native timestep as a Fraction (seconds)."""
-    @final
-    @property
-    def state(self) -> DCStateT:
-        """Latest emitted state.
-        Raises:
-            RuntimeError: If accessed before the first `step()` call.
-        """
-        self._check_base_init()
-        if self._state is None:
-            raise RuntimeError(f"{type(self).__name__}.state accessed before first step().")
-        return self._state
-    @final
-    def history(self, n: int | None = None) -> list[DCStateT]:
-        """Return emitted state history (all, or latest `n`)."""
-        self._check_base_init()
-        if n is None:
-            return list(self._history)
-        if n <= 0:
-            return []
-        return list(self._history[-int(n) :])
-    @final
-    def do_step(self, clock: SimulationClock, events: EventEmitter) -> DCStateT:
-        """Call `step`, record the state, and return it.
-        Called by the coordinator. Subclasses should not override this.
-        """
-        self._check_base_init()
-        state = self.step(clock, events)
-        self._state = state
-        self._history.append(state)
-        return state
-    @abstractmethod
-    def step(self, clock: SimulationClock, events: EventEmitter) -> DCStateT:
-        """Advance one native timestep. Return state for this step."""
-    @abstractmethod
-    def apply_control(self, command: DatacenterCommand, events: EventEmitter) -> None:
-        """Apply one command. Takes effect on next step() call."""
-    @final
-    def do_reset(self) -> None:
-        """Clear history and call `reset`.
-        Called by the coordinator. Subclasses should not override this.
-        """
-        self._check_base_init()
-        self._state = None
-        self._history.clear()
-        self.reset()
-    @abstractmethod
-    def reset(self) -> None:
-        """Reset simulation state to initial conditions.
-        Called by the coordinator (via `do_reset`) before each
-        [`start`][..start]. Must clear all simulation state: counters,
-        RNG seeds, cached values. Configuration (dt_s, models,
-        templates) is not affected. History is cleared automatically
-        by `do_reset`.
-        Abstract so every implementation explicitly enumerates its state.
-        A forgotten field is a bug -- not clearing it silently corrupts
-        the second run.
-        """
-    def start(self) -> None:
-        """Acquire per-run resources (threads, solver circuits).
-        Called after [`reset`][..reset], before the simulation loop.
-        Override for backends that need resource acquisition (e.g.,
-        [`OpenDSSGrid`][openg2g.grid.opendss.OpenDSSGrid] compiles its
-        DSS circuit here). No-op by default because most offline
-        components have no resources to acquire.
-        """
-    def stop(self) -> None:
-        """Release per-run resources. Simulation state is preserved.
-        Called after the simulation loop in LIFO order. Override for
-        backends that acquired resources in [`start`][..start]. No-op
-        by default.
-        """
-class LLMBatchSizeControlledDatacenter(DatacenterBackend[DCStateT]):
-    """Datacenter that serves LLM workloads and supports batch-size control.
-    Marker layer between [`DatacenterBackend`][..DatacenterBackend] and
-    concrete implementations. Controllers that issue
-    [`SetBatchSize`][openg2g.datacenter.command.SetBatchSize] commands or read
-    `active_replicas_by_model` / `observed_itl_s_by_model`
-    from state should bind their generic to this class.
-    """
-    @property
-    def phase_share_by_model(self) -> dict[str, np.ndarray]:
-        """Per-model phase share vectors `[frac_A, frac_B, frac_C]`.
-        Returns an empty dict by default. Consumers treat missing keys
-        as uniform `[1/3, 1/3, 1/3]`. Override in subclasses that know
-        actual server-to-phase placement.
-        """
-        return {}

openg2g/datacenter/command.py DELETED Viewed

@@ -1,31 +0,0 @@
-"""Command types targeting datacenter backends."""
-from __future__ import annotations
-from dataclasses import dataclass, field
-class DatacenterCommand:
-    """Base for commands targeting the datacenter backend.
-    Subclass this for each concrete datacenter command kind.
-    The coordinator routes commands to backends based on this type hierarchy.
-    """
-    def __init__(self) -> None:
-        if type(self) is DatacenterCommand:
-            raise TypeError("DatacenterCommand cannot be instantiated directly; subclass it.")
-@dataclass(frozen=True)
-class SetBatchSize(DatacenterCommand):
-    """Set batch sizes for one or more models.
-    Attributes:
-        batch_size_by_model: Mapping of model label to target batch size.
-        ramp_up_rate_by_model: Per-model requests/second ramp-up rate.
-            Models not present get immediate changes (rate 0).
-    """
-    batch_size_by_model: dict[str, int]
-    ramp_up_rate_by_model: dict[str, float] = field(default_factory=dict)

openg2g/datacenter/config.py DELETED Viewed

@@ -1,342 +0,0 @@
-"""Datacenter facility and workload configuration."""
-from __future__ import annotations
-from collections.abc import Iterator
-from dataclasses import dataclass
-import numpy as np
-from pydantic import BaseModel, ConfigDict, model_validator
-from openg2g.datacenter.workloads.training import TrainingTrace
-class InferenceModelSpec(BaseModel):
-    """Specification for one LLM model served in the datacenter.
-    Attributes:
-        model_label: Human-readable model identifier (e.g. `"Llama-3.1-70B"`).
-        model_id: HuggingFace model ID (e.g. `"meta-llama/Llama-3.1-70B-Instruct"`).
-            Used for benchmark data lookups and online API model fields.
-        num_replicas: Total number of replicas of this model across the datacenter.
-        gpus_per_replica: GPUs allocated to each replica (determines model
-            parallelism and per-replica power draw).
-        initial_batch_size: Initial batch size for this model.
-        itl_deadline_s: Per-model inter-token latency deadline for the OFO
-            latency dual (seconds).
-        feasible_batch_sizes: Allowed batch sizes. Used by the OFO
-            controller for discretizing continuous batch-size updates
-            and by the online datacenter for load-generator sizing.
-            Defaults to `(initial_batch_size,)`.
-    """
-    model_config = ConfigDict(frozen=True)
-    model_label: str
-    model_id: str = ""
-    num_replicas: int
-    gpus_per_replica: int
-    initial_batch_size: int
-    itl_deadline_s: float
-    feasible_batch_sizes: tuple[int, ...] = ()
-    @model_validator(mode="after")
-    def _validate(self) -> InferenceModelSpec:
-        if not self.feasible_batch_sizes:
-            object.__setattr__(self, "feasible_batch_sizes", (self.initial_batch_size,))
-        elif self.initial_batch_size not in self.feasible_batch_sizes:
-            raise ValueError(
-                f"initial_batch_size ({self.initial_batch_size}) must be in "
-                f"feasible_batch_sizes ({self.feasible_batch_sizes})."
-            )
-        if self.num_replicas < 0:
-            raise ValueError(f"num_replicas must be >= 0, got {self.num_replicas}.")
-        if self.gpus_per_replica < 1:
-            raise ValueError(f"gpus_per_replica must be >= 1, got {self.gpus_per_replica}.")
-        if self.initial_batch_size <= 0:
-            raise ValueError(f"initial_batch_size must be > 0, got {self.initial_batch_size}.")
-        if self.itl_deadline_s <= 0:
-            raise ValueError(f"itl_deadline_s must be > 0, got {self.itl_deadline_s}.")
-        return self
-class TrainingRun:
-    """Training workload parameters.
-    The trace is eagerly rescaled so its peak matches `target_peak_W_per_gpu`.
-    Use `eval_power` to evaluate total training power at a given simulation time.
-    Combine with [`at`][.at] and `|` to build a [`TrainingSchedule`][..TrainingSchedule]:
-    ```python
-    schedule = (
-        TrainingRun(n_gpus=2400, trace=trace_a).at(t_start=1000, t_end=2000)
-        | TrainingRun(n_gpus=1200, trace=trace_b).at(t_start=2500, t_end=3500)
-    )
-    ```
-    Attributes:
-        n_gpus: Number of GPUs running the training workload.
-        trace: Single-GPU [`TrainingTrace`][openg2g.datacenter.workloads.training.TrainingTrace].
-        target_peak_W_per_gpu: The trace is rescaled so its peak equals this value.
-    """
-    __slots__ = ("_period", "_rescaled_power", "_trace_time", "n_gpus", "target_peak_W_per_gpu", "trace")
-    def __init__(self, *, n_gpus: int, trace: TrainingTrace, target_peak_W_per_gpu: float = 400.0) -> None:
-        if n_gpus <= 0:
-            raise ValueError(f"TrainingRun n_gpus must be > 0, got {n_gpus}.")
-        self.n_gpus = n_gpus
-        self.trace = trace
-        self.target_peak_W_per_gpu = target_peak_W_per_gpu
-        t = np.asarray(trace.t_s, float)
-        p = np.asarray(trace.power_w, float)
-        t = t - t[0]
-        period = float(t[-1] - t[0])
-        if period <= 0:
-            raise ValueError("Training trace time span must be positive.")
-        peak = float(np.max(p))
-        if peak <= 0:
-            raise ValueError("Training trace has non-positive peak; cannot scale.")
-        self._rescaled_power = p * (target_peak_W_per_gpu / peak)
-        self._trace_time = t
-        self._period = period
-    def eval_power(self, t: float, t_start: float, t_end: float) -> float:
-        """Evaluate total training power at simulation time `t`.
-        Returns zero if `t` is outside `[t_start, t_end]`.
-        Args:
-            t: Global simulation time (seconds).
-            t_start: Time when training becomes active (seconds).
-            t_end: Time when training stops (seconds).
-        Returns:
-            Total training power (W) across all `n_gpus` GPUs.
-        """
-        if t < t_start or t > t_end:
-            return 0.0
-        t_local = t - t_start
-        t_mod = t_local % self._period
-        p_1gpu = float(np.interp(t_mod, self._trace_time, self._rescaled_power))
-        return p_1gpu * self.n_gpus
-    def at(self, t_start: float, t_end: float) -> TrainingSchedule:
-        """Schedule this training run over `[t_start, t_end]`.
-        Args:
-            t_start: Global simulation time when training becomes active (seconds).
-            t_end: Global simulation time when training stops (seconds).
-        Returns:
-            A single-entry [`TrainingSchedule`][...TrainingSchedule].
-        """
-        if t_end < t_start:
-            raise ValueError(f"t_end ({t_end}) must be >= t_start ({t_start}).")
-        return TrainingSchedule(((self, float(t_start), float(t_end)),))
-class TrainingSchedule:
-    """Ordered collection of [`TrainingRun`][..TrainingRun] objects scheduled
-    over time windows.
-    Each entry is a `(TrainingRun, t_start, t_end)` tuple. Entries are
-    sorted by `t_start`.
-    Built with [`TrainingRun.at`][..TrainingRun.at] and `|`.
-    Example:
-    ```python
-    schedule = (
-        TrainingRun(n_gpus=2400, trace=trace_a).at(t_start=1000, t_end=2000)
-        | TrainingRun(n_gpus=1200, trace=trace_b).at(t_start=2500, t_end=3500)
-    )
-    ```
-    """
-    __slots__ = ("_entries",)
-    def __init__(self, entries: tuple[tuple[TrainingRun, float, float], ...] = ()) -> None:
-        self._entries = tuple(sorted(entries, key=lambda e: e[1]))
-    def __or__(self, other: TrainingSchedule) -> TrainingSchedule:
-        return TrainingSchedule((*self._entries, *other._entries))
-    def __iter__(self) -> Iterator[tuple[TrainingRun, float, float]]:
-        return iter(self._entries)
-    def __len__(self) -> int:
-        return len(self._entries)
-    def __bool__(self) -> bool:
-        return bool(self._entries)
-    def __repr__(self) -> str:
-        parts = [f"TrainingRun(n_gpus={r.n_gpus}).at(t_start={s}, t_end={e})" for r, s, e in self._entries]
-        return " | ".join(parts)
-@dataclass(frozen=True)
-class InferenceRamp:
-    """Inference server ramp parameters.
-    Transitions the active inference server fraction to `target`. Combine with
-    [`at`][.at] and `|` to build an [`InferenceRampSchedule`][..InferenceRampSchedule]:
-    ```python
-    ramps = (
-        InferenceRamp(target=0.2).at(t_start=2500, t_end=3000)
-        | InferenceRamp(target=1.0).at(t_start=3200, t_end=3400)
-    )
-    ```
-    Attributes:
-        target: Target active-server fraction after the ramp (0.0--1.0).
-    """
-    target: float
-    def __post_init__(self) -> None:
-        if not (0.0 <= self.target <= 1.0):
-            raise ValueError(f"InferenceRamp target must be in [0.0, 1.0], got {self.target}.")
-    def at(self, t_start: float, t_end: float) -> InferenceRampSchedule:
-        """Schedule this ramp over `[t_start, t_end]`.
-        Args:
-            t_start: Global simulation time when the ramp begins (seconds).
-            t_end: Global simulation time when the ramp ends (seconds).
-        Returns:
-            A single-entry [`InferenceRampSchedule`][...InferenceRampSchedule].
-        """
-        if t_end < t_start:
-            raise ValueError(f"t_end ({t_end}) must be >= t_start ({t_start}).")
-        return InferenceRampSchedule(((self, float(t_start), float(t_end)),))
-class InferenceRampSchedule:
-    """Ordered collection of [`InferenceRamp`][..InferenceRamp] events.
-    Each entry is an `(InferenceRamp, t_start, t_end)` tuple. Entries are
-    sorted by `t_start`.
-    Built with [`InferenceRamp.at`][..InferenceRamp.at] and `|`.
-    Semantics: before the first ramp, fraction = 1.0. During each
-    `[t_start, t_end]` window, the fraction linearly interpolates from
-    the previous level to `target`. Between ramps, the fraction holds
-    at the last target.
-    An empty schedule means all servers are active (fraction = 1.0)
-    at all times.
-    Example:
-    ```python
-    ramps = (
-        InferenceRamp(target=0.2).at(t_start=2500, t_end=3000)
-        | InferenceRamp(target=1.0).at(t_start=3200, t_end=3400)
-    )
-    ```
-    """
-    __slots__ = ("_entries",)
-    def __init__(self, entries: tuple[tuple[InferenceRamp, float, float], ...] = ()) -> None:
-        self._entries = tuple(sorted(entries, key=lambda e: e[1]))
-    def __or__(self, other: InferenceRampSchedule) -> InferenceRampSchedule:
-        return InferenceRampSchedule((*self._entries, *other._entries))
-    def __iter__(self) -> Iterator[tuple[InferenceRamp, float, float]]:
-        return iter(self._entries)
-    def __len__(self) -> int:
-        return len(self._entries)
-    def __bool__(self) -> bool:
-        return bool(self._entries)
-    def __repr__(self) -> str:
-        parts = [f"InferenceRamp(target={r.target}).at(t_start={s}, t_end={e})" for r, s, e in self._entries]
-        return " | ".join(parts)
-    def fraction_at(self, t: float | np.ndarray) -> float | np.ndarray:
-        """Evaluate the active inference server fraction at time(s) *t*.
-        Piecewise-linear interpolation between ramp events.
-        Before the first ramp, fraction = 1.0.
-        Args:
-            t: Scalar or array of global simulation times (seconds).
-        Returns:
-            Active-server fraction(s), same shape as *t*.
-        """
-        if isinstance(t, np.ndarray):
-            return self._fraction_array(t)
-        return float(self._fraction_scalar(float(t)))
-    def _fraction_scalar(self, t: float) -> float:
-        level = 1.0
-        for ramp, t_start, t_end in self._entries:
-            if t < t_start:
-                return level
-            if t <= t_end:
-                if t_end == t_start:
-                    return ramp.target
-                alpha = (t - t_start) / (t_end - t_start)
-                return level + (ramp.target - level) * alpha
-            level = ramp.target
-        return level
-    def _fraction_array(self, t: np.ndarray) -> np.ndarray:
-        vfunc = np.vectorize(self._fraction_scalar, otypes=[float])
-        return vfunc(t)
-class DatacenterConfig(BaseModel):
-    """Physical datacenter facility configuration.
-    Attributes:
-        gpus_per_server: Number of GPUs per physical server rack.
-        base_kw_per_phase: Constant base load per phase (kW).
-        power_factor: Power factor of the datacenter loads (lagging).
-    """
-    model_config = ConfigDict(frozen=True)
-    gpus_per_server: int = 8
-    base_kw_per_phase: float = 0.0
-    power_factor: float = 0.95
-    @model_validator(mode="after")
-    def _validate(self) -> DatacenterConfig:
-        if self.gpus_per_server < 1:
-            raise ValueError(f"gpus_per_server must be >= 1, got {self.gpus_per_server}.")
-        if not (0.0 < self.power_factor <= 1.0):
-            raise ValueError(f"power_factor must be in (0, 1], got {self.power_factor}.")
-        return self
-class PowerAugmentationConfig(BaseModel):
-    """Power augmentation settings for virtual server scaling.
-    Controls per-server amplitude jitter and additive noise applied during
-    power augmentation.
-    Attributes:
-        amplitude_scale_range: `(low, high)` range for per-server amplitude
-            scaling. Each virtual server draws a uniform multiplier from this range.
-        noise_fraction: Gaussian noise standard deviation as a fraction of
-            per-server power.
-    """
-    model_config = ConfigDict(frozen=True)
-    amplitude_scale_range: tuple[float, float] = (1.0, 1.0)
-    noise_fraction: float = 0.0

openg2g/datacenter/layout.py DELETED Viewed

@@ -1,126 +0,0 @@
-"""Server layout and activation policy primitives.
-Provides the topology and activation-policy building blocks used by
-datacenter backends. Power augmentation (scaling per-GPU power to
-three-phase datacenter power) lives in
-`openg2g.datacenter.workloads.inference`.
-"""
-from __future__ import annotations
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
-import numpy as np
-from openg2g.datacenter.config import InferenceRampSchedule
-class ActivationPolicy(ABC):
-    """Per-model activation policy that answers "which servers are active?"
-    Subclass to implement custom activation logic. The datacenter creates
-    one policy per model and passes it to
-    [`InferencePowerAugmenter`][openg2g.datacenter.workloads.inference.InferencePowerAugmenter].
-    """
-    @abstractmethod
-    def active_mask(self, t: float) -> np.ndarray:
-        """Boolean mask of active servers at time *t*.
-        Returns:
-            Array of shape `(num_servers,)` with `True` for active servers.
-        """
-    def active_indices(self, t: float) -> np.ndarray:
-        """Indices of active servers at time *t*.
-        The default implementation returns indices in ascending order
-        via `np.where(`[`active_mask`][..active_mask]`(t))`. Subclasses
-        may override to return
-        indices in a specific order (e.g., priority order) to control
-        floating-point summation order in the datacenter.
-        Returns:
-            1-D int array of active server indices.
-        """
-        return np.where(self.active_mask(t))[0]
-class RampActivationPolicy(ActivationPolicy):
-    """Activate servers by fixed random priority, following an
-    [`InferenceRampSchedule`][openg2g.datacenter.config.InferenceRampSchedule].
-    At time *t*, the top-*k* servers (by random priority) are active,
-    where `k = round(schedule.fraction_at(t) * num_servers)`.
-    This is the default policy used by
-    [`OfflineDatacenter`][openg2g.datacenter.offline.OfflineDatacenter].
-    Args:
-        schedule: Temporal ramp schedule mapping time to active-server fraction.
-        num_servers: Number of physical servers for this model.
-        rng: RNG for randomizing priority ordering. Consumed once at
-            construction time.
-    """
-    __slots__ = ("_n", "_priority", "_schedule")
-    def __init__(
-        self,
-        schedule: InferenceRampSchedule,
-        num_servers: int,
-        rng: np.random.Generator,
-    ) -> None:
-        self._schedule = schedule
-        self._n = num_servers
-        priority = np.arange(num_servers, dtype=int)
-        rng.shuffle(priority)
-        self._priority = priority
-    def active_mask(self, t: float) -> np.ndarray:
-        frac = self._schedule.fraction_at(t)
-        k = max(0, min(self._n, int(round(float(frac) * self._n))))
-        mask = np.zeros(self._n, dtype=bool)
-        mask[self._priority[:k]] = True
-        return mask
-    def active_indices(self, t: float) -> np.ndarray:
-        """Return active server indices in priority order."""
-        frac = self._schedule.fraction_at(t)
-        k = max(0, min(self._n, int(round(float(frac) * self._n))))
-        return self._priority[:k].copy()
-@dataclass
-class ServerLayout:
-    """Per-model server layout describing how GPUs are organized.
-    This describes the physical topology only. Activation policies (which
-    servers are on/off at a given time) are managed separately by the
-    datacenter and passed to
-    [`InferencePowerAugmenter`][openg2g.datacenter.workloads.inference.InferencePowerAugmenter]
-    alongside layouts.
-    Attributes:
-        num_servers: Number of physical servers for this model.
-        total_gpus: Total GPU count across all servers.
-        gpus_per_replica: GPUs per model replica.
-        gpus_per_server_list: GPU count per server (last may be partial).
-        phase_list: Phase assignment per server (0=A, 1=B, 2=C).
-        stagger_offsets: Per-server offsets for desynchronization. In offline
-            mode these are integer indices into a power template; in online
-            mode they can be float time offsets into a rolling buffer.
-        amplitude_scales: Per-server power multiplier for inter-server variation.
-        noise_fraction: Gaussian noise standard deviation as a fraction of
-            per-server power.
-    """
-    num_servers: int
-    total_gpus: int
-    gpus_per_replica: int
-    gpus_per_server_list: np.ndarray
-    phase_list: np.ndarray
-    stagger_offsets: np.ndarray
-    amplitude_scales: np.ndarray
-    noise_fraction: float

openg2g/datacenter/offline.py DELETED Viewed

@@ -1,320 +0,0 @@
-"""Offline (trace-based) datacenter backend."""
-from __future__ import annotations
-import functools
-import logging
-import math
-from dataclasses import dataclass, field
-from fractions import Fraction
-import numpy as np
-from openg2g.clock import SimulationClock
-from openg2g.common import ThreePhase
-from openg2g.datacenter.base import LLMBatchSizeControlledDatacenter, LLMDatacenterState
-from openg2g.datacenter.command import DatacenterCommand, SetBatchSize
-from openg2g.datacenter.config import (
-    DatacenterConfig,
-    InferenceRampSchedule,
-    PowerAugmentationConfig,
-    TrainingSchedule,
-)
-from openg2g.datacenter.layout import (
-    ActivationPolicy,
-    RampActivationPolicy,
-    ServerLayout,
-)
-from openg2g.datacenter.workloads.inference import InferenceData, InferencePowerAugmenter
-from openg2g.events import EventEmitter
-from openg2g.utils import split_integer_evenly
-logger = logging.getLogger(__name__)
-@dataclass(frozen=True)
-class OfflineDatacenterState(LLMDatacenterState):
-    """Extended state from the offline (trace-based) backend.
-    Adds per-model power breakdown to
-    [`LLMDatacenterState`][openg2g.datacenter.base.LLMDatacenterState].
-    """
-    power_by_model_w: dict[str, float] = field(default_factory=dict)
-@dataclass
-class OfflineWorkload:
-    """Complete offline simulation workload.
-    Bundles inference data with optional training overlays and inference
-    server ramp events.
-    Attributes:
-        inference_data: LLM inference workload with offline simulation
-            data (model specs, power templates, ITL fits).
-        inference_ramps: Inference server ramp schedule. `None` keeps all
-            servers active.
-        training: Training workload schedule. `None` disables training
-            overlay.
-    """
-    inference_data: InferenceData
-    inference_ramps: InferenceRampSchedule = field(default_factory=InferenceRampSchedule)
-    training: TrainingSchedule = field(default_factory=TrainingSchedule)
-class OfflineDatacenter(LLMBatchSizeControlledDatacenter[OfflineDatacenterState]):
-    """Trace-based datacenter simulation with step-by-step interface.
-    Each `step` call computes one timestep of power output by indexing
-    into pre-built per-GPU templates, applying per-server amplitude
-    scaling and noise, and summing across active servers per phase.
-    Batch size changes via `apply_control` take effect on the next
-    `step` call.
-    If `workload.inference_ramps` is set, a
-    [`RampActivationPolicy`][openg2g.datacenter.layout.RampActivationPolicy]
-    is created per model.
-    Args:
-        datacenter: Facility configuration (GPUs per server, base load).
-        workload: Offline workload configuration bundling inference data,
-            training overlays, and server ramp events.
-        dt_s: Simulation timestep (seconds).
-        seed: Random seed for layout generation, noise, and latency
-            sampling. Sub-seeds are derived deterministically.
-        power_augmentation: Per-server amplitude scaling and noise
-            settings.
-    """
-    def __init__(
-        self,
-        datacenter: DatacenterConfig,
-        workload: OfflineWorkload,
-        *,
-        dt_s: Fraction,
-        seed: int = 0,
-        power_augmentation: PowerAugmentationConfig | None = None,
-    ) -> None:
-        super().__init__()
-        if power_augmentation is None:
-            power_augmentation = PowerAugmentationConfig()
-        self._datacenter = datacenter
-        self._workload = workload
-        self._power_augmentation = power_augmentation
-        self._dt_s = dt_s
-        self._seed = int(seed)
-        self._models = list(workload.inference_data.models)
-        self._base_W_per_phase = float(datacenter.base_kw_per_phase) * 1e3
-        self._layout_rng = np.random.default_rng(self._seed)
-        self._batch_by_model: dict[str, int] = {ms.model_label: ms.initial_batch_size for ms in self._models}
-        self._layouts: dict[str, ServerLayout] = {}
-        self._policies: dict[str, ActivationPolicy] = {}
-        self._build_all_layouts()
-        self._inference_augmenter = InferencePowerAugmenter(
-            layouts=self._layouts,
-            policies=self._policies,
-            seed=self._seed + 12345,
-        )
-        self._global_step: int = 0
-        self._latency_rng = np.random.default_rng(self._seed + 54321)
-        logger.info(
-            "OfflineDatacenter: %d models, dt=%s s, seed=%d",
-            len(self._models),
-            dt_s,
-            seed,
-        )
-        for ms in self._models:
-            logger.info(
-                "  %s: %d replicas, %d GPUs/replica, batch=%d",
-                ms.model_label,
-                ms.num_replicas,
-                ms.gpus_per_replica,
-                ms.initial_batch_size,
-            )
-    @property
-    def dt_s(self) -> Fraction:
-        return self._dt_s
-    def step(self, clock: SimulationClock, events: EventEmitter) -> OfflineDatacenterState:
-        t_now = clock.time_s
-        template_store = self._workload.inference_data.power_templates
-        # Build per-GPU power dict by indexing into templates with layout offsets.
-        per_gpu_by_model: dict[str, np.ndarray] = {}
-        for ms in self._models:
-            label = ms.model_label
-            if ms.num_replicas <= 0:
-                continue
-            batch = int(self._batch_by_model[label])
-            layout = self._layouts[label]
-            template = template_store.template(label, batch)
-            indices = (self._global_step + layout.stagger_offsets) % len(template)
-            per_gpu_by_model[label] = template[indices]
-        inference_aug = self._inference_augmenter.augment(per_gpu_by_model, t_now)
-        power_by_model = dict(inference_aug.power_by_model_w)
-        active_replicas_by_model = dict(inference_aug.active_replicas_by_model)
-        for ms in self._models:
-            power_by_model.setdefault(ms.model_label, 0.0)
-            active_replicas_by_model.setdefault(ms.model_label, 0)
-        # This is where we accumulate power across workloads.
-        phase_power = np.array(
-            [
-                self._base_W_per_phase + inference_aug.power_w.a,
-                self._base_W_per_phase + inference_aug.power_w.b,
-                self._base_W_per_phase + inference_aug.power_w.c,
-            ]
-        )
-        # Training overlay
-        for run, t_start, t_end in self._workload.training:
-            training_power_w = run.eval_power(float(t_now), t_start, t_end)
-            phase_power += training_power_w / 3.0
-        # ITL sampling
-        itl_fits = self._workload.inference_data.itl_fits
-        observed_itl_s_by_model: dict[str, float] = {}
-        for ms in self._models:
-            label = ms.model_label
-            n_rep = active_replicas_by_model.get(label, 0)
-            if itl_fits is None or n_rep <= 0:
-                observed_itl_s_by_model[label] = float("nan")
-                continue
-            batch = int(self._batch_by_model[label])
-            observed_itl_s_by_model[label] = itl_fits.sample_avg(
-                model_label=label,
-                batch_size=batch,
-                n_replicas=n_rep,
-                rng=self._latency_rng,
-            )
-        state = OfflineDatacenterState(
-            time_s=float(t_now),
-            power_w=ThreePhase(
-                a=float(phase_power[0]),
-                b=float(phase_power[1]),
-                c=float(phase_power[2]),
-            ),
-            power_by_model_w=power_by_model,
-            active_replicas_by_model=active_replicas_by_model,
-            batch_size_by_model=dict(self._batch_by_model),
-            observed_itl_s_by_model=observed_itl_s_by_model,
-        )
-        self._global_step += 1
-        return state
-    @functools.singledispatchmethod
-    def apply_control(self, command: DatacenterCommand, events: EventEmitter) -> None:
-        """Fallback for control commands that have no registered handler.
-        Dispatch is on the runtime type of `command`; this body only runs when
-        no overload was registered for that type.
-        Raises:
-            TypeError: Always, naming the unsupported command type.
-        """
-        message = f"OfflineDatacenter does not support {type(command).__name__}"
-        raise TypeError(message)
-    @apply_control.register
-    def apply_control_set_batch_size(self, command: SetBatchSize, events: EventEmitter) -> None:
-        """Record new batch sizes. Changes take effect on the next step.
-        Every requested size is validated before any of them is stored, so a
-        rejected command leaves the current batch sizes untouched.
-        Args:
-            command: Requested per-model batch sizes. Its
-                `ramp_up_rate_by_model` must be empty.
-            events: Emitter that receives `datacenter.batch_size.updated`
-                carrying the full batch-size mapping after the update.
-        Raises:
-            ValueError: If a ramp-up rate is supplied, or any requested
-                batch size is not positive.
-        """
-        if command.ramp_up_rate_by_model:
-            raise ValueError(
-                f"OfflineDatacenter does not support ramp_up_rate_by_model (got {command.ramp_up_rate_by_model}). "
-                f"Batch size changes are always immediate in trace-based simulation."
-            )
-        # Validate first, apply second: raising in the middle of a combined
-        # loop would leave earlier models updated and later ones not.
-        validated: dict[str, int] = {}
-        for label, b in command.batch_size_by_model.items():
-            b_int = int(b)
-            if b_int <= 0:
-                raise ValueError(f"Batch size must be positive for model {label!r}, got {b_int}.")
-            validated[str(label)] = b_int
-        for label, b_int in validated.items():
-            old = self._batch_by_model.get(label)
-            self._batch_by_model[label] = b_int
-            if old != b_int:
-                logger.info("Batch size %s: %s -> %d", label, old, b_int)
-        events.emit(
-            "datacenter.batch_size.updated",
-            {"batch_size_by_model": dict(self._batch_by_model)},
-        )
-    def reset(self) -> None:
-        """Return the datacenter to its freshly constructed simulation state.
-        Zeroes the step counter, restores each model's initial batch size,
-        re-seeds the layout RNG from `self._seed`, rebuilds layouts and
-        activation policies, and recreates the power augmenter and the
-        latency RNG from fixed offsets of the same seed.
-        """
-        base_seed = self._seed
-        self._global_step = 0
-        initial_batches = {}
-        for ms in self._models:
-            initial_batches[ms.model_label] = ms.initial_batch_size
-        self._batch_by_model = initial_batches
-        # The layout RNG and the empty containers must be in place before
-        # _build_all_layouts() runs, because it reads and fills them.
-        self._layout_rng = np.random.default_rng(base_seed)
-        self._layouts = {}
-        self._policies = {}
-        self._build_all_layouts()
-        augmenter_seed = base_seed + 12345
-        latency_seed = base_seed + 54321
-        self._inference_augmenter = InferencePowerAugmenter(
-            layouts=self._layouts,
-            policies=self._policies,
-            seed=augmenter_seed,
-        )
-        self._latency_rng = np.random.default_rng(latency_seed)
-    def _build_all_layouts(self) -> None:
-        """Build layouts and activation policies for all models.
-        For every model with at least one replica, computes the number of
-        servers needed, assigns servers to the three phases, and draws
-        per-server stagger offsets and amplitude scales. Results are stored
-        in `self._policies` and `self._layouts`, keyed by model label.
-        Models with `num_replicas <= 0` get no entry.
-        NOTE: all random draws come from the shared `self._layout_rng` in a
-        fixed order (phase shuffle, policy, stagger offsets, amplitude
-        scales, model by model). Reordering them changes the layout produced
-        for a given seed.
-        """
-        schedule = self._workload.inference_ramps
-        rng = self._layout_rng
-        gpus_per_server = self._datacenter.gpus_per_server
-        amp_lo, amp_hi = self._power_augmentation.amplitude_scale_range
-        noise_fraction = self._power_augmentation.noise_fraction
-        template_store = self._workload.inference_data.power_templates
-        for ms in self._models:
-            if ms.num_replicas > 0:
-                # Template length is read from the first listed batch size only.
-                # NOTE(review): presumably all batch sizes of a model share one
-                # template length -- confirm against the template store.
-                any_batch = template_store.batch_sizes(ms.model_label)[0]
-                tpl_len = len(template_store.template(ms.model_label, any_batch))
-                # Round up so a partially filled last server is still counted.
-                num_servers = math.ceil(ms.num_replicas * ms.gpus_per_replica / gpus_per_server)
-                # Phase shuffle
-                sA, sB, sC = split_integer_evenly(num_servers, 3)
-                phase_list = np.asarray(([0] * sA) + ([1] * sB) + ([2] * sC), dtype=int)
-                rng.shuffle(phase_list)
-                # Policy dictates which servers are active at a given time.
-                self._policies[ms.model_label] = RampActivationPolicy(schedule, num_servers, rng)
-                # This offset determines for each server, how much to stagger its power template indexing.
-                # max(..., 1) keeps the exclusive upper bound valid for an empty template.
-                stagger_offsets = rng.integers(low=0, high=max(tpl_len, 1), size=num_servers)
-                # Amplitude scales
-                amplitude_scales = rng.uniform(amp_lo, amp_hi, size=num_servers)
-                total_gpus = ms.num_replicas * ms.gpus_per_replica
-                # Every server is full except possibly the last, which holds the remainder.
-                gpus_per_server_list = np.full(num_servers, gpus_per_server, dtype=int)
-                tail = total_gpus - (num_servers - 1) * gpus_per_server
-                gpus_per_server_list[-1] = int(tail) if tail > 0 else gpus_per_server
-                self._layouts[ms.model_label] = ServerLayout(
-                    num_servers=num_servers,
-                    total_gpus=total_gpus,
-                    gpus_per_replica=ms.gpus_per_replica,
-                    gpus_per_server_list=gpus_per_server_list,
-                    phase_list=phase_list,
-                    stagger_offsets=stagger_offsets,
-                    amplitude_scales=amplitude_scales,
-                    noise_fraction=noise_fraction,
-                )
-    @property
-    def phase_share_by_model(self) -> dict[str, np.ndarray]:
-        """Fraction of each model's servers placed on phases A, B and C.
-        Returns:
-            Mapping of model label to a 3-element array `[frac_A, frac_B, frac_C]`.
-                A layout with no servers yields an even one-third split.
-        """
-        def _share(layout) -> np.ndarray:
-            # minlength=3 keeps a slot for phases that received no server.
-            per_phase = np.bincount(layout.phase_list, minlength=3).astype(float)
-            n_servers = per_phase.sum()
-            if n_servers > 0:
-                return per_phase / n_servers
-            return np.array([1 / 3, 1 / 3, 1 / 3], dtype=float)
-        return {label: _share(layout) for label, layout in self._layouts.items()}

openg2g/datacenter/online.py DELETED Viewed

@@ -1,1196 +0,0 @@
-"""Online (live GPU) datacenter backend with power augmentation.
-Connects to real vLLM inference servers for load generation and ITL
-measurement, and to zeusd instances for live GPU power monitoring.
-Power readings from a small number of real GPUs are augmented to
-datacenter scale using the shared
-[`InferencePowerAugmenter`][openg2g.datacenter.workloads.inference.InferencePowerAugmenter]
-pipeline.
-Requires `pip install zeus aiohttp`.
-"""
-from __future__ import annotations
-import asyncio
-import collections
-import contextlib
-import functools
-import json
-import logging
-import math
-import re
-import threading
-import time
-import urllib.request
-from collections.abc import Sequence
-from dataclasses import dataclass, field
-from fractions import Fraction
-from pathlib import Path
-from typing import Any
-import aiohttp
-import numpy as np
-from pydantic import BaseModel, ConfigDict
-from zeus.monitor.power_streaming import PowerStreamingClient
-from zeus.utils.zeusd import ZeusdConfig
-from openg2g.clock import SimulationClock
-from openg2g.common import ThreePhase
-from openg2g.datacenter.base import LLMBatchSizeControlledDatacenter, LLMDatacenterState
-from openg2g.datacenter.command import DatacenterCommand, SetBatchSize
-from openg2g.datacenter.config import (
-    DatacenterConfig,
-    InferenceModelSpec,
-    InferenceRampSchedule,
-    PowerAugmentationConfig,
-)
-from openg2g.datacenter.layout import (
-    ActivationPolicy,
-    RampActivationPolicy,
-    ServerLayout,
-)
-from openg2g.datacenter.workloads.inference import (
-    InferencePowerAugmenter,
-    RequestStore,
-)
-from openg2g.events import EventEmitter
-from openg2g.utils import split_integer_evenly
-logger = logging.getLogger(__name__)
-@dataclass(frozen=True)
-class OnlineDatacenterState(LLMDatacenterState):
-    """Extended state from the online (live GPU) backend.
-    The base `power_w` field carries the augmented three-phase power (what
-    the grid sees). This subclass adds the measured (pre-augmentation)
-    breakdown for post-hoc analysis. Every field added here has a default
-    (zero power or an empty dict).
-    Attributes:
-        measured_power_w: Total measured three-phase power from real GPUs
-            (before augmentation), plus base load.
-        measured_power_w_by_model: Per-model total measured power from real
-            GPUs (watts).
-        augmented_power_w_by_model: Per-model augmented power (watts). This
-            is the power fed to the grid for each model after scaling up.
-        augmentation_factor_by_model: Per-model augmentation multiplier
-            (virtual replicas / real replicas).
-        prometheus_metrics_by_model: Per-model Prometheus metrics snapshot.
-            Keys are model labels, values are dicts with metric names like
-            `num_requests_running`, `num_requests_waiting`,
-            `kv_cache_usage_perc`, `num_preemptions_total`.
-    """
-    # default_factory gives each instance its own default object.
-    measured_power_w: ThreePhase = field(default_factory=lambda: ThreePhase(a=0.0, b=0.0, c=0.0))
-    measured_power_w_by_model: dict[str, float] = field(default_factory=dict)
-    augmented_power_w_by_model: dict[str, float] = field(default_factory=dict)
-    augmentation_factor_by_model: dict[str, float] = field(default_factory=dict)
-    prometheus_metrics_by_model: dict[str, dict[str, float]] = field(default_factory=dict)
-class GPUEndpointMapping(BaseModel):
-    """Associates one zeusd endpoint with the GPUs to monitor on it.
-    Attributes:
-        host: Hostname or IP of the zeusd instance.
-        port: TCP port of the zeusd instance.
-        gpu_indices: GPU device indices to monitor on this endpoint.
-    """
-    model_config = ConfigDict(frozen=True)
-    host: str
-    port: int = 4938
-    gpu_indices: tuple[int, ...] = (0,)
-    @property
-    def endpoint_key(self) -> str:
-        """Return the `host:port` key used by `PowerStreamingClient`."""
-        return ":".join((self.host, str(self.port)))
-class VLLMDeployment(BaseModel):
-    """One LLM model served by a vLLM server, plus its power-monitoring endpoints.
-    !!! Warning
-        The vLLM server must be a patched build that implements the
-        `POST /set_max_num_seqs` endpoint.
-    Combines a reusable
-    [`InferenceModelSpec`][openg2g.datacenter.config.InferenceModelSpec]
-    with the physical deployment. `spec.num_replicas` is the simulated
-    (augmented) replica count used for grid simulation; the real replica
-    count is computed from `gpu_endpoints` and `spec.gpus_per_replica`.
-    The current batch size (`max_num_seqs`) is tracked on the instance and
-    changed on the server through `set_batch_size()`.
-    Attributes:
-        spec: Model specification (shared with offline datacenter).
-        vllm_base_url: Base URL of the vLLM server (e.g. `http://node1:8000`).
-        gpu_endpoints: GPU endpoint mappings for power monitoring.
-        request_extra_body: Extra fields merged into every request dict
-            for this model (e.g. `chat_template_kwargs`).
-        batch_size: Current batch size (`max_num_seqs`). A value of 0 means
-            "not set" and is replaced by `spec.initial_batch_size`.
-    """
-    spec: InferenceModelSpec
-    vllm_base_url: str
-    gpu_endpoints: tuple[GPUEndpointMapping, ...] = ()
-    request_extra_body: dict[str, Any] | None = None
-    batch_size: int = 0
-    def model_post_init(self, __context: Any) -> None:
-        # An explicitly provided (non-zero) batch size is kept as given.
-        if self.batch_size != 0:
-            return
-        self.batch_size = self.spec.initial_batch_size
-    @property
-    def model_label(self) -> str:
-        return self.spec.model_label
-    @property
-    def num_real_gpus(self) -> int:
-        """Total number of real GPUs for this model across all endpoints."""
-        total = 0
-        for ep in self.gpu_endpoints:
-            total += len(ep.gpu_indices)
-        return total
-    @property
-    def num_real_replicas(self) -> int:
-        """Number of real replicas (real GPUs / GPUs per replica)."""
-        # Clamp the divisor so a zero or negative spec value cannot divide by zero.
-        gpus_per_replica = max(self.spec.gpus_per_replica, 1)
-        return self.num_real_gpus // gpus_per_replica
-    @property
-    def augmentation_factor(self) -> float:
-        """Ratio of simulated replicas to real replicas."""
-        real_replicas = max(self.num_real_replicas, 1)
-        return self.spec.num_replicas / real_replicas
-    def set_batch_size(self, batch_size: int, ramp_up_rate: float = 0.0) -> None:
-        """Push a new batch size to the vLLM server, then record it locally.
-        Issues `POST /set_max_num_seqs` against `vllm_base_url`. The local
-        `batch_size` is only updated after the request succeeded.
-        Args:
-            batch_size: New batch size (max_num_seqs) to set.
-            ramp_up_rate: Optional ramp-up rate for gradual increase; only
-                sent when greater than zero.
-        Raises:
-            Exception: Any failure of the HTTP request is logged and
-                re-raised unchanged.
-        """
-        previous = self.batch_size
-        url = f"{self.vllm_base_url}/set_max_num_seqs?max_num_seqs={batch_size}"
-        if ramp_up_rate > 0:
-            url += f"&ramp_up_rate={ramp_up_rate}"
-        try:
-            request = urllib.request.Request(url, method="POST", data=b"")
-            with urllib.request.urlopen(request, timeout=2.0) as resp:
-                if resp.status >= 400:
-                    raise RuntimeError(
-                        f"Failed to set batch size {batch_size} on {self.vllm_base_url}: HTTP {resp.status}"
-                    )
-        except Exception:
-            # logger.exception logs at ERROR level with the active traceback.
-            logger.exception(
-                "Failed to set batch size %d on %s (keeping old=%d)",
-                batch_size,
-                self.vllm_base_url,
-                previous,
-            )
-            raise
-        self.batch_size = batch_size
-        if previous == batch_size:
-            return
-        logger.info("Batch size %s: %d -> %d", self.model_label, previous, batch_size)
-class LiveServerConfig(BaseModel):
-    """Configuration for interacting with live vLLM servers.
-    Groups settings related to load generation, ITL measurement, and
-    Prometheus monitoring. The online counterpart of offline's
-    trace/template data. Instances are immutable (`frozen=True`).
-    Attributes:
-        requests_dir: Directory containing per-model JSONL request files
-            (e.g. `{model_label}.jsonl`). If `None`, a minimal fallback
-            request is used for each model.
-        prometheus_poll_interval_s: How often to poll vLLM /metrics for
-            request counts and saturation monitoring. Set to 0 to disable.
-        max_output_tokens: Token limit for generated load requests (used
-            by the fallback request when no JSONL requests are provided).
-        itl_window_s: Sliding window for ITL averaging (seconds).
-    """
-    model_config = ConfigDict(frozen=True)
-    # None selects the built-in fallback request instead of JSONL files.
-    requests_dir: Path | None = None
-    # Seconds between polls of the vLLM /metrics endpoint.
-    prometheus_poll_interval_s: float = 0.5
-    max_output_tokens: int = 512
-    # Seconds of recent ITL samples included in the average.
-    itl_window_s: float = 1.0
-STAGGER_BUFFER_S: float = 10.0
-"""Seconds of power history for temporal staggering.
-Also used as the stagger range when building
-[`ServerLayout`][openg2g.datacenter.layout.ServerLayout]
-(float offsets drawn from `[0, STAGGER_BUFFER_S)`).
-Not user-configurable. Patchable for testing via
-`openg2g.datacenter.online.STAGGER_BUFFER_S = ...`.
-"""
-def _check_vllm_health(base_url: str, timeout_s: float = 10.0) -> None:
-    """Verify a vLLM server is reachable via GET /health.
-    Args:
-        base_url: Base URL of the vLLM server (e.g. `http://node1:8000`).
-        timeout_s: HTTP timeout in seconds.
-    Raises:
-        RuntimeError: If the server is not reachable or unhealthy. Transport
-            errors are chained as the cause; a non-200 status is reported
-            once, with the status code in the message.
-    """
-    url = f"{base_url}/health"
-    try:
-        req = urllib.request.Request(url)
-        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
-            if resp.status != 200:
-                raise RuntimeError(f"vLLM health check failed: HTTP {resp.status} from {url}")
-    except RuntimeError:
-        # Our own status error from above: let it through untouched so the
-        # message is not wrapped in a second "health check failed" layer.
-        raise
-    except Exception as e:
-        raise RuntimeError(f"vLLM health check failed for {url}: {e}") from e
-def _check_vllm_model(base_url: str, expected_model: str, timeout_s: float = 10.0) -> None:
-    """Confirm via GET /v1/models that a vLLM server serves the expected model.
-    Args:
-        base_url: Base URL of the vLLM server.
-        expected_model: Model ID that must appear in the response.
-        timeout_s: HTTP timeout in seconds.
-    Raises:
-        RuntimeError: If the endpoint is unreachable, answers with a
-            non-200 status, returns an unparsable body, or does not list
-            `expected_model`.
-    """
-    url = f"{base_url}/v1/models"
-    try:
-        with urllib.request.urlopen(urllib.request.Request(url), timeout=timeout_s) as resp:
-            if resp.status != 200:
-                raise RuntimeError(f"vLLM model check failed: HTTP {resp.status} from {url}")
-            payload = json.loads(resp.read().decode())
-        served = []
-        for entry in payload.get("data", []):
-            served.append(entry["id"])
-        if expected_model not in served:
-            raise RuntimeError(f"vLLM at {base_url} serves {served}, expected '{expected_model}'")
-    except RuntimeError:
-        # Errors raised above already carry the final message.
-        raise
-    except Exception as e:
-        raise RuntimeError(f"vLLM model check failed for {url}: {e}") from e
-def _check_zeusd_health(host: str, port: int = 4938, timeout_s: float = 10.0) -> None:
-    """Confirm a zeusd instance answers GET /discover with HTTP 200.
-    Args:
-        host: Hostname of the zeusd instance.
-        port: TCP port.
-        timeout_s: HTTP timeout in seconds.
-    Raises:
-        RuntimeError: If the instance is unreachable or answers with a
-            non-200 status.
-    """
-    url = f"http://{host}:{port}/discover"
-    try:
-        with urllib.request.urlopen(urllib.request.Request(url), timeout=timeout_s) as resp:
-            healthy = resp.status == 200
-            if not healthy:
-                raise RuntimeError(f"zeusd health check failed: HTTP {resp.status} from {url}")
-    except RuntimeError:
-        # The status error above already carries the final message.
-        raise
-    except Exception as e:
-        raise RuntimeError(f"zeusd health check failed for {url}: {e}") from e
-# Matches one sample line, with or without a `{label="..."}` block. Groups 1/2
-# are name/rest-of-line for the labelled form, groups 3/4 for the bare form.
-_GAUGE_RE = re.compile(r"^([a-zA-Z_:][a-zA-Z0-9_:]*)\{.*?\}\s+(.+)$|^([a-zA-Z_:][a-zA-Z0-9_:]*)\s+(.+)$")
-# The only metrics extracted; every other metric name is ignored.
-_PROMETHEUS_METRICS = (
-    "vllm:num_requests_running",
-    "vllm:num_requests_waiting",
-    "vllm:num_preemptions_total",
-    "vllm:kv_cache_usage_perc",
-)
-def _parse_prometheus_text(text: str) -> dict[str, float]:
-    """Parse Prometheus text-format metrics and extract vLLM gauges.
-    Samples of the same metric that differ only in labels are summed.
-    A trailing timestamp after the value is tolerated and ignored. Lines
-    whose value is not a number are skipped.
-    Args:
-        text: Body of a `/metrics` response in Prometheus text format.
-    Returns:
-        Metric names (without the `vllm:` prefix) mapped to their summed
-        values. Metrics absent from `text` are absent from the result.
-    """
-    totals: dict[str, float] = {}
-    for line in text.splitlines():
-        line = line.strip()
-        if not line or line.startswith("#"):
-            continue
-        m = _GAUGE_RE.match(line)
-        if m is None:
-            continue
-        name = m.group(1) or m.group(3)
-        if name not in _PROMETHEUS_METRICS:
-            continue
-        # The text format is `name{labels} value [timestamp]`. Keep only the
-        # value token so a timestamped sample is not dropped by float().
-        value_token = (m.group(2) or m.group(4)).split(maxsplit=1)[0]
-        with contextlib.suppress(ValueError):
-            totals[name] = totals.get(name, 0.0) + float(value_token)
-    return {metric.removeprefix("vllm:"): totals[metric] for metric in _PROMETHEUS_METRICS if metric in totals}
-class _PrometheusPoller:
-    """Periodically scrapes each deployment's vLLM `/metrics` endpoint.
-    `run()` is a coroutine meant to be scheduled as an asyncio task; the
-    most recent parsed metrics per model are stored behind a lock and read
-    with `get_latest()`.
-    """
-    def __init__(
-        self,
-        deployments: Sequence[VLLMDeployment],
-        poll_interval_s: float = 0.5,
-    ) -> None:
-        self._lock = threading.Lock()
-        self._latest: dict[str, dict[str, float]] = {}
-        self._poll_interval_s = poll_interval_s
-        self._deployments = {}
-        for deployment in deployments:
-            self._deployments[deployment.model_label] = deployment
-    def get_latest(self) -> dict[str, dict[str, float]]:
-        """Return the latest metrics snapshot per model (thread-safe)."""
-        with self._lock:
-            snapshot = dict(self._latest)
-        return snapshot
-    async def run(self, stop_event: threading.Event) -> None:
-        """Poll loop. Call as an asyncio task."""
-        timeout = aiohttp.ClientTimeout(total=5.0)
-        async with aiohttp.ClientSession(timeout=timeout) as session:
-            while not stop_event.is_set():
-                for label, dep in self._deployments.items():
-                    await self._poll_one(session, label, dep.vllm_base_url)
-                await asyncio.sleep(self._poll_interval_s)
-    async def _poll_one(self, session: aiohttp.ClientSession, label: str, base_url: str) -> None:
-        """Fetch and store one model's metrics; failures are logged at debug level."""
-        url = f"{base_url}/metrics"
-        try:
-            async with session.get(url) as resp:
-                if resp.status != 200:
-                    return
-                text = await resp.text()
-                metrics = _parse_prometheus_text(text)
-                with self._lock:
-                    self._latest[label] = metrics
-        except Exception:
-            logger.debug("Prometheus poll failed for %s", label, exc_info=True)
-class _LoadGenerator:
-    """Background load generator that saturates vLLM servers and measures ITL.
-    Runs a daemon thread with an asyncio event loop. For each model, a
-    semaphore-gated producer loop cycles through pre-built request dicts
-    endlessly. The semaphore size is `2 * max(feasible_batch_sizes)`,
-    ensuring the vLLM queue never drains even at the largest batch size
-    the OFO controller can set. Per-token inter-token latency (ITL) is
-    measured from SSE chunk arrival times using `usage.completion_tokens`
-    increments; first-token latency (TTFT) is excluded from ITL samples.
-    ITL samples are kept only as long as the longest averaging window that
-    has been requested so far (initially `itl_window_s`), so memory use
-    stays bounded during long runs.
-    Args:
-        deployments: Deployments to generate load for.
-        request_store: Optional source of per-model request dicts. Models
-            without requests use a minimal fallback request.
-        max_output_tokens: Token limit used by the fallback request.
-        itl_window_s: Default averaging window for `get_observed_itl`.
-        prometheus_poller: Optional poller run inside the same event loop.
-    """
-    def __init__(
-        self,
-        deployments: Sequence[VLLMDeployment],
-        *,
-        request_store: RequestStore | None = None,
-        max_output_tokens: int = 512,
-        itl_window_s: float = 1.0,
-        prometheus_poller: _PrometheusPoller | None = None,
-    ) -> None:
-        self._deployments = {d.model_label: d for d in deployments}
-        self._requests: dict[str, list[dict]] = {}
-        if request_store is not None:
-            self._requests = dict(request_store.requests_by_model)
-        self._max_output_tokens = max_output_tokens
-        self._itl_window_s = itl_window_s
-        # Longest window requested so far; older samples are discarded.
-        self._retention_s = float(itl_window_s)
-        self._prometheus = prometheus_poller
-        self._lock = threading.Lock()
-        # Per model: (monotonic timestamp, ITL seconds), appended in time order.
-        self._itl_samples: dict[str, collections.deque[tuple[float, float]]] = {}
-        for d in deployments:
-            self._itl_samples[d.model_label] = collections.deque()
-        self._thread: threading.Thread | None = None
-        self._stop_event = threading.Event()
-        self._loop: asyncio.AbstractEventLoop | None = None
-    def start(self) -> None:
-        """Start the background thread.
-        Raises:
-            RuntimeError: If the generator was already started.
-        """
-        if self._thread is not None:
-            raise RuntimeError("LoadGenerator already started")
-        self._stop_event.clear()
-        self._thread = threading.Thread(
-            target=self._run_thread,
-            name="load-generator",
-            daemon=True,
-        )
-        self._thread.start()
-    def stop(self) -> None:
-        """Signal the background thread to stop and wait up to 10 s for it."""
-        self._stop_event.set()
-        # Snapshot the loop: the worker thread resets self._loop to None when
-        # it exits, which could otherwise happen between a check and its use.
-        loop = self._loop
-        if loop is not None:
-            # RuntimeError here means the worker already closed the loop.
-            with contextlib.suppress(RuntimeError):
-                loop.call_soon_threadsafe(loop.stop)
-        if self._thread is not None:
-            self._thread.join(timeout=10.0)
-            self._thread = None
-    def get_observed_itl(self, model_label: str, window_s: float | None = None) -> float:
-        """Return the windowed-average ITL for *model_label*, or NaN.
-        Args:
-            model_label: Model whose samples are averaged.
-            window_s: Averaging window in seconds; defaults to the
-                configured `itl_window_s`. Requesting a longer window also
-                extends how long future samples are retained.
-        Returns:
-            Mean ITL of the samples inside the window, or NaN when the
-            model is unknown or has no sample in the window.
-        """
-        if window_s is None:
-            window_s = self._itl_window_s
-        now = time.monotonic()
-        cutoff = now - window_s
-        with self._lock:
-            if window_s > self._retention_s:
-                self._retention_s = window_s
-            samples = self._itl_samples.get(model_label)
-            if not samples:
-                return float("nan")
-            # Retention is never shorter than this window, so pruning cannot
-            # remove a sample that the average below would have used.
-            self._prune_stale_locked(samples, now)
-            recent = [itl for ts, itl in samples if ts >= cutoff]
-        if not recent:
-            return float("nan")
-        return sum(recent) / len(recent)
-    def _prune_stale_locked(self, samples: collections.deque[tuple[float, float]], now_mono: float) -> None:
-        """Drop samples older than the retention horizon. Caller holds `self._lock`."""
-        horizon = now_mono - self._retention_s
-        # Samples are appended in timestamp order, so stale ones sit at the left end.
-        while samples and samples[0][0] < horizon:
-            samples.popleft()
-    def _run_thread(self) -> None:
-        """Thread entry point: run the async driver on a private event loop."""
-        self._loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(self._loop)
-        try:
-            self._loop.run_until_complete(self._run_async())
-        except Exception:
-            # stop() interrupts the loop on purpose; only report real crashes.
-            if not self._stop_event.is_set():
-                logger.exception("LoadGenerator thread crashed")
-        finally:
-            self._loop.close()
-            self._loop = None
-    async def _run_async(self) -> None:
-        """Start one producer per model (plus the poller), then wait for stop."""
-        tasks: list[asyncio.Task] = []
-        for label, dep in self._deployments.items():
-            tasks.append(asyncio.create_task(self._model_producer(label, dep)))
-        if self._prometheus is not None:
-            tasks.append(asyncio.create_task(self._prometheus.run(self._stop_event)))
-        while not self._stop_event.is_set():
-            await asyncio.sleep(0.1)
-        for t in tasks:
-            t.cancel()
-        await asyncio.gather(*tasks, return_exceptions=True)
-    async def _model_producer(self, label: str, dep: VLLMDeployment) -> None:
-        """Semaphore-gated loop that continuously submits requests for one model.
-        Cycles through the JSONL request list endlessly. The semaphore
-        limits in-flight requests to `2 * max(feasible_batch_sizes)`,
-        ensuring the vLLM server always has a non-empty queue.
-        """
-        max_batch = max(dep.spec.feasible_batch_sizes)
-        sem = asyncio.Semaphore(2 * max_batch)
-        requests = self._requests.get(label, [])
-        req_idx = 0
-        # Hold strong references so in-flight request tasks are not garbage collected.
-        active: set[asyncio.Task[None]] = set()
-        connector = aiohttp.TCPConnector(limit=0, ssl=False)
-        async with aiohttp.ClientSession(
-            timeout=aiohttp.ClientTimeout(total=300.0),
-            connector=connector,
-        ) as session:
-            while not self._stop_event.is_set():
-                # Each request task releases the semaphore when it finishes.
-                await sem.acquire()
-                if self._stop_event.is_set():
-                    break
-                if requests:
-                    request_dict = requests[req_idx % len(requests)]
-                    req_idx += 1
-                else:
-                    request_dict = self._default_request(dep)
-                task = asyncio.create_task(self._single_request(label, dep, request_dict, session, sem))
-                active.add(task)
-                task.add_done_callback(active.discard)
-    def _default_request(self, dep: VLLMDeployment) -> dict:
-        """Build a minimal fallback request dict."""
-        return {
-            "model": dep.spec.model_id,
-            "messages": [{"role": "user", "content": "Hello, how are you?"}],
-            "max_completion_tokens": self._max_output_tokens,
-        }
-    async def _single_request(
-        self,
-        label: str,
-        dep: VLLMDeployment,
-        request_dict: dict,
-        session: aiohttp.ClientSession,
-        sem: asyncio.Semaphore,
-    ) -> None:
-        """Send one streaming chat-completion request and measure decoding ITL.
-        Uses `usage.completion_tokens` increments to correctly handle
-        multi-token bundles. First-token samples (TTFT) are skipped;
-        only decoding-phase ITL is recorded. The semaphore is always
-        released when the request ends, whatever the outcome.
-        """
-        try:
-            url = f"{dep.vllm_base_url}/v1/chat/completions"
-            body = dict(request_dict)
-            body["stream"] = True
-            body["stream_options"] = {"include_usage": True, "continuous_usage_stats": True}
-            if "max_tokens" in body and "max_completion_tokens" not in body:
-                body["max_completion_tokens"] = body.pop("max_tokens")
-            current_completion_tokens = 0
-            most_recent_timestamp = time.perf_counter()
-            ttft_recorded = False
-            async with session.post(url, json=body) as response:
-                if response.status != 200:
-                    return
-                async for chunk_bytes in response.content:
-                    if self._stop_event.is_set():
-                        return
-                    chunk_bytes = chunk_bytes.strip()
-                    if not chunk_bytes:
-                        continue
-                    chunk_str = chunk_bytes.decode("utf-8")
-                    # Lines starting with ":" are SSE comments.
-                    if chunk_str.startswith(":"):
-                        continue
-                    data_str = chunk_str.removeprefix("data: ")
-                    if data_str == "[DONE]":
-                        break
-                    try:
-                        data = json.loads(data_str)
-                    except json.JSONDecodeError:
-                        continue
-                    usage = data.get("usage")
-                    completion_tokens = usage and usage.get("completion_tokens")
-                    if not completion_tokens:
-                        continue
-                    timestamp = time.perf_counter()
-                    if not ttft_recorded:
-                        ttft_recorded = True
-                        current_completion_tokens = completion_tokens
-                    else:
-                        itl = timestamp - most_recent_timestamp
-                        inc = completion_tokens - current_completion_tokens
-                        current_completion_tokens = completion_tokens
-                        now_mono = time.monotonic()
-                        with self._lock:
-                            samples = self._itl_samples[label]
-                            samples.append((now_mono, itl))
-                            # A chunk carrying several tokens counts as one gap
-                            # plus zero-latency samples for the extra tokens.
-                            for _ in range(max(inc - 1, 0)):
-                                samples.append((now_mono, 0.0))
-                            # Without pruning the deque grows for the whole run.
-                            self._prune_stale_locked(samples, now_mono)
-                    most_recent_timestamp = timestamp
-        except Exception:
-            if not self._stop_event.is_set():
-                logger.debug("Request to %s failed for %s", dep.vllm_base_url, label, exc_info=True)
-        finally:
-            sem.release()
-class _RollingPowerBuffer:
-    """Per-model rolling buffer of (timestamp, per_gpu_watts) readings.
-    Provides `sample_servers()` to look up historical per-GPU power at
-    different time offsets for each virtual server, enabling temporal
-    staggering of batch-size-change transients.
-    Args:
-        model_labels: Labels to create a buffer for.
-        max_samples: Maximum readings kept per model; the oldest reading is
-            dropped once the limit is reached.
-    """
-    def __init__(self, model_labels: Sequence[str], max_samples: int = 10000) -> None:
-        self._buffers: dict[str, collections.deque[tuple[float, float]]] = {
-            label: collections.deque(maxlen=max_samples) for label in model_labels
-        }
-    def append(self, label: str, timestamp: float, per_gpu_w: float) -> None:
-        """Feed a new per-GPU power reading for a model.
-        Raises:
-            KeyError: If `label` was not given at construction.
-        """
-        self._buffers[label].append((timestamp, per_gpu_w))
-    def sample_servers(
-        self,
-        label: str,
-        now: float,
-        stagger_offsets: np.ndarray,
-    ) -> np.ndarray:
-        """Look up per-GPU power at `now - offset[i]` for each virtual server.
-        Args:
-            label: Model label.
-            now: Current wall-clock time (monotonic).
-            stagger_offsets: Per-server time offsets (seconds), shape `(N,)`.
-        Returns:
-            Array of shape `(N,)` with per-GPU power for each server. All
-            zeros when no reading has been recorded for the model yet.
-        """
-        buf = self._buffers[label]
-        if not buf:
-            return np.zeros(len(stagger_offsets), dtype=float)
-        return np.array([self._lookup(buf, now - offset) for offset in stagger_offsets], dtype=float)
-    def clear(self) -> None:
-        """Clear all buffers."""
-        for buf in self._buffers.values():
-            buf.clear()
-    @staticmethod
-    def _lookup(buf: collections.deque[tuple[float, float]], target_t: float) -> float:
-        """Find the power reading at or just before `target_t`.
-        Targets outside the buffered range are clamped to the oldest or
-        newest reading. An empty buffer yields 0.0.
-        """
-        if not buf:
-            return 0.0
-        oldest_t, oldest_w = buf[0]
-        if target_t <= oldest_t:
-            return oldest_w
-        newest_t, newest_w = buf[-1]
-        if target_t >= newest_t:
-            return newest_w
-        # Walk newest to oldest with the deque iterator; indexing into the
-        # middle of a deque costs O(n) per access.
-        for ts, watts in reversed(buf):
-            if ts <= target_t:
-                return watts
-        return oldest_w
-class OnlineDatacenter(LLMBatchSizeControlledDatacenter[OnlineDatacenterState]):
-    """Live GPU datacenter backend with power augmentation.
-    Dispatches inference load to vLLM servers, streams GPU power from
-    zeusd, measures ITL from streaming responses, and augments power
-    readings to datacenter scale using the shared
-    [`InferencePowerAugmenter`][openg2g.datacenter.workloads.inference.InferencePowerAugmenter]
-    pipeline (same as
-    [`OfflineDatacenter`][openg2g.datacenter.offline.OfflineDatacenter]).
-    Call [`start`][.start] before the first [`step`][.step] and
-    [`stop`][.stop] after the simulation loop finishes.
-    `PowerStreamingClient` is constructed internally from the GPU
-    endpoints declared in each deployment. Health checks are always
-    performed during [`start`][.start].
-    Args:
-        datacenter: Facility configuration (GPUs per server, base load).
-        deployments: Model deployments with physical hardware mapping.
-        dt_s: Simulation timestep (seconds).
-        seed: Random seed for layout generation and noise.
-        power_augmentation: Per-server amplitude scaling and noise
-            settings.
-        inference_ramps: Inference server ramp event(s). `None` keeps
-            all servers active.
-        live_server: Configuration for interacting with live vLLM
-            servers. Request data is loaded from
-            `LiveServerConfig.requests_dir`.
-    """
-    def __init__(
-        self,
-        datacenter: DatacenterConfig,
-        deployments: Sequence[VLLMDeployment],
-        *,
-        dt_s: Fraction = Fraction(1, 10),
-        seed: int = 0,
-        power_augmentation: PowerAugmentationConfig | None = None,
-        inference_ramps: InferenceRampSchedule | None = None,
-        live_server: LiveServerConfig | None = None,
-    ) -> None:
-        # Constructor arguments are described in the class docstring.
-        super().__init__()
-        # Omitted configs fall back to their default-constructed instances.
-        if power_augmentation is None:
-            power_augmentation = PowerAugmentationConfig()
-        if live_server is None:
-            live_server = LiveServerConfig()
-        self._dt_s = dt_s
-        self._seed = int(seed)
-        self._deployments = list(deployments)
-        self._deployment_map = {d.model_label: d for d in deployments}
-        self._datacenter_config = datacenter
-        self._power_augmentation = power_augmentation
-        self._live_server_config = live_server
-        # Base load is configured in kW per phase; keep it in watts.
-        self._base_W_per_phase = float(datacenter.base_kw_per_phase) * 1e3
-        self._inference_ramp_schedule = inference_ramps if inference_ramps is not None else InferenceRampSchedule()
-        # Build one ZeusdConfig per distinct endpoint key. GPU indices from
-        # every deployment that uses the endpoint are merged in first-seen
-        # order without duplicates, and the config is rebuilt each time the
-        # endpoint is encountered so it reflects the merged list.
-        servers_by_key: dict[str, ZeusdConfig] = {}
-        gpu_indices_by_key: dict[str, list[int]] = {}
-        for d in self._deployments:
-            for ep in d.gpu_endpoints:
-                key = ep.endpoint_key
-                if key not in gpu_indices_by_key:
-                    gpu_indices_by_key[key] = []
-                for idx in ep.gpu_indices:
-                    if idx not in gpu_indices_by_key[key]:
-                        gpu_indices_by_key[key].append(idx)
-                servers_by_key[key] = ZeusdConfig.tcp(
-                    ep.host,
-                    ep.port,
-                    gpu_indices=gpu_indices_by_key[key],
-                    cpu_indices=[],
-                )
-        self._power_client = PowerStreamingClient(servers=list(servers_by_key.values()))
-        # A poller is only created for a positive poll interval; otherwise
-        # `self._prometheus` stays None and saturation cannot be verified.
-        self._prometheus = (
-            _PrometheusPoller(
-                deployments,
-                poll_interval_s=live_server.prometheus_poll_interval_s,
-            )
-            if live_server.prometheus_poll_interval_s > 0
-            else None
-        )
-        # Request data is loaded only when `requests_dir` is set (truthy).
-        self._request_store = RequestStore.load(live_server.requests_dir) if live_server.requests_dir else None
-        self._load_gen = _LoadGenerator(
-            deployments,
-            request_store=self._request_store,
-            max_output_tokens=live_server.max_output_tokens,
-            itl_window_s=live_server.itl_window_s,
-            prometheus_poller=self._prometheus,
-        )
-        # Layout draws come from a generator seeded with `seed`; the augmenter
-        # below receives a different seed (`seed + 12345`).
-        self._layout_rng = np.random.default_rng(self._seed)
-        self._layouts: dict[str, ServerLayout] = {}
-        self._policies: dict[str, ActivationPolicy] = {}
-        self._build_all_layouts()
-        self._inference_augmenter = InferencePowerAugmenter(
-            layouts=self._layouts,
-            policies=self._policies,
-            seed=self._seed + 12345,
-        )
-        # Buffer capacity scales with the stagger window but never drops
-        # below 1000 samples per model.
-        self._rolling_buffer = _RollingPowerBuffer(
-            [d.model_label for d in deployments],
-            max_samples=max(int(STAGGER_BUFFER_S * 100), 1000),
-        )
-        self._started = False
-        logger.info(
-            "OnlineDatacenter: %d deployments, dt=%s s",
-            len(self._deployments),
-            dt_s,
-        )
-        # Per-deployment summary; a deployment without a layout reports 0
-        # virtual servers.
-        for d in deployments:
-            layout = self._layouts.get(d.model_label)
-            n_servers = layout.num_servers if layout else 0
-            logger.info(
-                "  %s: %d real GPUs, %d simulated replicas (%.0fx augmentation), %d virtual servers, vllm=%s",
-                d.model_label,
-                d.num_real_gpus,
-                d.spec.num_replicas,
-                d.augmentation_factor,
-                n_servers,
-                d.vllm_base_url,
-            )
-    def _build_all_layouts(self) -> None:
-        """Build ServerLayout and activation policies for each deployed model.
-        The RNG invocation order per model must be: phase shuffle,
-        priority shuffle, stagger offsets, amplitude scales. We
-        interleave policy construction between the phase shuffle
-        and stagger/amplitude draws to preserve this ordering.
-        Results are written into `self._layouts` and `self._policies`,
-        keyed by model label. Deployments whose spec has
-        `num_replicas <= 0` get neither a layout nor a policy.
-        """
-        schedule = self._inference_ramp_schedule
-        gpus_per_server = self._datacenter_config.gpus_per_server
-        rng = self._layout_rng
-        amp_lo, amp_hi = self._power_augmentation.amplitude_scale_range
-        noise_fraction = self._power_augmentation.noise_fraction
-        stagger_s = float(STAGGER_BUFFER_S)
-        for d in self._deployments:
-            spec = d.spec
-            if spec.num_replicas > 0:
-                # Enough servers to hold every replica's GPUs, rounded up.
-                num_servers = math.ceil(spec.num_replicas * spec.gpus_per_replica / gpus_per_server)
-                # Phase shuffle (consumes RNG)
-                # Servers are split as evenly as possible over phases 0/1/2
-                # and the assignment is then shuffled in place.
-                sA, sB, sC = split_integer_evenly(num_servers, 3)
-                phase_list = np.asarray(([0] * sA) + ([1] * sB) + ([2] * sC), dtype=int)
-                rng.shuffle(phase_list)
-                # Priority shuffle (consumes RNG) — must happen here
-                self._policies[d.model_label] = RampActivationPolicy(
-                    schedule,
-                    num_servers,
-                    rng,
-                )
-                # Stagger offsets (consumes RNG) — float for online
-                # The upper bound is floored at 1e-9 so the range is never empty.
-                stagger_offsets = rng.uniform(0.0, max(stagger_s, 1e-9), size=num_servers)
-                # Amplitude scales (consumes RNG)
-                amplitude_scales = rng.uniform(amp_lo, amp_hi, size=num_servers)
-                total_gpus = spec.num_replicas * spec.gpus_per_replica
-                # Every server is full except the last, which takes whatever
-                # remains (or a full server if the remainder is not positive).
-                gpus_per_server_list = np.full(num_servers, gpus_per_server, dtype=int)
-                tail = total_gpus - (num_servers - 1) * gpus_per_server
-                gpus_per_server_list[-1] = int(tail) if tail > 0 else gpus_per_server
-                self._layouts[d.model_label] = ServerLayout(
-                    num_servers=num_servers,
-                    total_gpus=total_gpus,
-                    gpus_per_replica=spec.gpus_per_replica,
-                    gpus_per_server_list=gpus_per_server_list,
-                    phase_list=phase_list,
-                    stagger_offsets=stagger_offsets,
-                    amplitude_scales=amplitude_scales,
-                    noise_fraction=noise_fraction,
-                )
-    @property
-    def dt_s(self) -> Fraction:
-        """Timestep value stored in `self._dt_s` (read-only accessor)."""
-        return self._dt_s
-    @property
-    def phase_share_by_model(self) -> dict[str, np.ndarray]:
-        """Fraction of each model's servers assigned to each phase.
-        Returns:
-            Mapping from model label to an array of per-phase shares,
-            computed from the layout's `phase_list`. A layout with no
-            servers falls back to an even 1/3 split.
-        """
-        def _share(phase_list: np.ndarray) -> np.ndarray:
-            per_phase = np.bincount(phase_list, minlength=3).astype(float)
-            n_total = per_phase.sum()
-            if n_total > 0:
-                return per_phase / n_total
-            return np.array([1 / 3, 1 / 3, 1 / 3], dtype=float)
-        return {label: _share(layout.phase_list) for label, layout in self._layouts.items()}
-    def reset(self) -> None:
-        """Return the datacenter to a not-started state with fresh layouts.
-        Stops the load generator if the datacenter was started, replaces
-        it with a new one, re-seeds the layout RNG and rebuilds layouts,
-        policies and the augmenter, clears the rolling power buffer, and
-        restores each deployment's `batch_size` to its spec's
-        `initial_batch_size`.
-        """
-        live = self._live_server_config
-        if self._started:
-            self._load_gen.stop()
-        self._load_gen = _LoadGenerator(
-            self._deployments,
-            request_store=self._request_store,
-            max_output_tokens=live.max_output_tokens,
-            itl_window_s=live.itl_window_s,
-            prometheus_poller=self._prometheus,
-        )
-        # Same seed as at construction, so the layout draws start over.
-        self._layout_rng = np.random.default_rng(self._seed)
-        self._layouts, self._policies = {}, {}
-        self._build_all_layouts()
-        self._inference_augmenter = InferencePowerAugmenter(
-            layouts=self._layouts,
-            policies=self._policies,
-            seed=self._seed + 12345,
-        )
-        self._rolling_buffer.clear()
-        for dep in self._deployments:
-            dep.batch_size = dep.spec.initial_batch_size
-        self._started = False
-    def start(self) -> None:
-        """Start load generation, warm up servers, and fill the power buffer.
-        Sequence:
-            1. Run health checks on all vLLM servers and zeusd instances.
-            2. Wait up to 10 s for a power reading from every endpoint;
-               a timeout is only logged as a warning.
-            3. Set initial batch sizes on all vLLM servers.
-            4. Start load generation threads.
-            5. Warm up via `_warmup` (default arguments), which polls power
-               into the rolling buffer and raises if it times out.
-        Raises:
-            RuntimeError: If the datacenter has already been started.
-        """
-        if self._started:
-            raise RuntimeError("OnlineDatacenter already started")
-        logger.info("Starting OnlineDatacenter with %d deployments", len(self._deployments))
-        # Step 1: health checks
-        logger.info("Running health checks...")
-        for dep in self._deployments:
-            _check_vllm_health(dep.vllm_base_url)
-            _check_vllm_model(dep.vllm_base_url, dep.spec.model_id)
-            for endpoint in dep.gpu_endpoints:
-                _check_zeusd_health(endpoint.host, endpoint.port)
-        logger.info("All health checks passed")
-        # Step 2: wait until every endpoint has reported power at least once
-        expected_keys: set[str] = {ep.endpoint_key for dep in self._deployments for ep in dep.gpu_endpoints}
-        give_up_at = time.monotonic() + 10.0
-        while time.monotonic() < give_up_at:
-            if expected_keys.issubset(self._power_client.get_power().keys()):
-                logger.info("Power readings received from all %d endpoints", len(expected_keys))
-                break
-            time.sleep(0.5)
-        else:
-            # Deadline passed without a break: report what is still missing.
-            seen_keys = set(self._power_client.get_power().keys())
-            logger.warning("Timed out waiting for power readings from: %s", expected_keys - seen_keys)
-        # Step 3: push initial batch sizes to the vLLM servers
-        for dep in self._deployments:
-            dep.set_batch_size(dep.spec.initial_batch_size)
-        # Step 4: start load generation (and Prometheus poller)
-        self._load_gen.start()
-        logger.info("LoadGenerator started")
-        # Step 5: fill the power buffer and wait for server saturation
-        self._warmup()
-        self._started = True
-        logger.info("OnlineDatacenter ready")
-    def _poll_power_into_buffer(self) -> tuple[float, dict[str, float]]:
-        """Read GPU power from all endpoints and feed the rolling buffer.
-        For each deployment, averages the readings of the GPUs listed on
-        its endpoints (skipping endpoints and GPU indices with no reading)
-        and appends that average to the rolling buffer. A deployment with
-        no readings at all records `0.0`.
-        Returns:
-            Tuple of (monotonic timestamp, per-model average per-GPU watts).
-        """
-        timestamp = time.monotonic()
-        readings = self._power_client.get_power()
-        avg_w_by_model: dict[str, float] = {}
-        for dep in self._deployments:
-            watts_sum, gpu_count = 0.0, 0
-            for endpoint in dep.gpu_endpoints:
-                reading = readings.get(endpoint.endpoint_key)
-                if reading is not None:
-                    for gpu_idx in endpoint.gpu_indices:
-                        if gpu_idx in reading.gpu_power_w:
-                            watts_sum += reading.gpu_power_w[gpu_idx]
-                            gpu_count += 1
-            avg_w = watts_sum / gpu_count if gpu_count > 0 else 0.0
-            self._rolling_buffer.append(dep.model_label, timestamp, avg_w)
-            avg_w_by_model[dep.model_label] = avg_w
-        return timestamp, avg_w_by_model
-    def _warmup(
-        self,
-        timeout_s: float = 60.0,
-        saturation_threshold: float = 0.95,
-        poll_interval_s: float = 0.1,
-    ) -> None:
-        """Fill the rolling power buffer and wait for vLLM server saturation.
-        Actively polls GPU power to fill the rolling buffer while monitoring
-        Prometheus `num_requests_running` to verify each model has reached
-        `saturation_threshold` of its `initial_batch_size`.
-        Completion requires both conditions for every model:
-            1. `num_requests_running >= saturation_threshold * initial_batch_size`
-            2. At least `STAGGER_BUFFER_S` has elapsed since that model first
-               reached saturation (so the buffer contains a full stagger
-               window of steady-state power data).
-        When Prometheus polling is disabled, saturation cannot be checked:
-        a single warning is logged and warmup completes once
-        `STAGGER_BUFFER_S` has elapsed.
-        Args:
-            timeout_s: Maximum warmup duration in seconds.
-            saturation_threshold: Fraction of `initial_batch_size` that
-                `num_requests_running` must reach (0.0-1.0).
-            poll_interval_s: Seconds between power polls.
-        Raises:
-            RuntimeError: If any model fails to saturate within `timeout_s`.
-                Includes the `num_requests_running` trajectory for failed
-                models. Also raised when Prometheus is disabled and the
-                buffer window does not fit within `timeout_s`.
-        """
-        stagger_s = STAGGER_BUFFER_S
-        logger.info(
-            "Warming up: waiting for server saturation (%.0f%% of initial_batch_size) "
-            "+ %.1f s buffer fill per model...",
-            saturation_threshold * 100,
-            stagger_s,
-        )
-        if self._prometheus is None:
-            # Logged once up front; inside the polling loop this warning was
-            # repeated on every poll for the whole buffer-fill window.
-            logger.warning(
-                "Prometheus polling is disabled; cannot verify server saturation. "
-                "Waiting %.1f s for power buffer only.",
-                stagger_s,
-            )
-        warmup_start = time.monotonic()
-        deadline = warmup_start + timeout_s
-        last_log = warmup_start
-        # Per-model (elapsed, num_requests_running) history for the error report.
-        trajectory: dict[str, list[tuple[float, float]]] = {d.model_label: [] for d in self._deployments}
-        # Monotonic time at which each model first reached its target, if ever.
-        saturation_time: dict[str, float | None] = {d.model_label: None for d in self._deployments}
-        while time.monotonic() < deadline:
-            now = time.monotonic()
-            elapsed = now - warmup_start
-            self._poll_power_into_buffer()
-            all_ready = True
-            if self._prometheus is not None:
-                prom = self._prometheus.get_latest()
-                for d in self._deployments:
-                    label = d.model_label
-                    running = prom.get(label, {}).get("num_requests_running", 0.0)
-                    trajectory[label].append((elapsed, running))
-                    target = d.spec.initial_batch_size * saturation_threshold
-                    if running >= target and saturation_time[label] is None:
-                        saturation_time[label] = now
-                        logger.info(
-                            "  %s saturated at t=%.1f s (num_requests_running=%.0f)",
-                            label,
-                            elapsed,
-                            running,
-                        )
-                    sat_t = saturation_time[label]
-                    # Ready only after a full stagger window since saturation.
-                    if sat_t is None or (now - sat_t) < stagger_s:
-                        all_ready = False
-            elif elapsed < stagger_s:
-                all_ready = False
-            if all_ready:
-                logger.info("Warmup complete in %.1f s", elapsed)
-                return
-            # Progress report at most every 10 s.
-            if now - last_log >= 10.0:
-                last_log = now
-                if self._prometheus is not None:
-                    prom = self._prometheus.get_latest()
-                    for d in self._deployments:
-                        label = d.model_label
-                        running = prom.get(label, {}).get("num_requests_running", 0.0)
-                        target = d.spec.initial_batch_size
-                        sat_t = saturation_time[label]
-                        buf_s = (now - sat_t) if sat_t is not None else 0.0
-                        logger.info(
-                            "  Warmup %s: num_requests_running=%.0f / %d (%.0f%%), buffer=%.1f / %.1f s",
-                            label,
-                            running,
-                            target,
-                            running / max(target, 1) * 100,
-                            buf_s,
-                            stagger_s,
-                        )
-            time.sleep(poll_interval_s)
-        # Deadline reached without completing: build the failure report.
-        if self._prometheus is None:
-            raise RuntimeError(
-                f"Warmup timed out after {timeout_s:.0f} s waiting for power buffer to fill ({stagger_s:.1f} s)"
-            )
-        prom = self._prometheus.get_latest()
-        failed: list[str] = []
-        for d in self._deployments:
-            label = d.model_label
-            running = prom.get(label, {}).get("num_requests_running", 0.0)
-            sat_t = saturation_time[label]
-            not_saturated = running < d.spec.initial_batch_size * saturation_threshold
-            not_buffered = sat_t is None or (time.monotonic() - sat_t) < stagger_s
-            if not_saturated or not_buffered:
-                failed.append(label)
-        parts = [
-            f"Warmup timed out after {timeout_s:.0f} s. "
-            f"Models that failed to reach {saturation_threshold:.0%} of initial_batch_size:",
-        ]
-        # Thin the trajectory to roughly one sample per 5 s. A non-positive
-        # poll interval must not raise ZeroDivisionError here and hide the
-        # timeout error, so fall back to keeping every sample.
-        step = max(1, int(5.0 / poll_interval_s)) if poll_interval_s > 0 else 1
-        for label in failed:
-            target = self._deployment_map[label].spec.initial_batch_size
-            traj = trajectory[label]
-            final = traj[-1][1] if traj else 0.0
-            parts.append(f"  {label} (target: {target}, reached: {final:.0f}):")
-            samples = traj[::step]
-            # Always include the final observation.
-            if traj and (not samples or samples[-1] is not traj[-1]):
-                samples.append(traj[-1])
-            entries = [f"t={t:.0f}s: {r:.0f}" for t, r in samples]
-            parts.append("    " + ", ".join(entries))
-        raise RuntimeError("\n".join(parts))
-    def stop(self) -> None:
-        """Stop the load generator, then the power streaming client, and mark the datacenter as not started."""
-        for component in (self._load_gen, self._power_client):
-            component.stop()
-        self._started = False
-        logger.info("OnlineDatacenter stopped")
-    def step(self, clock: SimulationClock, events: EventEmitter) -> OnlineDatacenterState:
-        """Read live power, augment to datacenter scale, and return state.
-        Args:
-            clock: Simulation clock; only `clock.time_s` is read.
-            events: Event emitter; not used by this method.
-        Returns:
-            State for this step, holding the augmented three-phase power
-            (plus base load), the measured power split evenly over the
-            three phases (plus base load), per-model batch sizes, observed
-            ITL, and the latest Prometheus metrics (empty when polling is
-            disabled).
-        """
-        now, avg_gpu_w = self._poll_power_into_buffer()
-        deployments = self._deployments
-        base_w = self._base_W_per_phase
-        # Measured power per model: average per-GPU watts times real GPU count.
-        measured_w = {d.model_label: avg_gpu_w.get(d.model_label, 0.0) * d.num_real_gpus for d in deployments}
-        aug_factor = {d.model_label: d.augmentation_factor for d in deployments}
-        # Per-server samples, only for models that have a layout.
-        sampled: dict[str, np.ndarray] = {
-            d.model_label: self._rolling_buffer.sample_servers(
-                d.model_label, now, self._layouts[d.model_label].stagger_offsets
-            )
-            for d in deployments
-            if d.model_label in self._layouts
-        }
-        augmented = self._inference_augmenter.augment(sampled, clock.time_s)
-        measured_per_phase = sum(measured_w.values()) / 3.0
-        itl_by_model: dict[str, float] = {}
-        for d in deployments:
-            itl_by_model[d.model_label] = self._load_gen.get_observed_itl(d.model_label)
-        prom_by_model: dict[str, dict[str, float]] = (
-            self._prometheus.get_latest() if self._prometheus is not None else {}
-        )
-        return OnlineDatacenterState(
-            time_s=clock.time_s,
-            power_w=ThreePhase(
-                a=base_w + augmented.power_w.a,
-                b=base_w + augmented.power_w.b,
-                c=base_w + augmented.power_w.c,
-            ),
-            batch_size_by_model={d.model_label: d.batch_size for d in deployments},
-            active_replicas_by_model=augmented.active_replicas_by_model,
-            observed_itl_s_by_model=itl_by_model,
-            measured_power_w=ThreePhase(
-                a=measured_per_phase + base_w,
-                b=measured_per_phase + base_w,
-                c=measured_per_phase + base_w,
-            ),
-            measured_power_w_by_model=measured_w,
-            augmented_power_w_by_model=augmented.power_by_model_w,
-            augmentation_factor_by_model=aug_factor,
-            prometheus_metrics_by_model=prom_by_model,
-        )
-    @functools.singledispatchmethod
-    def apply_control(self, command: DatacenterCommand, events: EventEmitter) -> None:
-        """Fallback for command types with no registered handler.
-        Raises:
-            TypeError: Always; the message names the unsupported command type.
-        """
-        command_kind = type(command).__name__
-        raise TypeError(f"OnlineDatacenter does not support {command_kind}")
-    @apply_control.register
-    def apply_control_set_batch_size(self, command: SetBatchSize, events: EventEmitter) -> None:
-        """Apply a batch size command through each deployment's `set_batch_size`.
-        Every requested size is validated before any deployment is touched,
-        so an invalid entry cannot leave the command partially applied.
-        Labels that do not match a known deployment are ignored.
-        Args:
-            command: Requested batch sizes (and optional ramp-up rates,
-                defaulting to 0.0) keyed by model label.
-            events: Receives a `datacenter.batch_size.updated` event with
-                the batch size of every deployment after the update.
-        Raises:
-            ValueError: If any requested batch size is not positive.
-        """
-        requested: list[tuple[str, int]] = []
-        for label, b in command.batch_size_by_model.items():
-            label = str(label)
-            b_int = int(b)
-            if b_int <= 0:
-                raise ValueError(f"Batch size must be positive for model {label!r}, got {b_int}.")
-            requested.append((label, b_int))
-        for label, b_int in requested:
-            dep = self._deployment_map.get(label)
-            if dep is not None:
-                dep.set_batch_size(b_int, ramp_up_rate=command.ramp_up_rate_by_model.get(label, 0.0))
-        events.emit(
-            "datacenter.batch_size.updated",
-            {"batch_size_by_model": {d.model_label: d.batch_size for d in self._deployments}},
-        )

openg2g/datacenter/workloads/__init__.py DELETED Viewed

@@ -1,4 +0,0 @@
-"""Datacenter workloads.
-LLM inference workloads and training workloads.
-"""

openg2g/datacenter/workloads/inference.py DELETED Viewed

@@ -1,1363 +0,0 @@
-"""Inference workload: power traces, templates, ITL fits, and augmentation."""
-from __future__ import annotations
-import json
-import logging
-from collections.abc import Sequence
-from dataclasses import dataclass, field
-from fractions import Fraction
-from pathlib import Path
-from typing import Any, cast
-import numpy as np
-import pandas as pd
-from mlenergy_data.modeling import ITLMixtureModel
-from mlenergy_data.records import LLMRuns
-from pydantic import BaseModel, ConfigDict
-import openg2g
-from openg2g.common import ThreePhase
-from openg2g.datacenter.config import InferenceModelSpec
-from openg2g.datacenter.layout import ActivationPolicy, ServerLayout
-logger = logging.getLogger(__name__)
-class MLEnergySource(BaseModel):
-    """Per-model ML.ENERGY benchmark data extraction settings.
-    Attributes:
-        model_label: Simulation label for the model.
-        task: Benchmark task name (e.g. `"lm-arena-chat"`, `"gpqa"`).
-        gpu: GPU model name (e.g. `"H100"`).
-        batch_sizes: Batch sizes to extract from the benchmark data.
-        fit_exclude_batch_sizes: Batch sizes to exclude from logistic
-            curve fitting (but still included in trace extraction).
-            Defaults to an empty tuple (nothing excluded).
-    """
-    # Frozen pydantic config: fields cannot be reassigned after creation.
-    model_config = ConfigDict(frozen=True)
-    model_label: str
-    task: str
-    gpu: str
-    batch_sizes: tuple[int, ...]
-    fit_exclude_batch_sizes: tuple[int, ...] = ()
-@dataclass(frozen=True)
-class InferenceTrace:
-    """One measured power trace.
-    Attributes:
-        t_s: Time vector (seconds); expected to be monotonically
-            increasing (not checked here).
-        power_w: Total power vector (watts) across all measured GPUs,
-            same length as `t_s`.
-        measured_gpus: Number of GPUs used in the measurement.
-    Raises:
-        ValueError: If the two vectors differ in length, the trace has
-            fewer than 5 samples, or `measured_gpus` is below 1.
-    """
-    t_s: np.ndarray
-    power_w: np.ndarray
-    measured_gpus: int
-    def __post_init__(self) -> None:
-        n_time, n_power = len(self.t_s), len(self.power_w)
-        if n_time != n_power:
-            raise ValueError(f"t_s and power_w must have the same length, got {n_time} and {n_power}")
-        if n_time < 5:
-            raise ValueError("Trace too short (need at least 5 samples).")
-        if self.measured_gpus < 1:
-            raise ValueError(f"measured_gpus must be >= 1, got {self.measured_gpus}")
-def _build_per_gpu_power_template(
-    trace: InferenceTrace,
-    *,
-    dt_s: Fraction | float,
-    duration_s: Fraction | float,
-    steady_skip_s: float = 0.0,
-) -> np.ndarray:
-    """Build a per-GPU power template over [0, duration_s] by periodic repetition.
-    Args:
-        trace: Source power trace (total power across measured GPUs).
-        dt_s: Simulation timestep in seconds; must be positive.
-        duration_s: Total simulation duration in seconds.
-        steady_skip_s: Skip this many seconds from the start of the trace
-            to avoid warm-up transients. The skip is only applied when
-            more than five samples remain afterwards.
-    Returns:
-        1-D array of non-negative per-GPU power values at each simulation
-        timestep (`ceil(duration_s / dt_s) + 1` entries).
-    Raises:
-        ValueError: If `dt_s` is not positive, or the (possibly trimmed)
-            trace spans a non-positive duration.
-    """
-    step_s = float(dt_s)
-    # `not >` also rejects NaN; previously dt_s == 0 surfaced as a bare
-    # ZeroDivisionError and a negative dt_s silently gave an empty grid.
-    if not step_s > 0.0:
-        raise ValueError(f"dt_s must be positive, got {dt_s}")
-    trace_t = np.asarray(trace.t_s, float)
-    # Total power -> per-GPU power, with negative readings clamped to zero.
-    p_per_gpu = np.clip(np.asarray(trace.power_w, float) / max(trace.measured_gpus, 1), 0.0, None)
-    if steady_skip_s > 0.0:
-        idx0 = np.searchsorted(trace_t, trace_t[0] + float(steady_skip_s))
-        if idx0 < trace_t.size - 5:
-            trace_t = trace_t[idx0:]
-            p_per_gpu = p_per_gpu[idx0:]
-    # Re-base the time axis so the (possibly trimmed) trace starts at zero.
-    trace_t = trace_t - trace_t[0]
-    period = float(trace_t[-1] - trace_t[0])
-    if period <= 0:
-        raise ValueError("Non-positive trace duration.")
-    n_steps = int(np.ceil(float(duration_s) / step_s)) + 1
-    t_grid = np.arange(n_steps, dtype=float) * step_s
-    # Wrap simulation time into one trace period, then interpolate.
-    t_mod = np.mod(t_grid, period)
-    template = np.interp(t_mod, trace_t, p_per_gpu, left=p_per_gpu[0], right=p_per_gpu[-1])
-    return np.clip(template, 0.0, None)
-class ITLFitStore:
-    """Store of ITL mixture distributions keyed by model label and batch size.
-    Provides:
-    - [`load`][.load]: read fits from a CSV produced by the data pipeline
-    - [`save`][.save]: write the fits back out as a CSV
-    - [`distributions`][.distributions]: access as a nested dict
-    - [`sample_avg`][.sample_avg]: sample a fleet-average ITL value
-    Attributes:
-        COL_MODEL_LABEL: Column name for model label in the CSV.
-        COL_BATCH_SIZE: Column name for batch size in the CSV.
-    """
-    COL_MODEL_LABEL = "model_label"
-    COL_BATCH_SIZE = "max_num_seqs"
-    def __init__(
-        self,
-        distributions: dict[str, dict[int, ITLMixtureModel]],
-        approx_sampling_thresh: int = 30,
-    ) -> None:
-        # Normalize keys: labels become `str`, batch sizes become `int`.
-        normalized: dict[str, dict[int, ITLMixtureModel]] = {}
-        for label, per_batch in distributions.items():
-            normalized[str(label)] = {int(batch): model for batch, model in per_batch.items()}
-        self._distributions = normalized
-        self._approx_sampling_thresh = int(approx_sampling_thresh)
-    @property
-    def distributions(self) -> dict[str, dict[int, ITLMixtureModel]]:
-        """Nested dict: `model_label -> batch_size -> ITLMixtureModel`."""
-        return self._distributions
-    def sample_avg(
-        self,
-        model_label: str,
-        batch_size: int,
-        n_replicas: int,
-        rng: np.random.Generator,
-    ) -> float:
-        """Sample a fleet-average ITL for the given model and batch size.
-        Delegates to the stored model's `sample_avg`, passing the
-        `approx_sampling_thresh` given at construction as `exact_threshold`.
-        Args:
-            model_label: Model label string.
-            batch_size: Current batch size.
-            n_replicas: Number of active replicas.
-            rng: NumPy random generator for sampling.
-        Returns:
-            Fleet-average ITL in seconds.
-        Raises:
-            KeyError: If model or batch size is not in the store.
-        """
-        per_batch = self._distributions.get(model_label)
-        if per_batch is None:
-            raise KeyError(f"No ITL distributions for model={model_label!r}")
-        mixture = per_batch.get(int(batch_size))
-        if mixture is None:
-            raise KeyError(
-                f"No ITL distributions for model={model_label!r}, batch={batch_size}. "
-                f"Available={sorted(per_batch.keys())}"
-            )
-        return mixture.sample_avg(
-            n_replicas=n_replicas,
-            rng=rng,
-            exact_threshold=self._approx_sampling_thresh,
-        )
-    @classmethod
-    def load(cls, csv_path: Path | str, approx_sampling_thresh: int = 30) -> ITLFitStore:
-        """Load ITL mixture fits from a CSV.
-        Expected columns: `model_label`, `max_num_seqs`, plus the
-        `itl_mix_*` parameter columns produced by
-        `ITLMixtureModel.to_dict()`. Each row is handed whole to
-        `ITLMixtureModel.from_dict`.
-        Args:
-            csv_path: Path to the latency fits CSV.
-            approx_sampling_thresh: Replica count above which sampling
-                uses a CLT normal approximation instead of drawing
-                individual samples.
-        Raises:
-            ValueError: If a required column is missing or the CSV has no rows.
-        """
-        csv_path = Path(csv_path)
-        frame = pd.read_csv(csv_path)
-        absent = [col for col in (cls.COL_MODEL_LABEL, cls.COL_BATCH_SIZE) if col not in frame.columns]
-        if absent:
-            raise ValueError(f"{csv_path} missing columns: {absent}. Got: {list(frame.columns)}")
-        by_label: dict[str, dict[int, ITLMixtureModel]] = {}
-        for record in frame.to_dict(orient="records"):
-            label = str(record[cls.COL_MODEL_LABEL]).strip()
-            batch = int(record[cls.COL_BATCH_SIZE])
-            by_label.setdefault(label, {})[batch] = ITLMixtureModel.from_dict(record)
-        if not by_label:
-            raise ValueError(f"No ITL mixture rows loaded from {csv_path}")
-        return cls(by_label, approx_sampling_thresh=approx_sampling_thresh)
-    def save(self, csv_path: Path) -> None:
-        """Save ITL mixture fits to a CSV, creating parent directories as needed.
-        Rows are written in sorted (label, batch size) order.
-        Args:
-            csv_path: Output CSV path.
-        """
-        csv_path = Path(csv_path)
-        csv_path.parent.mkdir(parents=True, exist_ok=True)
-        rows: list[dict[str, Any]] = [
-            {
-                self.COL_MODEL_LABEL: label,
-                self.COL_BATCH_SIZE: batch,
-                "itl_dist": "lognormal_mixture_2",
-                **{f"itl_mix_{k}": v for k, v in self._distributions[label][batch].to_dict().items()},
-            }
-            for label in sorted(self._distributions)
-            for batch in sorted(self._distributions[label])
-        ]
-        pd.DataFrame(rows).to_csv(csv_path, index=False)
-class InferenceTemplateStore:
-    """Per-GPU power templates already built for one simulation config.
-    Created by [`InferenceTraceStore.build_templates`][..InferenceTraceStore.build_templates].
-    Use [`template`][.template] to look up a template by model label and batch size.
-    """
-    def __init__(
-        self,
-        templates: dict[tuple[str, int], np.ndarray],
-        batch_sizes_by_model: dict[str, list[int]],
-    ) -> None:
-        # Both mappings are stored as given (no copy, no key normalization).
-        self._batch_sizes_by_model = batch_sizes_by_model
-        self._templates = templates
-    def template(self, model_label: str, batch_size: int) -> np.ndarray:
-        """Return the template stored under `(str(model_label), int(batch_size))`.
-        Raises:
-            KeyError: If no template exists for that pair.
-        """
-        key = (str(model_label), int(batch_size))
-        if key in self._templates:
-            return self._templates[key]
-        raise KeyError(f"No template for model={model_label!r}, batch={batch_size}.")
-    def batch_sizes(self, model_label: str) -> list[int]:
-        """Return a new list of the batch sizes available for a model.
-        Raises:
-            KeyError: If the model is unknown.
-        """
-        known = self._batch_sizes_by_model.get(model_label)
-        if known is not None:
-            return list(known)
-        raise KeyError(f"Unknown model: {model_label!r}")
class InferenceTraceStore:
    """Manages raw power traces loaded from CSV files.

    Indexed by `(model_label, batch_size)`. Provides:

    - [`load`][.load]: load traces discovered via a manifest CSV
    - [`build_templates`][.build_templates]: build per-GPU power
      templates for a specific simulation config, returning a
      [`InferenceTemplateStore`][..InferenceTemplateStore]
    - [`save`][.save]: write the traces and a manifest CSV to a directory

    Args:
        traces: Nested mapping `model_label -> batch_size -> trace`. Labels are
            coerced with `str()` and batch sizes with `int()`.
    """

    # Column names of the manifest CSV (`traces_summary.csv`).
    MANIFEST_COL_MODEL_LABEL = "model_label"
    MANIFEST_COL_NUM_GPUS = "num_gpus"
    MANIFEST_COL_BATCH_SIZE = "max_num_seqs"
    MANIFEST_COL_TRACE_FILE = "trace_file"
    # Column names of each individual trace CSV.
    TRACE_COL_TIME = "relative_time_s"
    TRACE_COL_POWER = "power_total_W"

    def __init__(self, traces: dict[str, dict[int, InferenceTrace]]) -> None:
        self._traces = {str(label): {int(b): tr for b, tr in per_batch.items()} for label, per_batch in traces.items()}

    @classmethod
    def load(cls, manifest: Path) -> InferenceTraceStore:
        """Load traces discovered via a manifest CSV.

        Trace file paths in the manifest are resolved relative to the
        manifest file's parent directory.

        Args:
            manifest: Path to the manifest CSV (e.g. `traces_summary.csv`).
                Expected columns: `model_label`, `num_gpus`, `max_num_seqs`,
                `trace_file`.

        Returns:
            A store holding one trace per manifest row. If several rows share
            the same `(model_label, max_num_seqs)`, the last one wins.

        Raises:
            ValueError: If the manifest or a trace file lacks required columns.
            FileNotFoundError: If a referenced trace file does not exist.
        """
        manifest = Path(manifest)
        base_dir = manifest.parent
        df = pd.read_csv(manifest)
        required_cols = [
            cls.MANIFEST_COL_MODEL_LABEL,
            cls.MANIFEST_COL_NUM_GPUS,
            cls.MANIFEST_COL_BATCH_SIZE,
            cls.MANIFEST_COL_TRACE_FILE,
        ]
        missing = [c for c in required_cols if c not in df.columns]
        if missing:
            raise ValueError(f"Manifest {manifest} missing columns: {missing}. Got: {list(df.columns)}")

        traces: dict[str, dict[int, InferenceTrace]] = {}
        for row in df.to_dict(orient="records"):
            label = str(row[cls.MANIFEST_COL_MODEL_LABEL])
            num_gpus = int(row[cls.MANIFEST_COL_NUM_GPUS])
            batch = int(row[cls.MANIFEST_COL_BATCH_SIZE])
            trace_path = base_dir / str(row[cls.MANIFEST_COL_TRACE_FILE])
            if not trace_path.exists():
                raise FileNotFoundError(f"Trace file not found: {trace_path} (model={label}, batch={batch})")
            tdf = pd.read_csv(trace_path)
            if cls.TRACE_COL_TIME not in tdf.columns or cls.TRACE_COL_POWER not in tdf.columns:
                raise ValueError(
                    f"{trace_path} must contain {cls.TRACE_COL_TIME!r} and "
                    f"{cls.TRACE_COL_POWER!r}. Got: {list(tdf.columns)}"
                )
            t = tdf[cls.TRACE_COL_TIME].to_numpy(float)
            p = tdf[cls.TRACE_COL_POWER].to_numpy(float)
            # Only re-order when the file is not already time-sorted.
            if np.any(np.diff(t) < 0):
                # A stable sort keeps samples that share a timestamp in file
                # order; the default argsort kind gives no such guarantee.
                idx = np.argsort(t, kind="stable")
                t, p = t[idx], p[idx]
            traces.setdefault(label, {})[batch] = InferenceTrace(
                t_s=t,
                power_w=p,
                measured_gpus=num_gpus,
            )
        return cls(traces)

    def build_templates(
        self,
        *,
        duration_s: Fraction | float,
        dt_s: Fraction | float,
        steady_skip_s: float = 0.0,
    ) -> InferenceTemplateStore:
        """Build per-GPU power templates for all traces.

        Args:
            duration_s: Total simulation duration (seconds).
            dt_s: Simulation timestep (seconds).
            steady_skip_s: Skip this many seconds from the start of each
                trace to avoid warm-up transients.

        Returns:
            A [`InferenceTemplateStore`][openg2g.datacenter.workloads.inference.InferenceTemplateStore]
                holding the built templates.
        """
        templates: dict[tuple[str, int], np.ndarray] = {}
        batch_sizes_by_model: dict[str, list[int]] = {}
        for label, per_batch in self._traces.items():
            batch_sizes_by_model[label] = sorted(per_batch.keys())
            for batch, tr in per_batch.items():
                tpl = _build_per_gpu_power_template(
                    tr,
                    dt_s=dt_s,
                    duration_s=duration_s,
                    steady_skip_s=steady_skip_s,
                )
                templates[(label, batch)] = tpl
        return InferenceTemplateStore(templates, batch_sizes_by_model)

    def save(self, out_dir: Path) -> None:
        """Save traces and manifest CSV to a directory.

        Writes individual trace CSVs to `out_dir/traces/` and a manifest
        CSV at `out_dir/traces_summary.csv`. The manifest always carries its
        header row, even when the store holds no traces.

        Args:
            out_dir: Output directory (created if missing).
        """
        out_dir = Path(out_dir)
        traces_dir = out_dir / "traces"
        traces_dir.mkdir(parents=True, exist_ok=True)

        manifest_cols = [
            self.MANIFEST_COL_MODEL_LABEL,
            self.MANIFEST_COL_NUM_GPUS,
            self.MANIFEST_COL_BATCH_SIZE,
            self.MANIFEST_COL_TRACE_FILE,
        ]
        summary_rows: list[dict[str, Any]] = []
        # Sorted iteration keeps the manifest row order deterministic.
        for label in sorted(self._traces):
            for batch in sorted(self._traces[label]):
                tr = self._traces[label][batch]
                trace_name = f"{label}_num_gpus_{tr.measured_gpus}_max_num_seqs_{batch}.csv"
                pd.DataFrame(
                    {
                        self.TRACE_COL_TIME: tr.t_s,
                        self.TRACE_COL_POWER: tr.power_w,
                    }
                ).to_csv(traces_dir / trace_name, index=False)
                summary_rows.append(
                    {
                        self.MANIFEST_COL_MODEL_LABEL: label,
                        self.MANIFEST_COL_NUM_GPUS: tr.measured_gpus,
                        self.MANIFEST_COL_BATCH_SIZE: batch,
                        # Stored relative to the manifest so the directory can be moved.
                        self.MANIFEST_COL_TRACE_FILE: f"traces/{trace_name}",
                    }
                )
        # Passing explicit columns guarantees a header line for an empty store;
        # without it the manifest has no columns and `load` cannot parse it.
        pd.DataFrame(summary_rows, columns=manifest_cols).to_csv(out_dir / "traces_summary.csv", index=False)
class InferenceData:
    """LLM inference workload with offline simulation data.

    Bundles model specifications with power templates and latency
    distributions. Validates that all models have matching data entries.

    Instances come in two flavours:

    - built through the constructor / `load` / `ensure`: hold power templates
      (and optionally ITL fits) and are ready for simulation;
    - built through `generate`: hold the raw trace store and ITL fit store
      (no templates) and are meant to be written to disk with `save`.

    Args:
        models: Model specifications as a tuple of
            [`InferenceModelSpec`][openg2g.datacenter.config.InferenceModelSpec].
        power_templates: Pre-built per-GPU power templates for all models
            and batch sizes, created via
            [`InferenceTraceStore.build_templates`][..InferenceTraceStore.build_templates].
        itl_fits: Per-model ITL mixture distributions. Required when using
            controllers that read observed latency (e.g.,
            `OFOBatchSizeController`). When omitted, NaN is reported for
            observed latency.

    Raises:
        TypeError: If `power_templates` is an `InferenceTraceStore` instead of
            an `InferenceTemplateStore`.
        ValueError: If `models` is empty, contains duplicate labels, or a
            model has no power templates (or no ITL fits when `itl_fits` is
            given).
    """

    def __init__(
        self,
        models: tuple[InferenceModelSpec, ...],
        *,
        power_templates: InferenceTemplateStore,
        itl_fits: ITLFitStore | None = None,
    ) -> None:
        # Catch the common mistake of passing the raw trace store directly.
        if isinstance(power_templates, InferenceTraceStore):
            raise TypeError(
                "Expected a InferenceTemplateStore, got InferenceTraceStore. "
                "Call InferenceTraceStore.build_templates() first to create a InferenceTemplateStore."
            )
        if not models:
            raise ValueError("models must not be empty.")
        labels = [ms.model_label for ms in models]
        if len(labels) != len(set(labels)):
            raise ValueError(f"Duplicate model labels: {labels}")
        self._models = models
        self._power_templates: InferenceTemplateStore | None = power_templates
        # Raw stores and samples are only populated by `generate` (via `_from_stores`).
        self._trace_store: InferenceTraceStore | None = None
        self._itl_fit_store: ITLFitStore | None = None
        self._itl_fits = itl_fits
        self._itl_samples_df: pd.DataFrame | None = None
        # Every model must have templates, and ITL fits when fits were supplied.
        for ms in self._models:
            try:
                power_templates.batch_sizes(ms.model_label)
            except KeyError:
                raise ValueError(
                    f"Power templates missing for model {ms.model_label!r}. "
                    f"Ensure InferenceTraceStore contains traces for all models."
                ) from None
            if itl_fits is not None and ms.model_label not in itl_fits.distributions:
                raise ValueError(
                    f"ITL fits missing for model {ms.model_label!r}. "
                    f"Available models in ITLFitStore: {sorted(itl_fits.distributions.keys())}"
                )

    @classmethod
    def generate(
        cls,
        models: tuple[InferenceModelSpec, ...],
        data_sources: dict[str, MLEnergySource],
        *,
        runs: Any = None,
        mlenergy_data_dir: Path | None = None,
        dt_s: float = 0.1,
        seed: int = 0,
        itl_sample_cap: int = 2048,
    ) -> InferenceData:
        """Generate inference data from ML.ENERGY benchmark data.

        Produces power traces and ITL mixture fits for all models and
        batch sizes specified in `data_sources`.

        Args:
            models: Model specifications.
            data_sources: Per-model benchmark data extraction settings,
                keyed by `model_label`.
            runs: Pre-loaded `LLMRuns` object. If `None`, loads from
                `mlenergy_data_dir` or the HuggingFace Hub.
            mlenergy_data_dir: Path to compiled mlenergy-data directory.
                Ignored if `runs` is provided.
            dt_s: Trace timestep (seconds).
            seed: Random seed for ITL fitting.
            itl_sample_cap: Maximum ITL samples per run for fitting.

        Returns:
            A new `InferenceData` with generated traces and ITL fits (no
            templates — call `InferenceTraceStore.build_templates()` on the
            saved/loaded store to get templates).

        Raises:
            ValueError: If no runs are available, a model has no entry in
                `data_sources`, a model has no `model_id`, or a model's
                filter matches zero runs.
        """
        if runs is None:
            # Restrict the load to the tasks actually referenced by the sources.
            unique_tasks = {src.task for src in data_sources.values()}
            if mlenergy_data_dir:
                logger.info("Loading runs from %s (tasks: %s)", mlenergy_data_dir, sorted(unique_tasks))
                runs = LLMRuns.from_directory(str(mlenergy_data_dir), stable_only=False).task(*unique_tasks)
            else:
                logger.info("Loading runs from Hugging Face Hub (tasks: %s)", sorted(unique_tasks))
                runs = LLMRuns.from_hf(stable_only=False).task(*unique_tasks)
        if not runs:
            raise ValueError("No runs found for the specified tasks")

        subsets_by_label: dict[str, Any] = {}
        tl_frames: list[pd.DataFrame] = []
        itl_frames: list[pd.DataFrame] = []
        # Pass 1: select the matching runs for every model and fail fast on gaps.
        for ms in models:
            src = data_sources.get(ms.model_label)
            if src is None:
                raise ValueError(f"No data source for model {ms.model_label!r}")
            model_id = ms.model_id
            if not model_id:
                raise ValueError(f"model_id is required for data generation (model={ms.model_label!r})")
            subset = (
                runs.model_id(model_id).gpu_model(src.gpu).num_gpus(ms.gpus_per_replica).max_num_seqs(*src.batch_sizes)
            )
            if not subset:
                raise ValueError(
                    f"Config matched zero runs: model_id={model_id!r}, "
                    f"gpu={src.gpu!r}, num_gpus={ms.gpus_per_replica}, "
                    f"batch_sizes={src.batch_sizes}"
                )
            subsets_by_label[ms.model_label] = subset
            logger.info(
                "%s: %d runs (model_id=%s, gpu=%s, num_gpus=%d, batches=%s)",
                ms.model_label,
                len(subset),
                model_id,
                src.gpu,
                ms.gpus_per_replica,
                sorted({r.max_num_seqs for r in subset}),
            )

        # Pass 2: fetch the raw result files for all selected runs.
        logger.info("Downloading raw result files for %d models ...", len(subsets_by_label))
        for subset in subsets_by_label.values():
            subset.download_raw_files(file="results")
        logger.info("Downloads complete. Extracting timelines and ITL samples ...")

        # Pass 3: collect one power timeline per run and the ITL samples per model.
        for label, subset in subsets_by_label.items():
            for run in subset:
                tl = run.timelines(metric="power.device_instant")
                tl["model_label"] = label
                tl["num_gpus"] = run.num_gpus
                tl["max_num_seqs"] = run.max_num_seqs
                # Running count of frames collected so far gives each run a unique index.
                tl["run_index"] = len(tl_frames)
                tl_frames.append(tl)
            itl = subset.inter_token_latencies()
            itl["model_label"] = label
            itl_frames.append(itl)
        all_tl = pd.concat(tl_frames, ignore_index=True)
        itl_samples_df = pd.concat(itl_frames, ignore_index=True)

        logger.info("Building trace store (%d timeline rows) and fitting ITL models ...", len(all_tl))
        trace_store = _build_trace_store_from_timelines(all_tl, dt_s=dt_s)
        itl_fit_store = _build_itl_fit_store(itl_samples_df, max_samples=itl_sample_cap, seed=seed)
        return cls._from_stores(
            models,
            trace_store=trace_store,
            itl_fit_store=itl_fit_store,
            itl_samples_df=itl_samples_df,
        )

    @classmethod
    def _from_stores(
        cls,
        models: tuple[InferenceModelSpec, ...],
        *,
        trace_store: InferenceTraceStore,
        itl_fit_store: ITLFitStore,
        itl_samples_df: pd.DataFrame | None = None,
    ) -> InferenceData:
        """Create from raw stores (internal, used by generate).

        Bypasses `__init__` (and therefore its validation) because a
        generate-only instance has no power templates yet.
        """
        instance = object.__new__(cls)
        instance._models = models
        instance._trace_store = trace_store
        instance._itl_fit_store = itl_fit_store
        instance._power_templates = None
        instance._itl_fits = itl_fit_store
        instance._itl_samples_df = itl_samples_df
        return instance

    def save(self, out_dir: Path, *, plot: bool = False) -> None:
        """Save traces and ITL fits to a directory.

        Traces are written only when a raw trace store is present, and
        `latency_fits.csv` only when ITL fits are present. A `_manifest.json`
        recording the current `openg2g` version is always written.

        Args:
            out_dir: Output directory.
            plot: If `True`, also write characterization plots (power
                trajectories, ITL distributions).
        """
        out_dir = Path(out_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
        if self._trace_store is not None:
            self._trace_store.save(out_dir)
        if self._itl_fits is not None:
            self._itl_fits.save(out_dir / "latency_fits.csv")
        # Version stamp read back by `_check_version_stamp` when loading.
        (out_dir / "_manifest.json").write_text(
            json.dumps({"openg2g_version": openg2g.__version__}, indent=2, sort_keys=True)
        )
        # Plots need the raw traces; ITL plots additionally need the raw samples.
        if plot and self._trace_store is not None:
            _plot_power_trajectories(self._trace_store, self._models, out_dir)
            itl_samples = self._itl_samples_df
            if self._itl_fit_store is not None and itl_samples is not None:
                for ms in self._models:
                    _plot_itl_distributions(self._itl_fit_store, itl_samples, ms.model_label, out_dir)

    @classmethod
    def load(
        cls,
        data_dir: Path,
        models: tuple[InferenceModelSpec, ...],
        *,
        duration_s: float = 600.0,
        dt_s: float = 0.1,
        steady_skip_s: float = 0.0,
    ) -> InferenceData:
        """Load from a generated data directory.

        Loads traces from `traces_summary.csv`, builds templates, and
        loads ITL fits from `latency_fits.csv`.

        Args:
            data_dir: Directory containing generated data.
            models: Model specifications.
            duration_s: Simulation duration for template building.
            dt_s: Simulation timestep for template building.
            steady_skip_s: Skip seconds for template building.

        Returns:
            A validated `InferenceData` holding power templates and ITL fits.
        """
        data_dir = Path(data_dir)
        _check_version_stamp(data_dir, "InferenceData")
        store = InferenceTraceStore.load(data_dir / "traces_summary.csv")
        templates = store.build_templates(duration_s=duration_s, dt_s=dt_s, steady_skip_s=steady_skip_s)
        itl_fits = ITLFitStore.load(data_dir / "latency_fits.csv")
        # Going through the constructor re-validates that every model has data.
        return cls(models, power_templates=templates, itl_fits=itl_fits)

    @classmethod
    def ensure(
        cls,
        data_dir: Path,
        models: tuple[InferenceModelSpec, ...],
        data_sources: dict[str, MLEnergySource] | None = None,
        *,
        mlenergy_data_dir: Path | None = None,
        plot: bool = False,
        duration_s: float = 600.0,
        dt_s: float = 0.1,
        steady_skip_s: float = 0.0,
    ) -> InferenceData:
        """Load from `data_dir`, generating first if needed.

        If `data_dir/traces_summary.csv` does not exist, generates
        inference data from ML.ENERGY benchmark data and saves it.
        Then loads and returns.

        Args:
            data_dir: Data directory (generated files go here).
            models: Model specifications.
            data_sources: Per-model benchmark data extraction settings,
                keyed by `model_label`. Required when no cached data exists.
            mlenergy_data_dir: Path to compiled mlenergy-data directory.
            plot: If `True`, generate characterization plots on generation.
            duration_s: Simulation duration for template building.
            dt_s: Simulation timestep for template building.
            steady_skip_s: Skip seconds for template building.

        Raises:
            ValueError: If no cached data exists and `data_sources` is `None`.
        """
        data_dir = Path(data_dir)
        # The manifest CSV is the sole marker used to decide whether a cache exists.
        if not (data_dir / "traces_summary.csv").exists():
            if data_sources is None:
                raise ValueError("data_sources required for InferenceData generation (no cached data)")
            logger.info("Generating inference data to %s ...", data_dir)
            # `seed` and `itl_sample_cap` are not forwarded, so `generate` uses its defaults.
            cls.generate(
                models,
                data_sources,
                mlenergy_data_dir=mlenergy_data_dir,
                dt_s=dt_s,
            ).save(data_dir, plot=plot)
        return cls.load(data_dir, models, duration_s=duration_s, dt_s=dt_s, steady_skip_s=steady_skip_s)

    @property
    def models(self) -> tuple[InferenceModelSpec, ...]:
        """The model specifications."""
        return self._models

    @property
    def power_templates(self) -> InferenceTemplateStore:
        """The pre-built power templates.

        Raises:
            RuntimeError: If this instance holds no templates (it was created
                by `generate` and not yet loaded from disk).
        """
        if self._power_templates is None:
            raise RuntimeError("power_templates not available (generate-only instance). Load from disk first.")
        return self._power_templates

    @property
    def itl_fits(self) -> ITLFitStore | None:
        """The ITL fit store, or `None` when no fits were provided."""
        return self._itl_fits
-def _check_version_stamp(data_dir: Path, label: str) -> None:
-    """Log a warning if cached data was generated with a different openg2g version."""
-    manifest_path = data_dir / "_manifest.json"
-    if not manifest_path.exists():
-        return
-    try:
-        manifest = json.loads(manifest_path.read_text())
-    except (json.JSONDecodeError, OSError):
-        return
-    cached_version = manifest.get("openg2g_version", "unknown")
-    if cached_version != openg2g.__version__:
-        logger.warning(
-            "%s: cached data generated with openg2g %s (current %s). Consider regenerating.",
-            label,
-            cached_version,
-            openg2g.__version__,
-        )
-def _build_trace_store_from_timelines(tl: pd.DataFrame, *, dt_s: float) -> InferenceTraceStore:
-    """Build an InferenceTraceStore from raw timeline data.
-    Args:
-        tl: Combined timeline dataframe with columns `model_label`,
-            `num_gpus`, `max_num_seqs`, `run_index`, `relative_time_s`, `value`.
-        dt_s: Resampling timestep.
-    Returns:
-        An InferenceTraceStore with median-aggregated traces.
-    """
-    if tl.empty:
-        raise ValueError("No timeline rows extracted from selected runs")
-    traces: dict[str, dict[int, InferenceTrace]] = {}
-    keys = [
-        InferenceTraceStore.MANIFEST_COL_MODEL_LABEL,
-        InferenceTraceStore.MANIFEST_COL_NUM_GPUS,
-        InferenceTraceStore.MANIFEST_COL_BATCH_SIZE,
-    ]
-    for key, g in tl.groupby(keys, dropna=False):
-        if not isinstance(key, tuple):
-            raise TypeError(f"Expected tuple groupby key, got {type(key).__name__}")
-        model_label, num_gpus, batch = str(key[0]), cast(int, key[1]), cast(int, key[2])
-        series_list: list[tuple[np.ndarray, np.ndarray]] = []
-        t_ends: list[float] = []
-        for _run_index, rg in g.groupby("run_index"):
-            rr = rg.sort_values("relative_time_s")
-            t = rr["relative_time_s"].to_numpy(dtype=float)
-            p = rr["value"].to_numpy(dtype=float)
-            if t.size < 2:
-                continue
-            series_list.append((t, p))
-            t_ends.append(float(t[-1]))
-        if not series_list:
-            continue
-        t_end = float(np.median(np.asarray(t_ends, dtype=float)))
-        grid = np.arange(0.0, t_end + 1e-12, float(dt_s), dtype=float)
-        mats: list[np.ndarray] = []
-        for t, p in series_list:
-            mats.append(np.interp(grid, t, p, left=p[0], right=p[-1]))
-        mat = np.vstack(mats)
-        p_med = np.median(mat, axis=0)
-        traces.setdefault(model_label, {})[batch] = InferenceTrace(
-            t_s=grid,
-            power_w=p_med,
-            measured_gpus=int(num_gpus),
-        )
-    if not traces:
-        raise ValueError("No trace profiles extracted from timeline data")
-    return InferenceTraceStore(traces)
-def _build_itl_fit_store(
-    itl: pd.DataFrame,
-    *,
-    max_samples: int,
-    seed: int,
-) -> ITLFitStore:
-    """Build an ITLFitStore from raw ITL sample data.
-    Args:
-        itl: ITL sample dataframe with columns `model_label`, `num_gpus`,
-            `max_num_seqs`, `itl_s`.
-        max_samples: Maximum ITL samples per group for fitting.
-        seed: Random seed for ITL fitting.
-    Returns:
-        An ITLFitStore with fitted mixture distributions.
-    """
-    if itl.empty:
-        raise ValueError("No ITL samples provided")
-    distributions: dict[str, dict[int, ITLMixtureModel]] = {}
-    for key, g in itl.groupby(["model_label", "num_gpus", "max_num_seqs"], dropna=False):
-        if not isinstance(key, tuple):
-            raise TypeError(f"Expected tuple groupby key, got {type(key).__name__}")
-        model_label, _num_gpus, batch = str(key[0]), cast(int, key[1]), cast(int, key[2])
-        fit = ITLMixtureModel.fit(
-            g["itl_s"].to_numpy(dtype=float),
-            max_samples=max_samples,
-            seed=seed,
-        )
-        distributions.setdefault(model_label, {})[batch] = fit
-    if not distributions:
-        raise ValueError("No ITL fits produced")
-    return ITLFitStore(distributions)
@dataclass(frozen=True)
class InferenceAugmentedPower:
    """Result of inference power augmentation for one simulation timestep.

    Frozen dataclass: fields cannot be reassigned after construction (the
    dict fields themselves remain ordinary mutable dicts).

    Attributes:
        power_w: Three-phase inference power (watts), excluding base load.
        power_by_model_w: Per-model total active power (watts).
        active_replicas_by_model: Per-model active replica count.
    """

    # Per-phase inference power totals.
    power_w: ThreePhase
    # Keyed by model label; defaults to a fresh empty dict per instance.
    power_by_model_w: dict[str, float] = field(default_factory=dict)
    # Keyed by model label; defaults to a fresh empty dict per instance.
    active_replicas_by_model: dict[str, int] = field(default_factory=dict)
class InferencePowerAugmenter:
    """Turns per-GPU inference power into three-phase inference power.

    For every model, the per-server per-GPU power values are scaled by the
    server layout (GPUs per server, amplitude scale), optionally perturbed
    with noise, restricted to the currently active servers, and summed per
    electrical phase.

    This class is backend-agnostic. The offline datacenter feeds it
    template-indexed values; the online datacenter can feed it
    live-measured values. The datacenter backend is responsible for
    adding facility base load on top of the returned inference power.

    Args:
        layouts: Per-model server layouts (physical topology).
        policies: Per-model activation policies determining which servers
            are active at each timestep.
        seed: Random seed for noise RNG.
    """

    def __init__(
        self,
        layouts: dict[str, ServerLayout],
        policies: dict[str, ActivationPolicy],
        seed: int = 0,
    ) -> None:
        self._seed = int(seed)
        self._rng = np.random.default_rng(self._seed)
        self._layouts = layouts
        self._policies = policies

    def augment(
        self,
        per_gpu_by_model: dict[str, np.ndarray],
        t: float,
    ) -> InferenceAugmentedPower:
        """Augment per-server per-GPU power to three-phase power.

        Args:
            per_gpu_by_model: Mapping of model label to per-GPU power
                array of shape `(num_servers,)`. Only models with active
                replicas should be included.
            t: Current simulation time (seconds).

        Returns:
            Augmented inference power with three-phase totals, per-model
                power, and per-model active replica counts.
        """
        totals = np.zeros(3, dtype=float)
        model_power_w: dict[str, float] = {}
        replica_counts: dict[str, int] = {}

        for label, per_gpu in per_gpu_by_model.items():
            layout = self._layouts[label]
            policy = self._policies[label]

            # Per-server power before noise: per-GPU value x GPU count x scale.
            per_server = per_gpu * layout.gpus_per_server_list * layout.amplitude_scales
            if layout.noise_fraction > 0:
                # Noise amplitude is proportional to the power level, floored at 1.0.
                floor = np.maximum(per_server, 1.0)
                jitter = self._rng.normal(0.0, 1.0, size=layout.num_servers)
                per_server += jitter * layout.noise_fraction * floor
            # Noise must not push a server below zero power.
            per_server = np.maximum(per_server, 0.0)

            active = policy.active_indices(t)
            active_w = per_server[active]

            # Unbuffered add so servers sharing a phase index all accumulate.
            contribution = np.zeros(3, dtype=float)
            np.add.at(contribution, layout.phase_list[active], active_w)
            totals += contribution

            model_power_w[label] = float(np.sum(active_w))
            gpu_count = int(np.sum(layout.gpus_per_server_list[active]))
            replica_counts[label] = gpu_count // layout.gpus_per_replica

        phase_a = float(totals[0])
        phase_b = float(totals[1])
        phase_c = float(totals[2])
        return InferenceAugmentedPower(
            power_w=ThreePhase(a=phase_a, b=phase_b, c=phase_c),
            power_by_model_w=model_power_w,
            active_replicas_by_model=replica_counts,
        )

    def reset(self) -> None:
        """Restore the noise RNG to the state it had right after construction."""
        self._rng = np.random.default_rng(self._seed)
-def _lognorm_pdf(x: np.ndarray, sigma: float, scale: float) -> np.ndarray:
-    """Standard lognormal PDF: f(x; sigma, scale) for x > 0."""
-    x = np.asarray(x, dtype=float)
-    out = np.zeros_like(x)
-    mask = x > 0
-    xx = x[mask]
-    out[mask] = (1.0 / (xx * sigma * np.sqrt(2.0 * np.pi))) * np.exp(-(np.log(xx / scale) ** 2) / (2.0 * sigma * sigma))
-    return out
def _plot_power_trajectories(
    trace_store: InferenceTraceStore,
    models: tuple[InferenceModelSpec, ...],
    out_dir: Path,
    *,
    rolling_window: int = 10,
) -> None:
    """Plot total GPU power trajectories per batch size.

    One subplot per model. Each curve is a different batch size.
    Saves to `out_dir / "power_trajectories.png"`.

    Args:
        trace_store: Store whose traces are plotted.
        models: Model specifications; one subplot row is drawn per model, in order.
        out_dir: Directory the PNG is written to (created if missing).
        rolling_window: Width (in samples) of the moving-average smoothing.
            Smoothing is skipped when this is `<= 1` or the trace is shorter
            than the window.
    """
    import matplotlib.pyplot as plt

    model_labels = [m.model_label for m in models]
    n_models = len(model_labels)
    fig, axes = plt.subplots(n_models, 1, figsize=(10, 5), dpi=160, squeeze=False)
    # pyplot keeps every figure alive until it is closed, so close it even
    # when plotting or saving fails.
    try:
        panel_labels = "abcdefghij"
        cmap = plt.get_cmap("tab10")
        for row, model_label in enumerate(model_labels):
            ax = axes[row, 0]
            per_batch = trace_store._traces.get(model_label, {})
            if not per_batch:
                ax.set_title(f"{model_label} (no traces found)")
                continue
            batches = sorted(per_batch.keys())
            for i, batch in enumerate(batches):
                tr = per_batch[batch]
                time_s = tr.t_s
                # Division allocates a new array, so the stored trace is untouched.
                power_kw = tr.power_w / 1000.0
                if rolling_window > 1 and len(power_kw) >= rolling_window:
                    kernel = np.ones(rolling_window) / rolling_window
                    smoothed = np.convolve(power_kw, kernel, mode="same")
                    # mode="same" zero-pads at both ends, which drags the edges
                    # down; restore the raw samples there.
                    half = rolling_window // 2
                    smoothed[:half] = power_kw[:half]
                    smoothed[-half:] = power_kw[-half:]
                    power_kw = smoothed
                # Cycle through the palette so more batches than palette
                # entries do not all collapse onto the last colour.
                ax.plot(time_s, power_kw, label=f"batch={batch}", color=cmap(i % cmap.N))
            label_char = panel_labels[row] if row < len(panel_labels) else ""
            # The GPU count shown in the title is taken from the smallest batch size.
            num_gpus = per_batch[batches[0]].measured_gpus
            gpu_suffix = "GPUs" if num_gpus > 1 else "GPU"
            ax.set_title(
                f"({label_char}) {model_label}: Total-GPU Power ({num_gpus} {gpu_suffix})",
                fontsize=13,
            )
            ax.set_ylabel("Power (kW)", fontsize=11)
            # A single legend on the first row.
            if row == 0:
                ax.legend(fontsize=9, ncol=len(batches), loc="lower center", frameon=True, framealpha=0.9)
            ax.grid(True, alpha=0.3)
            ax.set_xlim(left=0)
        axes[-1, 0].set_xlabel("Time (seconds)", fontsize=11)
        fig.tight_layout()
        save_path = out_dir / "power_trajectories.png"
        save_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path, bbox_inches="tight", pad_inches=0.02)
    finally:
        plt.close(fig)
    logger.info("Saved power trajectories plot to %s", save_path)
-def _plot_itl_distributions(
-    itl_fit_store: ITLFitStore,
-    itl_samples_df: pd.DataFrame,
-    model_label: str,
-    out_dir: Path,
-    *,
-    hist_bins: int = 120,
-    hist_alpha: float = 0.12,
-    x_lo_q: float = 0.5,
-    x_hi_q: float = 99.5,
-    grid_n: int = 1200,
-) -> None:
-    """Plot ITL mixture distribution overlay for one model.
-    Shows the fitted mixture PDF for each batch size overlaid, with
-    histograms and an inset showing steady/stall component decomposition
-    for the largest batch size. Saves to `out_dir / "itl_distributions_{model_label}.png"`.
-    """
-    import matplotlib.pyplot as plt
-    from mpl_toolkits.axes_grid1.inset_locator import inset_axes
-    model_dists = itl_fit_store.distributions.get(model_label, {})
-    if not model_dists:
-        logger.warning("No ITL distributions for model %s, skipping plot", model_label)
-        return
-    samples = itl_samples_df[itl_samples_df["model_label"] == model_label]
-    batches = sorted(model_dists.keys())
-    all_x = samples[samples["max_num_seqs"].isin(batches)]["itl_s"].to_numpy(dtype=float)
-    if len(all_x) == 0:
-        logger.warning("No ITL samples for model %s, skipping plot", model_label)
-        return
-    lo = float(np.percentile(all_x, x_lo_q))
-    hi = float(np.percentile(all_x, x_hi_q))
-    grid = np.linspace(lo, hi, grid_n)
-    fig, ax = plt.subplots(figsize=(7.2, 3.2), dpi=160)
-    cmap = plt.get_cmap("tab10") if len(batches) <= 10 else plt.get_cmap("tab20")
-    colors = {b: cmap(i % cmap.N) for i, b in enumerate(batches)}
-    for b in batches:
-        model = model_dists[b]
-        params = model.to_dict()
-        loc = float(params["loc"])
-        pi = float(params["pi_steady"])
-        s1 = float(params["sigma_steady"])
-        sc1 = float(params["scale_steady"])
-        s2 = float(params["sigma_stall"])
-        sc2 = float(params["scale_stall"])
-        shifted = grid - loc
-        pdf_mix = pi * _lognorm_pdf(shifted, s1, sc1) + (1 - pi) * _lognorm_pdf(shifted, s2, sc2)
-        c = colors[b]
-        bsamp = samples[samples["max_num_seqs"] == b]["itl_s"].to_numpy(dtype=float)
-        if len(bsamp) > 0:
-            ax.hist(bsamp, bins=hist_bins, range=(lo, hi), density=True, alpha=hist_alpha, color=c)
-        ax.plot(grid, pdf_mix, linewidth=2.2, color=c, label=f"batch={b}")
-    ax.set_title(f"(a) {model_label}: ITL distribution vs batch size")
-    ax.set_xlabel("Inter-token latency (seconds)")
-    ax.set_ylabel("Density")
-    ax.legend(ncol=4, fontsize=9, frameon=True)
-    ax.set_xlim(lo, hi)
-    inset_batch = max(batches)
-    inset_model = model_dists[inset_batch]
-    inset_params = inset_model.to_dict()
-    loc = float(inset_params["loc"])
-    pi = float(inset_params["pi_steady"])
-    s1 = float(inset_params["sigma_steady"])
-    sc1 = float(inset_params["scale_steady"])
-    s2 = float(inset_params["sigma_stall"])
-    sc2 = float(inset_params["scale_stall"])
-    bsamp = samples[samples["max_num_seqs"] == inset_batch]["itl_s"].to_numpy(dtype=float)
-    lo_i = float(np.percentile(bsamp, 0.5)) if len(bsamp) > 0 else lo
-    hi_i = float(np.percentile(bsamp, 99.5)) if len(bsamp) > 0 else hi
-    grid_i = np.linspace(lo_i, hi_i, 600)
-    shifted_i = grid_i - loc
-    pdf_steady = pi * _lognorm_pdf(shifted_i, s1, sc1)
-    pdf_stall = (1 - pi) * _lognorm_pdf(shifted_i, s2, sc2)
-    pdf_mix_i = pdf_steady + pdf_stall
-    axins = inset_axes(
-        ax,
-        width="38%",
-        height="55%",
-        loc="lower right",
-        bbox_to_anchor=(-0.1, 0.1, 1, 1),
-        bbox_transform=ax.transAxes,
-    )
-    if len(bsamp) > 0:
-        axins.hist(bsamp, bins=60, range=(lo_i, hi_i), density=True, alpha=0.20, color=colors[inset_batch])
-    axins.plot(grid_i, pdf_mix_i, lw=2.0, color=colors[inset_batch], label="mixture")
-    axins.plot(grid_i, pdf_steady, lw=1.6, ls="--", color="0.25", label="steady")
-    axins.plot(grid_i, pdf_stall, lw=1.6, ls=":", color="0.25", label="stall")
-    axins.set_title(f"(b) batch={inset_batch} components", fontsize=9)
-    axins.set_xlim(lo_i, hi_i)
-    axins.tick_params(axis="both", labelsize=8)
-    axins.grid(True, alpha=0.25)
-    axins.legend(fontsize=8, frameon=True, loc="upper right")
-    fig.tight_layout()
-    save_path = out_dir / f"itl_distributions_{model_label}.png"
-    save_path.parent.mkdir(parents=True, exist_ok=True)
-    fig.savefig(save_path, bbox_inches="tight", pad_inches=0.02)
-    plt.close(fig)
-    logger.info("Saved ITL distributions plot for %s to %s", model_label, save_path)
-class RequestsConfig(BaseModel):
-    """Configuration for building per-model JSONL request files.
-    Instances are immutable (`frozen=True`).
-    Attributes:
-        dataset: Dataset to sample prompts from (`"gpqa"` or `"lm-arena-chat"`).
-            `RequestStore.generate` raises `ValueError` for any other name.
-        num_requests: Number of requests to sample per model. If the dataset
-            yields fewer prompts, already-sampled prompts are re-drawn at
-            random until this count is reached.
-        max_completion_tokens: Maximum output tokens per request.
-        seed: Random seed for dataset shuffling and oversampling.
-        system_prompt: System prompt prepended to every request. An empty
-            string omits the system message entirely.
-    """
-    # Reject attribute assignment after construction.
-    model_config = ConfigDict(frozen=True)
-    dataset: str = "lm-arena-chat"
-    num_requests: int = 1000
-    max_completion_tokens: int = 512
-    seed: int = 0
-    system_prompt: str = "You are a helpful AI assistant."
-class RequestStore:
-    """Per-model request dicts for online load generation.
-    Each model's requests are stored as a list of OpenAI Chat Completion
-    streaming request dicts, suitable for submission to a vLLM server.
-    Attributes:
-        requests_by_model: Mapping from model label to request dicts.
-    """
-    def __init__(self, requests_by_model: dict[str, list[dict]]) -> None:
-        self.requests_by_model = requests_by_model
-    @classmethod
-    def generate(
-        cls,
-        models: Sequence[InferenceModelSpec],
-        config: RequestsConfig | None = None,
-        *,
-        extra_body_by_model: dict[str, dict] | None = None,
-    ) -> RequestStore:
-        """Sample prompts and build per-model request dicts.
-        Requires `pip install datasets openai`.
-        Args:
-            models: Model specifications. Uses `model_id` for the API
-                model field.
-            config: Request generation config. Uses defaults if `None`.
-            extra_body_by_model: Optional per-model extra fields merged
-                into every request dict (e.g. `chat_template_kwargs`).
-                Keyed by `model_label`.
-        Returns:
-            A new store holding `config.num_requests` requests per model.
-        Raises:
-            ValueError: If `config.dataset` is not a known dataset, or if
-                the dataset yields no prompts to oversample from.
-        """
-        # Imported lazily so the optional `datasets`/`openai` packages are
-        # only needed when requests are actually generated.
-        import random as _random
-        from datasets import load_dataset
-        from openai.types.chat import (
-            ChatCompletionAssistantMessageParam,
-            ChatCompletionContentPartTextParam,
-            ChatCompletionMessageParam,
-            ChatCompletionSystemMessageParam,
-            ChatCompletionUserMessageParam,
-        )
-        from openai.types.chat.completion_create_params import CompletionCreateParamsStreaming
-        if config is None:
-            config = RequestsConfig()
-        def _text_part(text: str) -> ChatCompletionContentPartTextParam:
-            return ChatCompletionContentPartTextParam(type="text", text=text)
-        def _prompt_to_messages(prompt: str | list[str]) -> list[ChatCompletionMessageParam]:
-            # A plain string is a single user turn; a list alternates
-            # user / assistant / user ... starting with the user.
-            if isinstance(prompt, str):
-                return [ChatCompletionUserMessageParam(role="user", content=[_text_part(prompt)])]
-            msgs: list[ChatCompletionMessageParam] = [
-                ChatCompletionUserMessageParam(role="user", content=[_text_part(prompt[0])])
-            ]
-            for i, turn in enumerate(prompt[1:]):
-                if i % 2 == 0:
-                    msgs.append(ChatCompletionAssistantMessageParam(role="assistant", content=[_text_part(turn)]))
-                else:
-                    msgs.append(ChatCompletionUserMessageParam(role="user", content=[_text_part(turn)]))
-            return msgs
-        def _maybe_oversample(items: list, target: int, seed: int) -> None:
-            # Pads `items` in place with random repeats until it has `target` entries.
-            if len(items) >= target:
-                return
-            if not items:
-                # rng.choice([]) would otherwise fail with an opaque IndexError.
-                raise ValueError("Cannot oversample: the dataset yielded no prompts.")
-            rng = _random.Random(seed)
-            original = list(items)
-            while len(items) < target:
-                items.append(rng.choice(original))
-        def _sample_lm_arena_chat(num_requests: int, seed: int) -> list[str | list[str]]:
-            data = load_dataset("lmarena-ai/arena-human-preference-100k", split="train").shuffle(seed=seed)
-            prompts: list[str | list[str]] = []
-            for item in data:
-                num_turns = item["turn"]
-                conversation = item["conversation_a"]
-                # One prompt per turn: the first 2*k+1 messages of the conversation.
-                # NOTE(review): presumably messages alternate user/assistant so each
-                # prefix ends on a user message -- confirm against the dataset.
-                for turns in range(num_turns):
-                    if len(prompts) >= num_requests:
-                        break
-                    messages: list[str] = [message["content"] for message in conversation[: 2 * turns + 1]]
-                    prompts.append(messages if len(messages) > 1 else messages[0])
-                if len(prompts) >= num_requests:
-                    break
-            _maybe_oversample(prompts, num_requests, seed)
-            return prompts
-        def _sample_gpqa(num_requests: int, seed: int) -> list[str | list[str]]:
-            data = load_dataset("Idavidrein/gpqa", "gpqa_extended", split="train", streaming=True).shuffle(seed=seed)
-            # Local generator: yields the same shuffles as seeding the global
-            # `random` module, without clobbering the caller's global RNG state.
-            rng = _random.Random(seed)
-            prompts: list[str | list[str]] = []
-            for item in data:
-                if len(prompts) >= num_requests:
-                    break
-                choices = [
-                    item["Incorrect Answer 1"].strip(),
-                    item["Incorrect Answer 2"].strip(),
-                    item["Incorrect Answer 3"].strip(),
-                    item["Correct Answer"].strip(),
-                ]
-                rng.shuffle(choices)
-                question = item["Question"]
-                prompt = f"What is the correct answer to the following question: {question}\n\nChoices:"
-                for letter, choice in zip("ABCD", choices, strict=True):
-                    prompt += f"\n({letter}) {choice}"
-                prompts.append(prompt)
-            _maybe_oversample(prompts, num_requests, seed)
-            return prompts
-        samplers = {"lm-arena-chat": _sample_lm_arena_chat, "gpqa": _sample_gpqa}
-        sampler = samplers.get(config.dataset)
-        if sampler is None:
-            raise ValueError(f"Unknown dataset: {config.dataset!r}. Available: {sorted(samplers)}")
-        extra = extra_body_by_model or {}
-        requests_by_model: dict[str, list[dict]] = {}
-        for spec in models:
-            label = spec.model_label
-            model_id = spec.model_id
-            logger.info("Sampling %d %s prompts for %s (%s)...", config.num_requests, config.dataset, label, model_id)
-            prompts = sampler(num_requests=config.num_requests, seed=config.seed)
-            system_msgs: list[ChatCompletionMessageParam] = []
-            if config.system_prompt:
-                system_msgs.append(ChatCompletionSystemMessageParam(role="system", content=config.system_prompt))
-            # Shared request skeleton; only `messages` differs between requests.
-            template = CompletionCreateParamsStreaming(
-                model=model_id,
-                messages=system_msgs,
-                max_completion_tokens=config.max_completion_tokens,
-                stream=True,
-                stream_options={"include_usage": True, "continuous_usage_stats": True},
-            )
-            if label in extra:
-                template.update(extra[label])
-            reqs: list[dict] = []
-            for prompt in prompts:
-                # Shallow copy: `messages` is rebuilt per request, other values are shared.
-                request = dict(template)
-                request["messages"] = list(template["messages"]) + _prompt_to_messages(prompt)
-                reqs.append(request)
-            requests_by_model[label] = reqs
-        return cls(requests_by_model)
-    def save(self, out_dir: Path) -> None:
-        """Write per-model JSONL files to `out_dir`.
-        One file `{model_label}.jsonl` is written per model, one JSON
-        request per line.
-        Args:
-            out_dir: Output directory. Created if it doesn't exist.
-        """
-        out_dir = Path(out_dir)
-        out_dir.mkdir(parents=True, exist_ok=True)
-        for label, reqs in self.requests_by_model.items():
-            out_path = out_dir / f"{label}.jsonl"
-            # Explicit encoding so the files do not depend on the platform locale.
-            with open(out_path, "w", encoding="utf-8") as f:
-                f.writelines(json.dumps(req) + "\n" for req in reqs)
-            logger.info("Wrote %d requests for %s to %s", len(reqs), label, out_path)
-    @classmethod
-    def load(cls, out_dir: Path) -> RequestStore:
-        """Load per-model JSONL files from `out_dir`.
-        The model label is taken from each file's stem; blank lines are
-        skipped.
-        Args:
-            out_dir: Directory containing `{model_label}.jsonl` files.
-        """
-        requests_by_model: dict[str, list[dict]] = {}
-        out_dir = Path(out_dir)
-        for path in sorted(out_dir.glob("*.jsonl")):
-            label = path.stem
-            with open(path, encoding="utf-8") as f:
-                reqs: list[dict] = [json.loads(line) for line in map(str.strip, f) if line]
-            requests_by_model[label] = reqs
-            logger.info("Loaded %d requests for %s", len(reqs), label)
-        return cls(requests_by_model)
-    @classmethod
-    def ensure(
-        cls,
-        out_dir: Path,
-        models: Sequence[InferenceModelSpec] | None = None,
-        config: RequestsConfig | None = None,
-        *,
-        extra_body_by_model: dict[str, dict] | None = None,
-    ) -> RequestStore:
-        """Load request files from `out_dir`, generating first if needed.
-        Args:
-            out_dir: Directory for JSONL files.
-            models: Required if request files don't exist yet.
-            config: Request generation config. Uses defaults if `None`.
-            extra_body_by_model: Optional per-model extra fields for
-                request generation. Keyed by `model_label`.
-        Raises:
-            ValueError: If no cached request files exist and `models` is `None`.
-        """
-        out_dir = Path(out_dir)
-        # A directory without any *.jsonl file (e.g. left behind by an aborted
-        # run) is not a usable cache; treat it the same as a missing directory.
-        has_cache = out_dir.is_dir() and any(out_dir.glob("*.jsonl"))
-        if not has_cache:
-            if models is None:
-                raise ValueError("models required (no cached request data)")
-            logger.info("Generating request files to %s ...", out_dir)
-            cls.generate(models, config, extra_body_by_model=extra_body_by_model).save(out_dir)
-        return cls.load(out_dir)

openg2g/datacenter/workloads/training.py DELETED Viewed

@@ -1,200 +0,0 @@
-"""Training workload: typed trace data and periodic overlay evaluation."""
-from __future__ import annotations
-import logging
-from dataclasses import dataclass
-from pathlib import Path
-import numpy as np
-import pandas as pd
-from pydantic import BaseModel, ConfigDict
-logger = logging.getLogger(__name__)
-class TrainingTraceParams(BaseModel):
-    """Parameters for synthetic training-like power trace generation.
-    The trace alternates between high and low power plateaus (starting
-    high), adds Gaussian noise, smooths with a moving average, subtracts
-    brief random dips, and finally caps the start with a warm-up ramp.
-    Instances are immutable (`frozen=True`).
-    Attributes:
-        duration_s: Total duration (seconds).
-        dt_s: Timestep (seconds).
-        seed: Random seed.
-        P_hi: High plateau power (W). Also the power level at which the
-            warm-up ramp ends.
-        P_lo: Low plateau power (W).
-        sigma_hi: Noise std in high plateaus (W).
-        sigma_lo: Noise std in low plateaus (W).
-        seg_lo_range: Duration range for low segments (seconds).
-        seg_hi_range: Duration range for high segments (seconds).
-        dip_prob_per_sec: Expected brief dips per second.
-        dip_depth_range: Depth range for brief dips (W below current level).
-        dip_dur_range: Duration range for brief dips (seconds).
-        smooth_window_s: Smoothing window width (seconds). Windows of at
-            most one sample disable smoothing.
-        ramp_s: Initial warm-up ramp duration (seconds). A value `<= 0`
-            disables the ramp.
-        ramp_from: Power at ramp start (W).
-    """
-    # Reject attribute assignment after construction.
-    model_config = ConfigDict(frozen=True)
-    duration_s: float = 1000.0
-    dt_s: float = 0.1
-    seed: int = 2
-    P_hi: float = 225.0
-    P_lo: float = 175.0
-    sigma_hi: float = 50.0
-    sigma_lo: float = 50.0
-    seg_lo_range: tuple[float, float] = (10.0, 15.0)
-    seg_hi_range: tuple[float, float] = (35.0, 40.0)
-    dip_prob_per_sec: float = 0.010
-    dip_depth_range: tuple[float, float] = (120.0, 125.0)
-    dip_dur_range: tuple[float, float] = (0.06, 0.14)
-    smooth_window_s: float = 0.30
-    ramp_s: float = 18.0
-    ramp_from: float = 50.0
-def _generate_training_like_trace(params: TrainingTraceParams) -> tuple[np.ndarray, np.ndarray]:
-    """Generate a synthetic training-like per-GPU power trace.
-    Steps, in order: build a high/low plateau envelope, add Gaussian
-    noise, smooth with a moving average, subtract brief dips (clamped at
-    0 W), then cap the first `ramp_s` seconds with a linear warm-up ramp.
-    Args:
-        params: Generation parameters.
-    Returns:
-        Tuple of (time_array, power_array).
-    """
-    rng = np.random.default_rng(params.seed)
-    t = np.arange(0.0, params.duration_s, params.dt_s)
-    n = t.size
-    # Plateau envelope: alternating high/low segments, starting high.
-    env = np.empty(n, dtype=float)
-    i = 0
-    state_hi = True
-    while i < n:
-        if state_hi:
-            dur = rng.uniform(*params.seg_hi_range)
-            level = params.P_hi
-        else:
-            dur = rng.uniform(*params.seg_lo_range)
-            level = params.P_lo
-        # Every segment spans at least one sample. Without the floor, a
-        # duration shorter than half a timestep rounds to 0 samples and the
-        # loop would never advance.
-        seg_len = max(1, int(np.round(dur / params.dt_s)))
-        j = min(n, i + seg_len)
-        env[i:j] = level
-        i = j
-        state_hi = not state_hi
-    # Plateau-dependent Gaussian noise; the midpoint separates high from low.
-    noise = np.zeros(n, dtype=float)
-    hi_mask = env > (params.P_hi + params.P_lo) / 2
-    noise[hi_mask] = rng.normal(0.0, params.sigma_hi, size=hi_mask.sum())
-    noise[~hi_mask] = rng.normal(0.0, params.sigma_lo, size=(~hi_mask).sum())
-    p = env + noise
-    # Moving-average smoothing over `smooth_window_s`.
-    w = max(1, int(np.round(params.smooth_window_s / params.dt_s)))
-    if w > 1:
-        kernel = np.ones(w) / w
-        p = np.convolve(p, kernel, mode="same")
-    # Brief dips: a Poisson-distributed count placed uniformly in time.
-    n_dips = rng.poisson(params.dip_prob_per_sec * params.duration_s)
-    for _ in range(n_dips):
-        t0 = rng.uniform(0.0, params.duration_s)
-        k0 = int(t0 / params.dt_s)
-        dur = rng.uniform(*params.dip_dur_range)
-        k1 = min(n, k0 + int(np.round(dur / params.dt_s)))
-        if k1 <= k0:
-            # Dip shorter than one sample (or past the end): nothing to apply.
-            continue
-        depth = rng.uniform(*params.dip_depth_range)
-        p[k0:k1] = np.maximum(p[k0:k1] - depth, 0.0)
-    # Warm-up: the trace may not exceed a linear ramp from `ramp_from` to `P_hi`.
-    if params.ramp_s > 0:
-        k_ramp = min(n, int(np.round(params.ramp_s / params.dt_s)))
-        ramp = np.linspace(params.ramp_from, params.P_hi, k_ramp)
-        p[:k_ramp] = np.minimum(p[:k_ramp], ramp)
-    return t, p
-@dataclass(frozen=True)
-class TrainingTrace:
-    """Power trace of a single GPU running a training job.
-    Attributes:
-        t_s: Time vector (seconds), monotonically increasing.
-        power_w: Power vector (watts) for one GPU, same length as `t_s`.
-    """
-    # CSV column names used by `save` and `load` (not dataclass fields).
-    COL_TIME = "t_s"
-    COL_POWER = "power_W"
-    t_s: np.ndarray
-    power_w: np.ndarray
-    def __post_init__(self) -> None:
-        n_time = len(self.t_s)
-        n_power = len(self.power_w)
-        if n_time != n_power:
-            raise ValueError(f"t_s and power_w must have the same length, got {n_time} and {n_power}")
-        if n_time < 2:
-            raise ValueError("Training trace must have >= 2 samples.")
-    @classmethod
-    def generate(cls, params: TrainingTraceParams | None = None) -> TrainingTrace:
-        """Build a synthetic training-like power trace.
-        Args:
-            params: Generation parameters. Uses defaults if `None`.
-        Returns:
-            A new [`TrainingTrace`][.] with generated data.
-        """
-        effective = TrainingTraceParams() if params is None else params
-        times, power = _generate_training_like_trace(effective)
-        return cls(t_s=times, power_w=power)
-    def save(self, csv_path: Path) -> None:
-        """Write the trace to a CSV file, creating parent directories.
-        Args:
-            csv_path: Output CSV path.
-        """
-        csv_path = Path(csv_path)
-        csv_path.parent.mkdir(parents=True, exist_ok=True)
-        columns = {self.COL_TIME: self.t_s, self.COL_POWER: self.power_w}
-        pd.DataFrame(columns).to_csv(csv_path, index=False)
-    @classmethod
-    def load(cls, csv_path: Path) -> TrainingTrace:
-        """Read a training trace from CSV.
-        Negative power values are clipped to 0 and rows are reordered by
-        time if the time column is not already non-decreasing.
-        Args:
-            csv_path: Path to CSV with columns `t_s` and `power_W`.
-        Raises:
-            ValueError: If either required column is missing.
-        """
-        csv_path = Path(csv_path)
-        df = pd.read_csv(csv_path)
-        required = (cls.COL_TIME, cls.COL_POWER)
-        if any(col not in df.columns for col in required):
-            raise ValueError(
-                f"{csv_path} must have columns {cls.COL_TIME!r} and {cls.COL_POWER!r}. Got {list(df.columns)}"
-            )
-        times = df[cls.COL_TIME].to_numpy(float)
-        power = np.clip(df[cls.COL_POWER].to_numpy(float), 0.0, None)
-        out_of_order = np.any(np.diff(times) < 0)
-        if out_of_order:
-            order = np.argsort(times)
-            times = times[order]
-            power = power[order]
-        return cls(t_s=times, power_w=power)
-    @classmethod
-    def ensure(cls, csv_path: Path, params: TrainingTraceParams | None = None) -> TrainingTrace:
-        """Load from `csv_path`, generating and saving the trace first if the file is missing.
-        Args:
-            csv_path: Path to the training trace CSV.
-            params: Generation parameters, used only when no cached file
-                exists. Uses defaults if `None`.
-        """
-        csv_path = Path(csv_path)
-        if csv_path.exists():
-            return cls.load(csv_path)
-        logger.info("Generating training trace to %s ...", csv_path)
-        cls.generate(params).save(csv_path)
-        return cls.load(csv_path)

openg2g/events.py DELETED Viewed

@@ -1,60 +0,0 @@
-"""Clock-aligned simulation event primitives."""
-from __future__ import annotations
-from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Literal
-if TYPE_CHECKING:
-    from openg2g.clock import SimulationClock
-    from openg2g.coordinator import SimulationLog
-# Closed set of component-family labels accepted as `SimEvent.source`.
-EventSource = Literal["coordinator", "controller", "datacenter", "grid", "custom"]
-@dataclass(frozen=True)
-class SimEvent:
-    """Structured simulation event with canonical clock metadata.
-    The dataclass is frozen, so fields cannot be reassigned; the `data`
-    dict itself is still a regular mutable dict.
-    Attributes:
-        tick: Integer tick at which the event was emitted.
-        t_s: Simulation time in seconds.
-        source: Component family that emitted the event.
-        topic: Dot-separated event topic string.
-        data: Arbitrary key-value payload. Defaults to a fresh empty dict
-            per event.
-    """
-    tick: int
-    t_s: float
-    source: EventSource
-    topic: str
-    # default_factory gives each event its own dict rather than a shared one.
-    data: dict[str, Any] = field(default_factory=dict)
-@dataclass
-class EventEmitter:
-    """Helper bound to one source that builds [`SimEvent`][..SimEvent]
-    instances stamped with the current clock reading and forwards them
-    to the simulation log.
-    Attributes:
-        clock: Simulation clock for timestamping events.
-        log: `SimulationLog` that receives emitted events.
-        source: Component family label attached to all events.
-    """
-    clock: SimulationClock
-    log: SimulationLog
-    source: EventSource
-    def emit(self, topic: str, data: dict[str, Any] | None = None) -> None:
-        """Emit one event stamped with the clock's current time and step.
-        The payload is shallow-copied, so later changes to the caller's
-        dict do not alter the recorded event.
-        """
-        now_s = float(self.clock.time_s)
-        tick = int(self.clock.step)
-        topic_text = str(topic)
-        payload = dict(data) if data is not None else {}
-        event = SimEvent(tick=tick, t_s=now_s, source=self.source, topic=topic_text, data=payload)
-        self.log.emit(event)

openg2g/grid/__init__.py DELETED Viewed

File without changes

openg2g/grid/base.py DELETED Viewed

@@ -1,203 +0,0 @@
-"""Abstract base class for grid backends and grid-level types."""
-from __future__ import annotations
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from fractions import Fraction
-from typing import Generic, TypeVar, final
-import numpy as np
-from openg2g.clock import SimulationClock
-from openg2g.common import ThreePhase
-from openg2g.events import EventEmitter
-from openg2g.grid.command import GridCommand
-from openg2g.grid.config import TapPosition
-@dataclass(frozen=True)
-class PhaseVoltages:
-    """Per-phase voltage magnitudes in per-unit.
-    Phases missing from the bus have NaN for that field. The dataclass
-    is frozen, so fields cannot be reassigned after construction.
-    Attributes:
-        a: Phase A voltage magnitude (pu).
-        b: Phase B voltage magnitude (pu).
-        c: Phase C voltage magnitude (pu).
-    """
-    a: float
-    b: float
-    c: float
-@dataclass(frozen=True)
-class BusVoltages:
-    """Mapping-like view of voltages, keyed by bus name and then by phase.
-    Access: voltages["671"].a -> Vpu for bus 671, phase A.
-    Buses missing a phase have NaN for that field.
-    """
-    _data: dict[str, PhaseVoltages]
-    def __contains__(self, bus: str) -> bool:
-        return bus in self._data
-    def __getitem__(self, bus: str) -> PhaseVoltages:
-        return self._data[bus]
-    def buses(self) -> list[str]:
-        """Return the bus names as a new list, in the mapping's order."""
-        return list(self._data)
-@dataclass(frozen=True)
-class GridState:
-    """State emitted by the grid simulator each timestep.
-    The dataclass is frozen, so fields cannot be reassigned after
-    construction.
-    Attributes:
-        time_s: Simulation time in seconds.
-        voltages: Per-bus, per-phase voltage magnitudes.
-        tap_positions: Current regulator tap positions, or `None` if
-            no regulator is present.
-    """
-    time_s: float
-    voltages: BusVoltages
-    tap_positions: TapPosition | None = None
-# State type parameter for `GridBackend`; bound to `GridState` so backends
-# may emit a `GridState` subclass.
-GridStateT = TypeVar("GridStateT", bound=GridState)
-class GridBackend(Generic[GridStateT], ABC):
-    """Interface for grid simulation backends.
-    The base class keeps the latest state and the full state history.
-    The `do_*` wrappers maintain that bookkeeping and delegate to the
-    abstract hooks (`step`, `reset`) that subclasses implement.
-    """
-    # Unique marker proving that `GridBackend.__init__` ran on an instance.
-    _INIT_SENTINEL = object()
-    def __init__(self) -> None:
-        self._grid_base_init = GridBackend._INIT_SENTINEL
-        self._history: list[GridStateT] = []
-        self._state: GridStateT | None = None
-    def _check_base_init(self) -> None:
-        # Fail loudly when a subclass forgot to call super().__init__().
-        marker = getattr(self, "_grid_base_init", None)
-        if marker is GridBackend._INIT_SENTINEL:
-            return
-        raise TypeError(f"{type(self).__name__}.__init__ must call super().__init__().")
-    @property
-    @abstractmethod
-    def dt_s(self) -> Fraction:
-        """Native timestep as a Fraction (seconds)."""
-    @final
-    @property
-    def state(self) -> GridStateT:
-        """Most recently emitted state.
-        Raises:
-            RuntimeError: If accessed before the first `step()` call.
-        """
-        self._check_base_init()
-        latest = self._state
-        if latest is None:
-            raise RuntimeError(f"{type(self).__name__}.state accessed before first step().")
-        return latest
-    @final
-    def history(self, n: int | None = None) -> list[GridStateT]:
-        """Return a copy of the emitted states (all of them, or the latest `n`)."""
-        self._check_base_init()
-        if n is None:
-            return self._history.copy()
-        if n <= 0:
-            return []
-        tail_start = -int(n)
-        return self._history[tail_start:]
-    @final
-    def do_step(
-        self,
-        clock: SimulationClock,
-        power_samples_w: list[ThreePhase],
-        events: EventEmitter,
-    ) -> GridStateT:
-        """Run `step`, store its result as the latest state, append it to the history, and return it.
-        Called by the coordinator. Subclasses should not override this.
-        """
-        self._check_base_init()
-        new_state = self.step(clock, power_samples_w, events)
-        self._history.append(new_state)
-        self._state = new_state
-        return new_state
-    @abstractmethod
-    def step(
-        self,
-        clock: SimulationClock,
-        power_samples_w: list[ThreePhase],
-        events: EventEmitter,
-    ) -> GridStateT:
-        """Advance one native timestep and return state for this step."""
-    @abstractmethod
-    def apply_control(self, command: GridCommand, events: EventEmitter) -> None:
-        """Apply one control command."""
-    @abstractmethod
-    def voltages_vector(self) -> np.ndarray:
-        """Return voltage magnitudes in `v_index` order."""
-    @abstractmethod
-    def estimate_sensitivity(self, perturbation_kw: float = 100.0) -> tuple[np.ndarray, np.ndarray]:
-        """Estimate voltage sensitivity matrix (H = dv/dp) and return `(H, v0)`."""
-    @property
-    @abstractmethod
-    def v_index(self) -> list[tuple[str, int]]:
-        """Fixed (bus, phase) ordering used by [`voltages_vector`][..voltages_vector]."""
-    @final
-    def do_reset(self) -> None:
-        """Drop the latest state and the history, then call `reset`.
-        Called by the coordinator. Subclasses should not override this.
-        """
-        self._check_base_init()
-        self._state = None
-        self._history.clear()
-        self.reset()
-    @abstractmethod
-    def reset(self) -> None:
-        """Reset simulation state to initial conditions.
-        Called by the coordinator (via `do_reset`) before each
-        [`start`][..start]. Must clear all simulation state: counters,
-        cached values. Configuration (dt_s, case files, tap schedules)
-        is not affected. History is cleared automatically by
-        `do_reset`.
-        Abstract so every implementation explicitly enumerates its state.
-        A forgotten field is a bug -- not clearing it silently corrupts
-        the second run.
-        """
-    def start(self) -> None:
-        """Acquire per-run resources (solver circuits, connections).
-        Called after [`reset`][..reset], before the simulation loop.
-        Override for backends that need resource acquisition (e.g.,
-        [`OpenDSSGrid`][openg2g.grid.opendss.OpenDSSGrid] compiles its
-        DSS circuit here). No-op by default because most offline
-        components have no resources to acquire.
-        """
-    def stop(self) -> None:
-        """Release per-run resources. Simulation state is preserved.
-        Called after the simulation loop in LIFO order. Override for
-        backends that acquired resources in [`start`][..start]. No-op
-        by default.
-        """

openg2g/grid/command.py DELETED Viewed

@@ -1,31 +0,0 @@
-"""Command types targeting grid backends."""
-from __future__ import annotations
-from dataclasses import dataclass
-from openg2g.grid.config import TapPosition
-class GridCommand:
-    """Base for commands targeting the grid backend.
-    Subclass this for each concrete grid command kind.
-    The coordinator routes commands to backends based on this type hierarchy.
-    Raises:
-        TypeError: If `GridCommand` itself is instantiated.
-    """
-    def __init__(self) -> None:
-        # Exact-type check: only the bare base class is rejected, subclasses
-        # that reach this initializer pass through.
-        if type(self) is GridCommand:
-            raise TypeError("GridCommand cannot be instantiated directly; subclass it.")
-@dataclass(frozen=True)
-class SetTaps(GridCommand):
-    """Set regulator tap positions.
-    The dataclass is frozen, so the command cannot be modified after
-    construction.
-    Attributes:
-        tap_position: Per-phase tap ratios. Phases set to `None` are
-            unchanged.
-    """
-    tap_position: TapPosition

openg2g/grid/config.py DELETED Viewed

@@ -1,92 +0,0 @@
-"""Grid configuration and schedule types."""
-from __future__ import annotations
-from collections.abc import Iterator
-from dataclasses import dataclass
-@dataclass(frozen=True)
-class TapPosition:
-    """Per-phase regulator tap setting, expressed as per-unit tap ratios.
-    Each field holds the tap ratio of the regulator on that phase.
-    A phase left as `None` is not changed when the position is applied.
-    At least one phase must be given.
-    Combine with [`at`][.at] and `|` to build a [`TapSchedule`][..TapSchedule]:
-    ```python
-    TAP_STEP = 0.00625  # standard 5/8% tap step
-    schedule = (
-        TapPosition(a=1.0 + 14 * TAP_STEP, b=1.0 + 6 * TAP_STEP, c=1.0 + 15 * TAP_STEP).at(t=0)
-        | TapPosition(a=1.1).at(t=1500)
-        | TapPosition(a=1.0625, c=1.0625).at(t=3300)
-    )
-    ```
-    Raises:
-        ValueError: If all three phases are `None`.
-    """
-    a: float | None = None
-    b: float | None = None
-    c: float | None = None
-    def __post_init__(self) -> None:
-        ratios = (self.a, self.b, self.c)
-        if all(ratio is None for ratio in ratios):
-            raise ValueError("TapPosition requires at least one phase (a, b, or c).")
-    def at(self, t: float) -> TapSchedule:
-        """Return a one-entry schedule that applies this position at time `t` seconds."""
-        entry = (t, self)
-        return TapSchedule((entry,))
-class TapSchedule:
-    """Ordered sequence of scheduled tap positions.
-    Entries are stored sorted by timestamp, regardless of the order in
-    which they were supplied or combined.
-    Build using [`TapPosition.at`][..TapPosition.at] and the `|` operator:
-    ```python
-    TAP_STEP = 0.00625  # standard 5/8% tap step
-    schedule = (
-        TapPosition(a=1.0 + 14 * TAP_STEP, b=1.0 + 6 * TAP_STEP, c=1.0 + 15 * TAP_STEP).at(t=0)
-        | TapPosition(a=1.0 + 16 * TAP_STEP).at(t=25 * 60)
-    )
-    ```
-    Raises:
-        ValueError: If two entries share the same timestamp.
-    """
-    __slots__ = ("_entries",)
-    def __init__(self, entries: tuple[tuple[float, TapPosition], ...]) -> None:
-        # Sort on the timestamp only, so positions are never compared.
-        self._entries = tuple(sorted(entries, key=lambda e: e[0]))
-        times = [t for t, _ in self._entries]
-        # After sorting, equal timestamps sit next to each other.
-        dupes = sorted({earlier for earlier, later in zip(times, times[1:]) if earlier == later})
-        if dupes:
-            raise ValueError(f"TapSchedule has duplicate timestamps: {dupes}")
-    def __or__(self, other: TapSchedule) -> TapSchedule:
-        """Merge two schedules; raises `ValueError` if they share a timestamp."""
-        if not isinstance(other, TapSchedule):
-            # Let Python try the reflected operation and raise a proper
-            # TypeError instead of failing on a missing `_entries` attribute.
-            return NotImplemented
-        return TapSchedule(self._entries + other._entries)
-    def __iter__(self) -> Iterator[tuple[float, TapPosition]]:
-        """Iterate over `(time_s, position)` pairs in ascending time order."""
-        return iter(self._entries)
-    def __len__(self) -> int:
-        return len(self._entries)
-    def __bool__(self) -> bool:
-        return bool(self._entries)
-    def __repr__(self) -> str:
-        if not self._entries:
-            # An empty string is a useless repr; show a constructor call.
-            return "TapSchedule(())"
-        parts: list[str] = []
-        for t, p in self._entries:
-            # Only phases that are actually set appear in the output.
-            fields = [f"{phase}={getattr(p, phase)}" for phase in ("a", "b", "c") if getattr(p, phase) is not None]
-            parts.append(f"TapPosition({', '.join(fields)}).at(t={t})")
-        return " | ".join(parts)

openg2g/grid/opendss.py DELETED Viewed

@@ -1,476 +0,0 @@
-"""OpenDSS-based grid simulator."""
-from __future__ import annotations
-import functools
-import logging
-import math
-from fractions import Fraction
-from pathlib import Path
-from typing import TYPE_CHECKING, Literal
-import numpy as np
-from openg2g.clock import SimulationClock
-from openg2g.common import ThreePhase
-from openg2g.events import EventEmitter
-from openg2g.grid.base import BusVoltages, GridBackend, GridState, PhaseVoltages
-from openg2g.grid.command import GridCommand, SetTaps
-from openg2g.grid.config import TapPosition
-if TYPE_CHECKING:
-    from opendssdirect import dss
-else:
-    try:
-        from opendssdirect.OpenDSSDirect import OpenDSSDirect
-        dss = OpenDSSDirect(prefer_lists=False)
-    except ImportError:
-        dss = None
-logger = logging.getLogger(__name__)
-# Phase numbers as they appear in OpenDSS node names (e.g. `"650.1"`).
-_PHASES = (1, 2, 3)
-# Display letter for each phase number (used in log messages).
-_PHASE_NAME = {1: "A", 2: "B", 3: "C"}
-# `TapPosition` attribute name for each phase number.
-_PHASE_TO_ATTR = {1: "a", 2: "b", 3: "c"}
-# Names of the three single-phase datacenter loads, in phase order 1, 2, 3.
-_DC_LOAD_NAMES = ("DataCenterA", "DataCenterB", "DataCenterC")
-class OpenDSSGrid(GridBackend[GridState]):
-    """OpenDSS-based grid simulator for distribution-level voltage analysis.
-    !!! Info
-        `OpenDSSDirect.py` is required to use this component.
-        Install with: `pip install openg2g[opendss]`.
-    This component uses OpenDSS purely as a power flow solver. The user's DSS
-    case file defines the network topology and any built-in controls (voltage
-    regulators, capacitor banks, etc.). The `dss_controls` flag determines
-    whether OpenDSS iterates those controls during each solve:
-    - `dss_controls=False` (default): Uses `SolveNoControl()`. OpenDSS runs
-      a single power flow without iterating any built-in control loops.
-      RegControls are disabled after initial tap setting. All voltage
-      regulation is managed externally through
-      [`apply_control`][.apply_control] commands (e.g., from
-      [`TapScheduleController`][openg2g.controller.tap_schedule.TapScheduleController]
-      or
-      [`OFOBatchSizeController`][openg2g.controller.ofo.OFOBatchSizeController]).
-    - `dss_controls=True`: Uses `Solve()`. OpenDSS iterates its built-in
-      control loops (RegControls, CapControls, etc.) as defined in the case
-      file. Use this when you want DSS-native control automation.
-    Args:
-        dss_case_dir: Absolute path to the directory containing OpenDSS case
-            files (e.g. line codes, bus coordinates).
-        dss_master_file: Name of the master DSS file, relative to
-            `dss_case_dir` (e.g. `"IEEE13Nodeckt.dss"`). OpenDSS resolves
-            all `redirect` and `BusCoords` paths in the master file
-            relative to this directory.
-        dc_bus: Bus name where the datacenter is connected.
-        dc_bus_kv: Line-to-line voltage (kV) at the datacenter bus.
-        power_factor: Power factor of the datacenter loads.
-        dt_s: Grid simulation timestep (seconds).
-        connection_type: Connection type for DC loads (default `"wye"`).
-        dss_controls: Whether to let OpenDSS iterate its built-in control
-            loops during each solve. Default False.
-        initial_tap_position: Initial regulator tap position applied before
-            the first solve. Each field is a per-unit tap ratio.
-        exclude_buses: Buses to exclude from voltage indexing (e.g., source bus).
-    """
-    def __init__(
-        self,
-        *,
-        dss_case_dir: str | Path,
-        dss_master_file: str,
-        dc_bus: str,
-        dc_bus_kv: float,
-        power_factor: float,
-        dt_s: Fraction = Fraction(1),
-        connection_type: Literal["wye", "delta"] = "wye",
-        dss_controls: bool = False,
-        initial_tap_position: TapPosition | None = None,
-        exclude_buses: tuple[str, ...] = ("rg60",),
-    ) -> None:
-        # This constructor only stores configuration; it issues no OpenDSS
-        # commands. The circuit is compiled by `start()`.
-        super().__init__()
-        if dss is None:
-            raise RuntimeError("OpenDSSDirect is required. Install with: pip install openg2g[opendss]")
-        self._case_dir = str(Path(dss_case_dir).resolve())
-        self._master = str(dss_master_file)
-        self._dc_bus = str(dc_bus)
-        self._dc_bus_kv = float(dc_bus_kv)
-        self._power_factor = float(power_factor)
-        # Clamp the power factor into [1e-6, 0.999999] before deriving
-        # tan(phi), the kvar-per-kW ratio applied to the datacenter loads.
-        # NOTE(review): a unity power factor is clamped as well, so kvar is
-        # never exactly zero for a non-zero kW -- confirm this is intended.
-        pf = max(min(self._power_factor, 0.999999), 1e-6)
-        self._tanphi = math.tan(math.acos(pf))
-        self._dt_s = dt_s
-        self._connection_type: Literal["wye", "delta"] = connection_type
-        self._dss_controls = bool(dss_controls)
-        self._initial_tap_position = initial_tap_position
-        # Regulator lookups stay None until the circuit has been compiled.
-        self._reg_map: dict[str, tuple[str, int, int]] | None = None
-        self._phase_to_reg: dict[int, str] | None = None
-        self._exclude_buses = tuple(str(b) for b in exclude_buses)
-        # Simulation state (cleared by reset)
-        self._prev_power: ThreePhase | None = None
-        # DSS-derived data (populated by start)
-        self._started = False
-        self.all_buses: list[str] = []
-        self.buses_with_phase: dict[int, list[str]] = {}
-        self._v_index: list[tuple[str, int]] = []
-    @property
-    def dt_s(self) -> Fraction:
-        """Grid simulation timestep in seconds, as passed to the constructor."""
-        return self._dt_s
-    @property
-    def v_index(self) -> list[tuple[str, int]]:
-        """Return a copy of the fixed `(bus, phase)` ordering built by `start()`.
-        Raises:
-            RuntimeError: If accessed before `start()`.
-        """
-        if self._started:
-            return list(self._v_index)
-        raise RuntimeError("OpenDSSGrid.v_index accessed before start().")
-    def step(
-        self,
-        clock: SimulationClock,
-        power_samples_w: list[ThreePhase],
-        events: EventEmitter,
-    ) -> GridState:
-        """Run one power flow solve for the current grid period.
-        Only the newest entry of `power_samples_w` is applied. When the list
-        is empty (grid runs faster than datacenter), the power applied by the
-        previous call is reused.
-        Args:
-            clock: Current simulation clock.
-            power_samples_w: List of
-                [`ThreePhase`][openg2g.common.ThreePhase] power samples
-                (Watts) accumulated since the last grid step.
-            events: Event emitter; not used by this method.
-        Returns:
-            [`GridState`][openg2g.grid.base.GridState] with the voltages and
-                regulator taps read back after the solve.
-        Raises:
-            RuntimeError: If no samples are given and no earlier power exists.
-        """
-        if power_samples_w:
-            power = power_samples_w[-1]
-        elif self._prev_power is not None:
-            power = self._prev_power
-        else:
-            raise RuntimeError("OpenDSSGrid.step() called with no power samples and no previous power.")
-        self._prev_power = power
-        # Watts -> kW, one value per single-phase datacenter load.
-        phase_kw = (power.a / 1e3, power.b / 1e3, power.c / 1e3)
-        for load_name, load_kw in zip(_DC_LOAD_NAMES, phase_kw, strict=True):
-            dss.Loads.Name(load_name)  # select the load the next two calls act on
-            dss.Loads.kW(load_kw)
-            dss.Loads.kvar(load_kw * self._tanphi)
-        self._solve()
-        bus_voltages = self._snapshot_bus_voltages()
-        return GridState(
-            time_s=clock.time_s,
-            voltages=bus_voltages,
-            tap_positions=self._read_current_taps(),
-        )
-    @functools.singledispatchmethod
-    def apply_control(self, command: GridCommand, events: EventEmitter) -> None:
-        """Apply a control command. Dispatches on command type.
-        This body is the fallback for command types without a registered
-        handler.
-        Raises:
-            TypeError: Always, naming the unsupported command type.
-        """
-        raise TypeError(f"OpenDSSGrid does not support {type(command).__name__}")
-    @apply_control.register
-    def apply_control_set_taps(self, command: SetTaps, events: EventEmitter) -> None:
-        """Handle `SetTaps`: write the requested taps, then emit `grid.taps.updated`."""
-        # Translate per-phase ratios into a RegControl-name -> ratio mapping.
-        tap_map = self._tap_position_to_reg_dict(command.tap_position)
-        self._set_reg_taps(tap_map)
-        events.emit(
-            "grid.taps.updated",
-            {"tap_position": command.tap_position},
-        )
-    def reset(self) -> None:
-        """Forget the last applied power and mark the grid as not started.
-        Cached circuit data (bus lists, index arrays, regulator maps) is not
-        touched by this method.
-        """
-        self._prev_power = None
-        self._started = False
-    def start(self) -> None:
-        """Compile the circuit, build the voltage index structures, and mark the grid started."""
-        self._init_dss()
-        # `_v_index` must exist before `_build_vmag_indices`, which reads it.
-        self._v_index = self._build_v_index()
-        self._build_vmag_indices()
-        self._build_snapshot_indices()
-        self._started = True
-        logger.info(
-            "OpenDSSGrid: case=%s, dc_bus=%s, dt=%s s, dss_controls=%s, %d buses, %d bus-phase pairs",
-            self._master,
-            self._dc_bus,
-            self._dt_s,
-            self._dss_controls,
-            len(self.all_buses),
-            len(self._v_index),
-        )
-    def voltages_vector(self) -> np.ndarray:
-        """Read the current voltage magnitudes (pu), ordered like
-        [`v_index`][openg2g.grid.base.GridBackend.v_index].
-        Raises:
-            RuntimeError: If called before `start()`.
-        """
-        if self._started:
-            all_vmag = dss.Circuit.AllBusMagPu()
-            # Fancy-index the bulk read down to the tracked bus-phase pairs.
-            return all_vmag[self._v_index_to_vmag]
-        raise RuntimeError("OpenDSSGrid.voltages_vector() called before start().")
-    def estimate_sensitivity(
-        self,
-        perturbation_kw: float = 100.0,
-    ) -> tuple[np.ndarray, np.ndarray]:
-        """Estimate voltage sensitivity matrix H = dv/dp (pu per kW).
-        Uses finite differences on the 3 single-phase DC loads: each load is
-        raised by `perturbation_kw` (with kvar raised by the matching
-        `perturbation_kw * tan(phi)`), the circuit is re-solved without
-        controls, and the load is restored before the next one is perturbed.
-        Args:
-            perturbation_kw: Size of the active-power step applied to each
-                load (kW). Must be positive.
-        Returns:
-            Tuple of `(sensitivity, baseline_voltages)`.
-                `sensitivity` has shape `(M, 3)` where M is the number
-                of bus-phase pairs in
-                [`v_index`][openg2g.grid.base.GridBackend.v_index].
-                `baseline_voltages` has shape `(M,)`.
-        Raises:
-            ValueError: If `perturbation_kw` is not positive.
-            RuntimeError: If called before `start()` (raised by
-                `voltages_vector`).
-        """
-        perturbation_kw = float(perturbation_kw)
-        if perturbation_kw <= 0:
-            raise ValueError("perturbation_kw must be positive.")
-        dq_kvar = perturbation_kw * self._tanphi
-        # Always use SolveNoControl so that DSS-native controls
-        # (RegControls, CapControls) don't move between the baseline
-        # and perturbed solves. We need the open-loop plant sensitivity
-        # dv/dp, not the closed-loop response.
-        dss.Solution.SolveNoControl()
-        baseline_voltages = self.voltages_vector()
-        # Baseline P, Q for each DC load
-        p0 = np.zeros(3, dtype=float)
-        q0 = np.zeros(3, dtype=float)
-        for j, ld in enumerate(_DC_LOAD_NAMES):
-            dss.Loads.Name(ld)
-            p0[j] = float(dss.Loads.kW())
-            q0[j] = float(dss.Loads.kvar())
-        M = len(self._v_index)
-        sensitivity = np.zeros((M, 3), dtype=float)
-        for j, ld in enumerate(_DC_LOAD_NAMES):
-            dss.Text.Command(f"Edit Load.{ld} kW={p0[j] + perturbation_kw:.6f} kvar={q0[j] + dq_kvar:.6f}")
-            try:
-                dss.Solution.SolveNoControl()
-                sensitivity[:, j] = (self.voltages_vector() - baseline_voltages) / perturbation_kw
-            finally:
-                # Restore the load to baseline before the next perturbation,
-                # even when the perturbed solve fails, so the circuit is
-                # never left with a perturbed load.
-                dss.Text.Command(f"Edit Load.{ld} kW={p0[j]:.6f} kvar={q0[j]:.6f}")
-        # Re-solve with all loads restored (use normal solve to leave
-        # DSS in its expected state for subsequent step() calls)
-        self._solve()
-        return sensitivity, baseline_voltages
-    def _init_dss(self) -> None:
-        """Compile the case, add the datacenter loads, configure the solver, and run a first solve."""
-        dss.Basic.ClearAll()
-        master_path = str(Path(self._case_dir) / self._master)
-        dss.Text.Command(f'Compile "{master_path}"')
-        # Regulator maps are built right after compiling so the initial tap
-        # position further down can be translated to RegControl names.
-        self._reg_map = self._cache_regcontrol_map()
-        self._phase_to_reg = self._build_phase_to_reg_map(self._reg_map)
-        # Add 3 single-phase DC loads
-        # `dc_bus_kv` is line-to-line: a wye load is rated at that value
-        # divided by sqrt(3), a delta load at the value itself.
-        if self._connection_type == "wye":
-            load_kv = self._dc_bus_kv / math.sqrt(3.0)
-        elif self._connection_type == "delta":
-            load_kv = self._dc_bus_kv
-        else:
-            raise ValueError(f"Unsupported connection_type: {self._connection_type!r}")
-        for ph, nm in zip(_PHASES, _DC_LOAD_NAMES, strict=True):
-            # Loads start at zero power; `step()` sets kW/kvar each period.
-            dss.Text.Command(
-                f"New Load.{nm} bus1={self._dc_bus}.{ph} phases=1 "
-                f"conn={self._connection_type} kV={load_kv:.6f} kW=0 kvar=0 model=1"
-            )
-        dss.Text.Command("Reset")
-        dss.Text.Command("Set Mode=Time")
-        dss.Text.Command(f"Set Stepsize={float(self._dt_s)}s")
-        if self._dss_controls:
-            dss.Text.Command("Set ControlMode=Time")
-        else:
-            dss.Text.Command("Set ControlMode=Off")
-        if self._initial_tap_position is not None:
-            self._set_reg_taps(self._tap_position_to_reg_dict(self._initial_tap_position))
-        self._solve()
-        # `_cache_buses_with_phases` reads `_node_map`, so the node map is
-        # cached first.
-        self._cache_node_map()
-        self._cache_buses_with_phases()
-    def _solve(self) -> None:
-        """Solve the power flow, iterating DSS controls only when `dss_controls` is set."""
-        solution = dss.Solution
-        if not self._dss_controls:
-            solution.SolveNoControl()
-            return
-        solution.Solve()
-    def _cache_buses_with_phases(self) -> None:
-        """Fill `all_buses` from the circuit and group `_node_map` buses by phase."""
-        self.all_buses = list(dss.Circuit.AllBusNames())
-        grouped: dict[int, list[str]] = {ph: [] for ph in _PHASES}
-        for bus, phase in self._node_map:
-            # Nodes whose phase is not one of `_PHASES` are ignored.
-            if phase in grouped:
-                grouped[phase].append(bus)
-        self.buses_with_phase = grouped
-    def _cache_node_map(self) -> None:
-        """Record, for every entry of `AllNodeNames()`, its `(bus, phase)` pair.
-        A node name without a `.phase` suffix is stored with phase 0.
-        """
-        self._node_map: list[tuple[str, int]] = []
-        for node_name in dss.Circuit.AllNodeNames():
-            bus, *suffix = node_name.split(".")
-            self._node_map.append((bus, int(suffix[0]) if suffix else 0))
-    def _build_vmag_indices(self) -> None:
-        """Build the integer array that maps each `_v_index` entry to its position in `_node_map`."""
-        position: dict[tuple[str, int], int] = {}
-        for i, (bus, ph) in enumerate(self._node_map):
-            position[(bus, ph)] = i
-        lookup = [position[(bus, ph)] for bus, ph in self._v_index]
-        self._v_index_to_vmag = np.array(lookup, dtype=int)
-    def _build_snapshot_indices(self) -> None:
-        """Build the lookup table used by `_snapshot_bus_voltages`.
-        The result is a `(num_buses, 3)` integer array: entry `[b, p]` holds
-        the `_node_map` position of bus `b`, phase `p+1`, and stays -1 when
-        that bus-phase pair does not exist.
-        """
-        row_of = {bus: i for i, bus in enumerate(self.all_buses)}
-        # Start with every slot marked missing (-1).
-        table = np.full((len(self.all_buses), 3), -1, dtype=int)
-        for vmag_idx, (bus, phase) in enumerate(self._node_map):
-            if not 1 <= phase <= 3 or bus not in row_of:
-                continue
-            table[row_of[bus], phase - 1] = vmag_idx
-        self._snap_indices = table
-    def _snapshot_bus_voltages(self) -> BusVoltages:
-        """Read all voltage magnitudes once and arrange them per bus and phase.
-        Relies on the index table from `_build_snapshot_indices`; slots marked
-        -1 there come out as NaN.
-        """
-        # A trailing NaN makes index -1 resolve to NaN.
-        padded = np.append(dss.Circuit.AllBusMagPu(), float("nan"))
-        per_bus = padded[self._snap_indices]
-        data = {}
-        for i, bus in enumerate(self.all_buses):
-            data[bus] = PhaseVoltages(
-                a=float(per_bus[i, 0]),
-                b=float(per_bus[i, 1]),
-                c=float(per_bus[i, 2]),
-            )
-        return BusVoltages(_data=data)
-    def _build_v_index(self) -> list[tuple[str, int]]:
-        """List `(bus, phase)` pairs phase by phase, skipping excluded buses (case-insensitive)."""
-        skipped = {name.lower() for name in self._exclude_buses}
-        return [
-            (str(bus), int(ph))
-            for ph in _PHASES
-            for bus in self.buses_with_phase.get(ph, [])
-            if str(bus).lower() not in skipped
-        ]
-    @staticmethod
-    def _cache_regcontrol_map() -> dict[str, tuple[str, int, int]]:
-        """Enumerate RegControls and discover their transformer, winding, and phase.
-        Returns:
-            Mapping of `rc_name -> (transformer_name, winding, phase)` where
-                phase is 1/2/3 for A/B/C. Phase is determined from the
-                transformer's bus connections (e.g., `"650.1"` -> phase 1).
-        Raises:
-            RuntimeError: If no phase in {1, 2, 3} can be read from the
-                transformer's bus names.
-        """
-        reg_map: dict[str, tuple[str, int, int]] = {}
-        for rc in dss.RegControls:
-            # Keys are lower-cased; `_set_reg_taps` lower-cases its lookups too.
-            rc_name = rc.Name().lower()
-            xf = rc.Transformer()
-            w = int(rc.Winding())
-            # Discover phase from transformer bus connections
-            dss.Transformers.Name(xf)
-            bus_names = list(dss.CktElement.BusNames())
-            phase = 0
-            # The first bus name that carries a `.node` suffix decides the phase.
-            for bus_str in bus_names:
-                parts = str(bus_str).split(".")
-                if len(parts) >= 2:
-                    phase = int(parts[1])
-                    break
-            if phase not in (1, 2, 3):
-                raise RuntimeError(
-                    f"Cannot determine phase for RegControl '{rc_name}' "
-                    f"(transformer={xf}, buses={bus_names}). "
-                    f"Expected bus format 'name.phase' with phase in {{1,2,3}}."
-                )
-            reg_map[rc_name] = (xf, w, phase)
-        return reg_map
-    @staticmethod
-    def _build_phase_to_reg_map(reg_map: dict[str, tuple[str, int, int]]) -> dict[int, str]:
-        """Invert `reg_map` into `phase -> RegControl name`.
-        When several RegControls sit on the same phase, a warning is logged
-        and the one encountered last wins.
-        """
-        by_phase: dict[int, str] = {}
-        for rc_name, (_xf, _wdg, phase) in reg_map.items():
-            earlier = by_phase.get(phase)
-            if earlier is not None:
-                logger.warning(
-                    "Multiple RegControls on phase %s: '%s' and '%s'. Using '%s'.",
-                    _PHASE_NAME[phase],
-                    earlier,
-                    rc_name,
-                    rc_name,
-                )
-            by_phase[phase] = rc_name
-        return by_phase
-    def _tap_position_to_reg_dict(self, pos: TapPosition) -> dict[str, float]:
-        """Translate per-phase tap ratios into a `RegControl name -> ratio` mapping.
-        Phases that are unset on `pos`, or that have no known regulator, are
-        left out.
-        Raises:
-            RuntimeError: If the phase-to-regulator map has not been built yet.
-        """
-        phase_to_reg = self._phase_to_reg
-        if phase_to_reg is None:
-            raise RuntimeError("_phase_to_reg not initialized; call start() first")
-        by_reg: dict[str, float] = {}
-        for phase, attr in _PHASE_TO_ATTR.items():
-            ratio = getattr(pos, attr)
-            if ratio is None or phase not in phase_to_reg:
-                continue
-            by_reg[phase_to_reg[phase]] = ratio
-        return by_reg
-    def _set_reg_taps(self, tap_map: dict[str, float]) -> None:
-        """Write the given tap ratios to the transformers behind each RegControl.
-        Names in `tap_map` are matched case-insensitively; names that match no
-        known RegControl are ignored.
-        """
-        if self._reg_map is None:
-            self._reg_map = self._cache_regcontrol_map()
-        requested = {str(name).lower(): float(ratio) for name, ratio in tap_map.items()}
-        for rc_key, (xfmr, wdg, _phase) in self._reg_map.items():
-            tap_pu = requested.get(rc_key)
-            if tap_pu is None:
-                continue
-            dss.Text.Command(f"Edit Transformer.{xfmr} Wdg={wdg} Tap={tap_pu:.6f}")
-    def _read_current_taps(self) -> TapPosition:
-        """Read current regulator tap positions from OpenDSS.
-        Returns:
-            A `TapPosition` holding the tap read for each phase that has a
-                regulator; phases without one stay `None`.
-        """
-        # Build the regulator lookups on demand if they are not cached yet.
-        if self._reg_map is None:
-            self._reg_map = self._cache_regcontrol_map()
-        if self._phase_to_reg is None:
-            self._phase_to_reg = self._build_phase_to_reg_map(self._reg_map)
-        phase_taps: dict[str, float | None] = {"a": None, "b": None, "c": None}
-        # NOTE(review): with an empty regulator map all three phases stay
-        # None -- check how TapPosition treats that case.
-        for _rc_key, (xfmr, wdg, phase) in self._reg_map.items():
-            # Select the transformer and winding, then read its tap.
-            dss.Transformers.Name(xfmr)
-            dss.Transformers.Wdg(wdg)
-            attr = _PHASE_TO_ATTR.get(phase)
-            if attr is not None:
-                # If several regulators share a phase, the last one read wins.
-                phase_taps[attr] = float(dss.Transformers.Tap())
-        return TapPosition(a=phase_taps["a"], b=phase_taps["b"], c=phase_taps["c"])

openg2g/metrics/__init__.py DELETED Viewed

File without changes

openg2g/metrics/voltage.py DELETED Viewed

@@ -1,94 +0,0 @@
-"""Voltage violation metrics for all-bus, all-phase analysis."""
-from __future__ import annotations
-from dataclasses import dataclass
-import numpy as np
-from openg2g.grid.base import GridState
-@dataclass
-class VoltageStats:
-    """Summary voltage statistics over a simulation run.
-    Returned by `compute_allbus_voltage_stats`.
-    Attributes:
-        worst_vmin: Lowest voltage observed across all buses and phases (pu).
-        worst_vmax: Highest voltage observed across all buses and phases (pu).
-        violation_time_s: Total time with at least one bus-phase violating
-            voltage bounds (seconds).
-        integral_violation_pu_s: Integrated voltage violation magnitude
-            across all bus-phase pairs (pu * s).
-    """
-    worst_vmin: float
-    worst_vmax: float
-    violation_time_s: float
-    integral_violation_pu_s: float
-def compute_allbus_voltage_stats(
-    grid_states: list[GridState],
-    *,
-    v_min: float = 0.95,
-    v_max: float = 1.05,
-    exclude_buses: tuple[str, ...] = ("rg60",),
-) -> VoltageStats:
-    """Summarise voltage-bound violations over a run, across every bus and phase.
-    Each snapshot contributes the sum of
-    `max(v_min - v, 0) + max(v - v_max, 0)` over all non-excluded bus-phase
-    pairs (NaN entries contribute nothing). A snapshot is "violated" when
-    that sum is positive. Time integration uses a single step size: the
-    median spacing between consecutive snapshot times.
-    Args:
-        grid_states: Sequence of [`GridState`][openg2g.grid.base.GridState]
-            objects from a simulation run.
-        v_min: Lower voltage bound (pu).
-        v_max: Upper voltage bound (pu).
-        exclude_buses: Bus names to exclude from statistics (case-insensitive).
-    Raises:
-        ValueError: If fewer than two grid states are given.
-    """
-    n_states = len(grid_states)
-    if n_states < 2:
-        raise ValueError(
-            f"At least two grid states are required to compute voltage statistics (got {n_states})."
-        )
-    timestamps = np.array([gs.time_s for gs in grid_states], dtype=float)
-    dt = float(np.median(np.diff(timestamps)))
-    # The bus list is taken from the first snapshot and reused for all
-    # later ones.
-    skipped = {name.lower() for name in exclude_buses}
-    bus_names = [b for b in grid_states[0].voltages.buses() if b.lower() not in skipped]
-    # (T, N) voltage matrix with three consecutive columns (a, b, c) per bus.
-    V = np.empty((n_states, 3 * len(bus_names)), dtype=float)
-    for row, gs in enumerate(grid_states):
-        for k, bus in enumerate(bus_names):
-            pv = gs.voltages[bus]
-            V[row, 3 * k] = pv.a
-            V[row, 3 * k + 1] = pv.b
-            V[row, 3 * k + 2] = pv.c
-    present = ~np.isnan(V)
-    worst_vmin = float(np.where(present, V, np.inf).min())
-    worst_vmax = float(np.where(present, V, -np.inf).max())
-    # Violation per entry, then summed over all bus-phase pairs per snapshot.
-    under = np.maximum(v_min - V, 0.0)
-    over = np.maximum(V - v_max, 0.0)
-    viol_sum = np.sum(np.where(present, under + over, 0.0), axis=1)  # shape (T,)
-    violated_steps = int(np.count_nonzero(viol_sum > 0.0))
-    return VoltageStats(
-        worst_vmin=worst_vmin,
-        worst_vmax=worst_vmax,
-        violation_time_s=float(violated_steps * dt),
-        integral_violation_pu_s=float(np.sum(viol_sum * dt)),
-    )

openg2g/utils.py DELETED Viewed

@@ -1,18 +0,0 @@
-"""Shared utility functions."""
-from __future__ import annotations
-def split_integer_evenly(n: int, k: int) -> list[int]:
-    """Split integer *n* into *k* non-negative integers whose sum is *n*,
-    differing by at most 1.
-    The larger parts come first.
-    Example:
-    ```python
-    split_integer_evenly(10, 3)  # -> [4, 3, 3]
-    split_integer_evenly(2, 5)   # -> [1, 1, 0, 0, 0]
-    ```
-    Raises:
-        ValueError: If *n* is negative or *k* is not positive; no list of
-            non-negative parts summing to *n* exists in those cases.
-    """
-    n, k = int(n), int(k)
-    if k <= 0:
-        raise ValueError(f"k must be a positive integer (got {k}).")
-    if n < 0:
-        raise ValueError(f"n must be non-negative (got {n}).")
-    q, r = divmod(n, k)
-    # The first r parts absorb the remainder, one unit each.
-    return [q + 1] * r + [q] * (k - r)

pyproject.toml CHANGED Viewed

@@ -3,8 +3,8 @@ requires = ["setuptools>=64"]
 build-backend = "setuptools.build_meta"
 [project]
-name = "openg2g"
-dynamic = ["version"]
 description = "A GPU-to-Grid simulation library for datacenter-grid cooperation."
 requires-python = ">=3.10"
 license = "Apache-2.0"
@@ -16,6 +16,8 @@ dependencies = [
     "aiohttp",
     "zeus>=0.15.0",
     "mlenergy-data",
 ]
 [project.urls]
@@ -38,11 +40,11 @@ dev = [
     {include-group = "examples"},
 ]
-[tool.setuptools.dynamic]
-version = {attr = "openg2g.__version__"}
 [tool.setuptools.packages.find]
-include = ["openg2g*"]
 [tool.ruff]
 target-version = "py310"

 build-backend = "setuptools.build_meta"
 [project]
+name = "bus-system-backend"
+version = "0.1.0"
 description = "A GPU-to-Grid simulation library for datacenter-grid cooperation."
 requires-python = ">=3.10"
 license = "Apache-2.0"
     "aiohttp",
     "zeus>=0.15.0",
     "mlenergy-data",
+    "openg2g[opendss]",
 ]
 [project.urls]
     {include-group = "examples"},
 ]
 [tool.setuptools.packages.find]
+where = ["."]
+include = ["your_project_name*"]
+exclude = ["data*", "outputs*", "scripts*", "tests*"]
 [tool.ruff]
 target-version = "py310"

server.py CHANGED Viewed

@@ -33,29 +33,25 @@ from  openg2g.grid.config import TapPosition
 from  openg2g.controller.tap_schedule import TapScheduleController
 from  openg2g.metrics.voltage import compute_allbus_voltage_stats
-import asyncio, uuid, time
-from concurrent.futures import ProcessPoolExecutor
-import sqlite3, json
-conn = sqlite3.connect("jobs.db", check_same_thread=False, timeout=30)
-conn.execute("PRAGMA journal_mode=WAL;")
-# create table to track background simulation jobs
-conn.execute("""
-CREATE TABLE IF NOT EXISTS jobs (
-    id TEXT PRIMARY KEY,
-    status TEXT,
-    result TEXT,
-    error TEXT
-)
-""")
-conn.commit()
 #currently set to 2 for free tier at hf
 _pool        = ProcessPoolExecutor(max_workers=2)
-_jobs: dict  = {}
 _start_time  = time.time()
@@ -134,7 +130,7 @@ def _get_trace_power(model_label: str, num_gpus: int, max_num_seqs: int,
     return [p * num_replicas for p in power_W]
-print(f"  [startup] data dir: {_DATA_DIR}  exists={_DATA_DIR.exists()}")
 _load_traces_index()  # load at startup
@@ -330,7 +326,7 @@ def _run_full(req_dict: dict) -> dict:
 """Get per-bus voltage (worst phase per bus)."""
-def _voltages(gs, debug=False) -> list[float]:
     result = []
     for name in BUSES_ORDERED:
         try:
@@ -338,13 +334,13 @@ def _voltages(gs, debug=False) -> list[float]:
             vals = [float(v) for v in [tp.a, tp.b, tp.c]
                     if not math.isnan(float(v)) and 0.5 < float(v) < 1.5]
             result.append(min(vals) if vals else None)
-        except Exception:
             result.append(None)
     known = [v for v in result if v is not None]
     avg   = sum(known) / len(known) if known else 1.0
     result = [v if v is not None else avg for v in result]
-    if debug:
-        print(f"  [V] {[round(v,4) for v in result]}")
     return result
@@ -390,43 +386,10 @@ def health():
-@app.get("/api/status")
-def status():
-    active = conn.execute(
-        "SELECT COUNT(*) FROM jobs WHERE status='pending'"
-    ).fetchone()[0]
-    total = conn.execute(
-        "SELECT COUNT(*) FROM jobs"
-    ).fetchone()[0]
-    return {
-        "active_jobs": active,
-        "total_jobs": total,
-        "workers": _pool._max_workers,
-    }
-@app.get("/api/job/{job_id}")
-def get_job(job_id: str):
-    row = conn.execute(
-        "SELECT status, result, error FROM jobs WHERE id=?",
-        (job_id,)
-    ).fetchone()
-    if not row:
-        raise HTTPException(404, "Job not found")
-    status, result, error = row
-    if status == "done":
-        return {"status": status, "result": json.loads(result)}
-    elif status == "error":
-        return {"status": status, "detail": error}
-    else:
-        return {"status": status}
 """Return available traces"""
 @app.get("/api/traces")
@@ -459,18 +422,18 @@ def list_traces():
 """Baseline grid simulation, no workload"""
 @app.post("/api/powerflow")
 async def powerflow(req: PowerflowRequest):
-    print(f"\nPowerflow v={req.substationVoltage}")
     try:
         dc   = _build_dc(scale=0.001, duration_s=5)
         grid = _build_grid(req.substationVoltage, "671")
         log  = _run(dc, grid, req.substationVoltage, "671", 5)
-        vs   = _voltages(log.grid_states[-1], debug=True)
-        print(f" min={min(vs):.4f}  max={max(vs):.4f}")
         return {"buses": [{"id": i+1, "voltage": v, "activePower": 0.0,
                             "reactivePower": 0.0} for i, v in enumerate(vs)],
                 "lines": []}
     except Exception as e:
-        import traceback; traceback.print_exc()
         raise HTTPException(status_code=500, detail=str(e))
@@ -478,34 +441,16 @@ async def powerflow(req: PowerflowRequest):
 """Simulate AI workload impact on grid using GPU traces."""
 @app.post("/api/llm-impact")
 async def llm_impact(req: LLMImpactRequest):
-    job_id = uuid.uuid4().hex
-    conn.execute(
-        "INSERT INTO jobs (id, status) VALUES (?, ?)",
-        (job_id, "pending")
-    )
-    conn.commit()
-    async def run_and_store():
-        try:
-            loop = asyncio.get_event_loop()
-            result = await loop.run_in_executor(_pool, _run_full, req.dict())
-            conn.execute(
-                "UPDATE jobs SET status=?, result=? WHERE id=?",
-                ("done", json.dumps(result), job_id)
-            )
-            conn.commit()
-        except Exception as e:
-            conn.execute(
-                "UPDATE jobs SET status=?, error=? WHERE id=?",
-                ("error", str(e), job_id)
-            )
-            conn.commit()
-    asyncio.create_task(run_and_store())
-    return {"job_id": job_id}
@@ -527,13 +472,12 @@ async def heatmap(req: HeatmapRequest):
 if __name__ == "__main__":
-    print("\n" + "="*70)
-    print("="*70)
-    print(f"   Data:   {_DATA_DIR}  ready={_DATA_DIR.exists()}")
     df = _load_traces_index()
     if not df.empty:
         models = df["model_label"].unique().tolist()
-        print(f"   Models: {models}")
-        print(f"   Traces: {len(df)} configurations")
-    print("="*70 + "\n")
     uvicorn.run("server:app", host="0.0.0.0", port=8080, workers=1, log_level="info")

 from  openg2g.controller.tap_schedule import TapScheduleController
 from  openg2g.metrics.voltage import compute_allbus_voltage_stats
+import logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
+)
+logger = logging.getLogger(__name__)
+import asyncio, time
+from concurrent.futures import ProcessPoolExecutor
+import json
 #currently set to 2 for free tier at hf
 _pool        = ProcessPoolExecutor(max_workers=2)
 _start_time  = time.time()
     return [p * num_replicas for p in power_W]
+logger.info(f"Data dir: {_DATA_DIR} exists={_DATA_DIR.exists()}")
 _load_traces_index()  # load at startup
 """Get per-bus voltage (worst phase per bus)."""
+def _voltages(gs) -> list[float]:
     result = []
     for name in BUSES_ORDERED:
         try:
             vals = [float(v) for v in [tp.a, tp.b, tp.c]
                     if not math.isnan(float(v)) and 0.5 < float(v) < 1.5]
             result.append(min(vals) if vals else None)
+        except Exception as e:
+            logger.debug(f"Bus {name} voltage unavailable: {e}")
             result.append(None)
     known = [v for v in result if v is not None]
     avg   = sum(known) / len(known) if known else 1.0
     result = [v if v is not None else avg for v in result]
+    logger.debug(f"Voltages: {[round(v,4) for v in result]}")
     return result
 """Return available traces"""
 @app.get("/api/traces")
 """Baseline grid simulation, no workload"""
 @app.post("/api/powerflow")
 async def powerflow(req: PowerflowRequest):
+    logger.info(f"Powerflow request v={req.substationVoltage}")
     try:
         dc   = _build_dc(scale=0.001, duration_s=5)
         grid = _build_grid(req.substationVoltage, "671")
         log  = _run(dc, grid, req.substationVoltage, "671", 5)
+        vs = _voltages(log.grid_states[-1])
+        logger.info(f"Powerflow result min={min(vs):.4f} max={max(vs):.4f}")
         return {"buses": [{"id": i+1, "voltage": v, "activePower": 0.0,
                             "reactivePower": 0.0} for i, v in enumerate(vs)],
                 "lines": []}
     except Exception as e:
+        logger.exception("Powerflow failed")
         raise HTTPException(status_code=500, detail=str(e))
 """Simulate AI workload impact on grid using GPU traces."""
 @app.post("/api/llm-impact")
 async def llm_impact(req: LLMImpactRequest):
+    logger.info(f"Simulation request: {req.modelLabel} bus={req.targetBus}")
+    try:
+        loop = asyncio.get_event_loop()
+        result = await loop.run_in_executor(_pool, _run_full, req.dict())
+        return result
+    except Exception as e:
+        logger.exception("Simulation failed")
+        raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":
+    logger.info("=" * 70)
+    logger.info(f"Data dir: {_DATA_DIR} ready={_DATA_DIR.exists()}")
     df = _load_traces_index()
     if not df.empty:
         models = df["model_label"].unique().tolist()
+        logger.info(f"Models: {models}")
+        logger.info(f"Traces: {len(df)} configurations")
+    logger.info("=" * 70)
     uvicorn.run("server:app", host="0.0.0.0", port=8080, workers=1, log_level="info")