Add GOP parameter: fixed (4/8/16) and dynamic (energy-adaptive)
Mirrors codec_tools' grouping_mode {fixed, readiness} for the
patch-selection stage of the demo.
- global : one group across the whole video (current behavior).
- 4/8/16 : fixed-size groups; budget split equally, top-K within
each group. Forces patches to spread along the timeline
instead of clumping on the single most salient moment.
- dynamic : adaptive groups via build_dynamic_groups, walking sampled
frames in time order and closing each group when its
cumulative score sum reaches total_energy / target_groups.
This is the simplified readiness rule from
codec_tools/pipeline/process_video_bitcost_mv_mask_collage.py
(equal-energy groups, no coverage_bins / delta-ratio
refinement — this is a demo).
The codec-vs-uniform chart now shows group boundaries as dashed
vertical lines on the codec panel, and the title carries the
GOP label and group count. Run info JSON gains:
- params.gop (resolved label)
- gop_groups[] ({start, end, n_frames, selected})
|
@@ -208,6 +208,114 @@ def global_topk_masks(
|
|
| 208 |
return [bool_mask[i].astype(np.uint8) for i in range(N)], actual
|
| 209 |
|
| 210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
def faded_background(frame_bgr: np.ndarray, fade: float = 0.55) -> np.ndarray:
|
| 212 |
"""Convert to gray-white wash: gray * (1-fade) + white * fade."""
|
| 213 |
gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
|
|
@@ -352,6 +460,8 @@ def make_charts(
|
|
| 352 |
total_duration_sec: float,
|
| 353 |
total_patches_budget: int,
|
| 354 |
saliency_signal: str,
|
|
|
|
|
|
|
| 355 |
):
|
| 356 |
"""Two side-by-side panels comparing codec selection vs uniform sampling.
|
| 357 |
|
|
@@ -389,8 +499,13 @@ def make_charts(
|
|
| 389 |
edgecolor="#312e81", linewidth=0.4,
|
| 390 |
)
|
| 391 |
total_selected = sum(counts)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
ax1.set_title(
|
| 393 |
-
f"Codec selection Β· {saliency_signal} Β· {
|
|
|
|
| 394 |
fontsize=10, color="#1e293b",
|
| 395 |
)
|
| 396 |
ax1.set_xlabel("time (s)", fontsize=9)
|
|
@@ -400,6 +515,18 @@ def make_charts(
|
|
| 400 |
ax1.grid(True, alpha=0.25, linestyle="--", axis="y")
|
| 401 |
ax1.spines[["top", "right"]].set_visible(False)
|
| 402 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
# βββ Right: uniform-sampling baseline at the same budget ββββββββββββ
|
| 404 |
n_uniform = max(1, int(total_patches_budget // max(1, grid_size)))
|
| 405 |
uniform_times = (
|
|
@@ -443,6 +570,7 @@ def process(
|
|
| 443 |
score_log_scale: bool = False,
|
| 444 |
bitcost_pct: float = 99.0,
|
| 445 |
fade_strength: float = 0.55,
|
|
|
|
| 446 |
progress=gr.Progress(track_tqdm=False),
|
| 447 |
):
|
| 448 |
if not video_path:
|
|
@@ -501,7 +629,9 @@ def process(
|
|
| 501 |
grids = compute_score_grids(resized, int(patch_size), saliency_signal)
|
| 502 |
if score_log_scale:
|
| 503 |
grids = [np.log1p(np.clip(g, 0.0, None)) for g in grids]
|
| 504 |
-
masks, actual_selected =
|
|
|
|
|
|
|
| 505 |
norm_scores = _normalize_scores(grids, pct=float(bitcost_pct))
|
| 506 |
|
| 507 |
mode = (viz_mode or "selection").lower()
|
|
@@ -553,7 +683,17 @@ def process(
|
|
| 553 |
"score_log_scale": bool(score_log_scale),
|
| 554 |
"bitcost_pct": float(bitcost_pct),
|
| 555 |
"fade_strength": float(fade_strength),
|
|
|
|
| 556 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 557 |
"frame_window": {
|
| 558 |
"first_decoded": int(f_start),
|
| 559 |
"last_decoded": int(f_end),
|
|
@@ -586,6 +726,7 @@ def process(
|
|
| 586 |
chart_fig = make_charts(
|
| 587 |
grids, masks, fids, fps, duration_sec,
|
| 588 |
int(total_patches), saliency_signal,
|
|
|
|
| 589 |
)
|
| 590 |
|
| 591 |
progress(1.0, desc="Done")
|
|
@@ -873,6 +1014,21 @@ with gr.Blocks(**_BLOCK_KW) as demo:
|
|
| 873 |
patch_size = gr.Radio(
|
| 874 |
PATCH_CHOICES, value=14, label="Patch size (px)",
|
| 875 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 876 |
|
| 877 |
with gr.Group(elem_classes="ovc-card"):
|
| 878 |
gr.Markdown("### Quick presets")
|
|
@@ -993,6 +1149,7 @@ with gr.Blocks(**_BLOCK_KW) as demo:
|
|
| 993 |
viz_mode, heatmap_alpha,
|
| 994 |
start_sec, end_sec,
|
| 995 |
saliency_signal, score_log_scale, bitcost_pct, fade_strength,
|
|
|
|
| 996 |
],
|
| 997 |
outputs=[vis_out, canvas_out, info_out, chart_out],
|
| 998 |
)
|
|
|
|
| 208 |
return [bool_mask[i].astype(np.uint8) for i in range(N)], actual
|
| 209 |
|
| 210 |
|
| 211 |
+
def build_dynamic_groups(
    grids: List[np.ndarray], target_groups: int = 4, min_group_frames: int = 1,
) -> List[Tuple[int, int]]:
    """Split frames into contiguous groups of roughly equal saliency energy.

    Frames are walked in order while their per-frame score sums are
    accumulated. A group is closed once the running sum since the last
    boundary reaches ``total_energy / target_groups``, provided the group
    holds at least ``min_group_frames`` frames and enough frames remain for
    the groups still to be opened. Whatever is left becomes the tail group.

    Because the running sum restarts at zero after each boundary, a group
    that overshoots its share leaves less energy for the rest, so the result
    may contain fewer than ``target_groups`` groups.

    Args:
        grids: Per-frame score grids, in time order.
        target_groups: Desired number of groups. Values below 1 are treated
            as 1 (a single group) instead of dividing by zero.
        min_group_frames: Minimum number of frames a closed group must hold.

    Returns:
        A list of inclusive ``(start, end)`` frame-index pairs covering
        ``0 .. len(grids) - 1`` without gaps or overlap. Empty input yields
        an empty list; when there are no more frames than target groups,
        every frame becomes its own group.
    """
    n = len(grids)
    if n == 0:
        return []

    # A non-positive target would divide by zero below; one group is the
    # only sensible reading of such a request.
    target_groups = max(1, int(target_groups))
    if n <= target_groups:
        return [(i, i) for i in range(n)]

    energies = np.array([float(np.sum(g)) for g in grids], dtype=np.float64)
    total = float(energies.sum())
    groups: List[Tuple[int, int]] = []

    if total <= 1e-8:
        # Degenerate: no usable energy signal, fall back to an even split.
        # The last group absorbs the remainder frames.
        size = max(1, n // target_groups)
        cursor = 0
        while cursor < n and len(groups) < target_groups:
            end = min(n - 1, cursor + size - 1)
            if len(groups) == target_groups - 1:
                end = n - 1
            groups.append((cursor, end))
            cursor = end + 1
        return groups

    target_per_group = total / target_groups
    start = 0
    cum = 0.0
    for i in range(n):
        cum += energies[i]
        # Groups that would still have to be opened after closing this one.
        groups_left = target_groups - len(groups) - 1
        frames_left_after = n - i - 1
        threshold_hit = cum >= target_per_group
        # Leave at least min_group_frames frames for each remaining group.
        room_ok = frames_left_after >= groups_left * min_group_frames
        size_ok = (i - start + 1) >= min_group_frames
        # The final group is never closed here; it is emitted as the tail.
        can_close = len(groups) < target_groups - 1
        if threshold_hit and room_ok and size_ok and can_close:
            groups.append((start, i))
            start = i + 1
            cum = 0.0

    # Tail group (whatever frames remain).
    if start <= n - 1:
        groups.append((start, n - 1))
    return groups
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
def grouped_topk_masks(
    grids: List[np.ndarray], total_k: int, gop: str,
) -> Tuple[List[np.ndarray], int, List[Tuple[int, int]], str]:
    """Select patches under a GOP grouping strategy.

    GOP modes (case-insensitive, surrounding whitespace ignored):
      - "global" (also "none", "0", "" or a falsy value): one group across
        the whole video; selection is delegated to ``global_topk_masks``.
      - "<int>" (e.g. "4"/"8"/"16"): fixed group size in frames; values
        below 1 are treated as 1.
      - "dynamic": adaptive groups from ``build_dynamic_groups``, targeting
        at most 4 groups.
      - anything else: falls back to "global".

    For grouped modes the budget ``total_k`` is split into integer shares
    that differ by at most one and sum exactly to ``total_k``; the extra
    units are spread along the timeline rather than piled on the first
    groups. A group whose share is zero selects nothing.

    Args:
        grids: Per-frame score grids, in time order.
        total_k: Total patch budget across all frames.
        gop: Grouping mode label as described above.

    Returns:
        ``(masks, actual_selected, groups, resolved_label)`` where ``masks``
        holds one mask per frame, ``groups`` is a list of inclusive
        ``(start, end)`` frame-index pairs, and ``resolved_label`` names the
        grouping that was actually applied ("global", "dynamic", or the
        fixed group size as a decimal string).
    """
    n = len(grids)
    mode = str(gop or "global").strip().lower()

    # Resolve the label first so it always describes the grouping actually
    # used, including for empty input and unparseable modes.
    g_size = 0
    if mode in ("global", "none", "0", ""):
        label = "global"
    elif mode == "dynamic":
        label = "dynamic"
    else:
        try:
            g_size = max(1, int(mode))
            label = str(g_size)
        except ValueError:
            # Unknown mode: behave as (and report) a single global group.
            label = "global"

    if n == 0:
        return [], 0, [], label

    if label == "global":
        masks, actual = global_topk_masks(grids, int(total_k))
        return masks, actual, [(0, n - 1)], "global"

    if label == "dynamic":
        groups = build_dynamic_groups(grids, target_groups=min(4, n))
    else:
        groups = [
            (start, min(n - 1, start + g_size - 1))
            for start in range(0, n, g_size)
        ]

    budget = max(0, int(total_k))
    num_groups = len(groups)

    # Start from all-empty masks, then fill in each group's selection.
    out_masks = [np.zeros(g.shape, dtype=np.uint8) for g in grids]
    actual_total = 0
    for j, (s, e) in enumerate(groups):
        # Cumulative-floor split: shares sum to exactly `budget`, so a small
        # budget is neither exceeded nor has its remainder dropped.
        share = (j + 1) * budget // num_groups - j * budget // num_groups
        if share <= 0:
            continue
        sub_masks, sub_actual = global_topk_masks(grids[s:e + 1], share)
        for offset, sub_mask in enumerate(sub_masks):
            out_masks[s + offset] = sub_mask
        actual_total += sub_actual
    return out_masks, actual_total, groups, label
|
| 317 |
+
|
| 318 |
+
|
| 319 |
def faded_background(frame_bgr: np.ndarray, fade: float = 0.55) -> np.ndarray:
|
| 320 |
"""Convert to gray-white wash: gray * (1-fade) + white * fade."""
|
| 321 |
gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
|
|
|
|
| 460 |
total_duration_sec: float,
|
| 461 |
total_patches_budget: int,
|
| 462 |
saliency_signal: str,
|
| 463 |
+
groups: List[Tuple[int, int]] = None,
|
| 464 |
+
gop_label: str = "global",
|
| 465 |
):
|
| 466 |
"""Two side-by-side panels comparing codec selection vs uniform sampling.
|
| 467 |
|
|
|
|
| 499 |
edgecolor="#312e81", linewidth=0.4,
|
| 500 |
)
|
| 501 |
total_selected = sum(counts)
|
| 502 |
+
n_groups = len(groups) if groups else 1
|
| 503 |
+
gop_str = (
|
| 504 |
+
gop_label if gop_label in ("global", "dynamic") else f"GOP={gop_label}"
|
| 505 |
+
)
|
| 506 |
ax1.set_title(
|
| 507 |
+
f"Codec selection Β· {saliency_signal} Β· {gop_str} "
|
| 508 |
+
f"({n_groups} groups) Β· {total_selected} patches",
|
| 509 |
fontsize=10, color="#1e293b",
|
| 510 |
)
|
| 511 |
ax1.set_xlabel("time (s)", fontsize=9)
|
|
|
|
| 515 |
ax1.grid(True, alpha=0.25, linestyle="--", axis="y")
|
| 516 |
ax1.spines[["top", "right"]].set_visible(False)
|
| 517 |
|
| 518 |
+
# Group boundary lines (skip if there's just one big group).
|
| 519 |
+
if groups and len(groups) > 1 and times:
|
| 520 |
+
for (_, end_idx) in groups[:-1]:
|
| 521 |
+
if end_idx + 1 < len(times):
|
| 522 |
+
bx = (times[end_idx] + times[end_idx + 1]) / 2.0
|
| 523 |
+
else:
|
| 524 |
+
bx = times[end_idx] + bar_w
|
| 525 |
+
ax1.axvline(
|
| 526 |
+
bx, color="#94a3b8", linestyle=(0, (4, 3)),
|
| 527 |
+
alpha=0.55, linewidth=0.9,
|
| 528 |
+
)
|
| 529 |
+
|
| 530 |
# βββ Right: uniform-sampling baseline at the same budget ββββββββββββ
|
| 531 |
n_uniform = max(1, int(total_patches_budget // max(1, grid_size)))
|
| 532 |
uniform_times = (
|
|
|
|
| 570 |
score_log_scale: bool = False,
|
| 571 |
bitcost_pct: float = 99.0,
|
| 572 |
fade_strength: float = 0.55,
|
| 573 |
+
gop: str = "global",
|
| 574 |
progress=gr.Progress(track_tqdm=False),
|
| 575 |
):
|
| 576 |
if not video_path:
|
|
|
|
| 629 |
grids = compute_score_grids(resized, int(patch_size), saliency_signal)
|
| 630 |
if score_log_scale:
|
| 631 |
grids = [np.log1p(np.clip(g, 0.0, None)) for g in grids]
|
| 632 |
+
masks, actual_selected, groups, gop_resolved = grouped_topk_masks(
|
| 633 |
+
grids, int(total_patches), str(gop or "global"),
|
| 634 |
+
)
|
| 635 |
norm_scores = _normalize_scores(grids, pct=float(bitcost_pct))
|
| 636 |
|
| 637 |
mode = (viz_mode or "selection").lower()
|
|
|
|
| 683 |
"score_log_scale": bool(score_log_scale),
|
| 684 |
"bitcost_pct": float(bitcost_pct),
|
| 685 |
"fade_strength": float(fade_strength),
|
| 686 |
+
"gop": gop_resolved,
|
| 687 |
},
|
| 688 |
+
"gop_groups": [
|
| 689 |
+
{
|
| 690 |
+
"start_frame_idx": int(s),
|
| 691 |
+
"end_frame_idx": int(e),
|
| 692 |
+
"n_frames": int(e - s + 1),
|
| 693 |
+
"selected": int(sum(int(m.sum()) for m in masks[s:e + 1])),
|
| 694 |
+
}
|
| 695 |
+
for (s, e) in groups
|
| 696 |
+
],
|
| 697 |
"frame_window": {
|
| 698 |
"first_decoded": int(f_start),
|
| 699 |
"last_decoded": int(f_end),
|
|
|
|
| 726 |
chart_fig = make_charts(
|
| 727 |
grids, masks, fids, fps, duration_sec,
|
| 728 |
int(total_patches), saliency_signal,
|
| 729 |
+
groups=groups, gop_label=gop_resolved,
|
| 730 |
)
|
| 731 |
|
| 732 |
progress(1.0, desc="Done")
|
|
|
|
| 1014 |
patch_size = gr.Radio(
|
| 1015 |
PATCH_CHOICES, value=14, label="Patch size (px)",
|
| 1016 |
)
|
| 1017 |
+
gop = gr.Radio(
|
| 1018 |
+
[
|
| 1019 |
+
("Global β one budget across the whole video", "global"),
|
| 1020 |
+
("GOP = 4 β fixed 4-frame groups", "4"),
|
| 1021 |
+
("GOP = 8 β fixed 8-frame groups", "8"),
|
| 1022 |
+
("GOP = 16 β fixed 16-frame groups", "16"),
|
| 1023 |
+
("Dynamic β adaptive groups by saliency energy", "dynamic"),
|
| 1024 |
+
],
|
| 1025 |
+
value="global",
|
| 1026 |
+
label="GOP (group of pictures)",
|
| 1027 |
+
info="Splits sampled frames into groups; the patch budget "
|
| 1028 |
+
"is allocated equally across groups, top-K within "
|
| 1029 |
+
"each. Dynamic mode mirrors codec_tools' readiness "
|
| 1030 |
+
"grouping (equal-energy groups).",
|
| 1031 |
+
)
|
| 1032 |
|
| 1033 |
with gr.Group(elem_classes="ovc-card"):
|
| 1034 |
gr.Markdown("### Quick presets")
|
|
|
|
| 1149 |
viz_mode, heatmap_alpha,
|
| 1150 |
start_sec, end_sec,
|
| 1151 |
saliency_signal, score_log_scale, bitcost_pct, fade_strength,
|
| 1152 |
+
gop,
|
| 1153 |
],
|
| 1154 |
outputs=[vis_out, canvas_out, info_out, chart_out],
|
| 1155 |
)
|