FeilongTang committed on
Commit
33347ae
·
1 Parent(s): 67e22fc

Add GOP parameter: fixed (4/8/16) and dynamic (energy-adaptive)

Browse files

Mirrors codec_tools' grouping_mode {fixed, readiness} for the
patch-selection stage of the demo.

- global : one group across the whole video (current behavior).
- 4/8/16 : fixed-size groups; budget split equally, top-K within
each group. Forces patches to spread along the timeline
instead of clumping on the single most salient moment.
- dynamic : adaptive groups via build_dynamic_groups, walking sampled
frames in time order and closing each group when its
cumulative score sum reaches total_energy / target_groups.
This is the simplified readiness rule from
codec_tools/pipeline/process_video_bitcost_mv_mask_collage.py
(equal-energy groups, no coverage_bins / delta-ratio
refinement — this is a demo).

The codec-vs-uniform chart now shows group boundaries as dashed
vertical lines on the codec panel, and the title carries the
GOP label and group count. Run info JSON gains:
- params.gop (resolved label)
- gop_groups[] ({start, end, n_frames, selected})

Files changed (1) hide show
  1. app.py +159 -2
app.py CHANGED
@@ -208,6 +208,114 @@ def global_topk_masks(
208
  return [bool_mask[i].astype(np.uint8) for i in range(N)], actual
209
 
210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  def faded_background(frame_bgr: np.ndarray, fade: float = 0.55) -> np.ndarray:
212
  """Convert to gray-white wash: gray * (1-fade) + white * fade."""
213
  gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
@@ -352,6 +460,8 @@ def make_charts(
352
  total_duration_sec: float,
353
  total_patches_budget: int,
354
  saliency_signal: str,
 
 
355
  ):
356
  """Two side-by-side panels comparing codec selection vs uniform sampling.
357
 
@@ -389,8 +499,13 @@ def make_charts(
389
  edgecolor="#312e81", linewidth=0.4,
390
  )
391
  total_selected = sum(counts)
 
 
 
 
392
  ax1.set_title(
393
- f"Codec selection Β· {saliency_signal} Β· {total_selected} patches",
 
394
  fontsize=10, color="#1e293b",
395
  )
396
  ax1.set_xlabel("time (s)", fontsize=9)
@@ -400,6 +515,18 @@ def make_charts(
400
  ax1.grid(True, alpha=0.25, linestyle="--", axis="y")
401
  ax1.spines[["top", "right"]].set_visible(False)
402
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  # ─── Right: uniform-sampling baseline at the same budget ────────────
404
  n_uniform = max(1, int(total_patches_budget // max(1, grid_size)))
405
  uniform_times = (
@@ -443,6 +570,7 @@ def process(
443
  score_log_scale: bool = False,
444
  bitcost_pct: float = 99.0,
445
  fade_strength: float = 0.55,
 
446
  progress=gr.Progress(track_tqdm=False),
447
  ):
448
  if not video_path:
@@ -501,7 +629,9 @@ def process(
501
  grids = compute_score_grids(resized, int(patch_size), saliency_signal)
502
  if score_log_scale:
503
  grids = [np.log1p(np.clip(g, 0.0, None)) for g in grids]
504
- masks, actual_selected = global_topk_masks(grids, int(total_patches))
 
 
505
  norm_scores = _normalize_scores(grids, pct=float(bitcost_pct))
506
 
507
  mode = (viz_mode or "selection").lower()
@@ -553,7 +683,17 @@ def process(
553
  "score_log_scale": bool(score_log_scale),
554
  "bitcost_pct": float(bitcost_pct),
555
  "fade_strength": float(fade_strength),
 
556
  },
 
 
 
 
 
 
 
 
 
557
  "frame_window": {
558
  "first_decoded": int(f_start),
559
  "last_decoded": int(f_end),
@@ -586,6 +726,7 @@ def process(
586
  chart_fig = make_charts(
587
  grids, masks, fids, fps, duration_sec,
588
  int(total_patches), saliency_signal,
 
589
  )
590
 
591
  progress(1.0, desc="Done")
@@ -873,6 +1014,21 @@ with gr.Blocks(**_BLOCK_KW) as demo:
873
  patch_size = gr.Radio(
874
  PATCH_CHOICES, value=14, label="Patch size (px)",
875
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
876
 
877
  with gr.Group(elem_classes="ovc-card"):
878
  gr.Markdown("### Quick presets")
@@ -993,6 +1149,7 @@ with gr.Blocks(**_BLOCK_KW) as demo:
993
  viz_mode, heatmap_alpha,
994
  start_sec, end_sec,
995
  saliency_signal, score_log_scale, bitcost_pct, fade_strength,
 
996
  ],
997
  outputs=[vis_out, canvas_out, info_out, chart_out],
998
  )
 
208
  return [bool_mask[i].astype(np.uint8) for i in range(N)], actual
209
 
210
 
211
def build_dynamic_groups(
    grids: List[np.ndarray], target_groups: int = 4, min_group_frames: int = 1,
) -> List[Tuple[int, int]]:
    """Adaptive temporal grouping by cumulative saliency energy.

    Walk the sampled frames in time order, accumulate each frame's score sum
    (``grid.sum()``), and close the current group once the running total
    reaches ``total_energy / target_groups``. Groups end up roughly equal in
    *information content* rather than equal in frame count.

    Args:
        grids: One score grid per sampled frame, in time order.
        target_groups: Desired number of groups. Values below 1 are treated
            as 1. The result may contain fewer groups when the energy is
            concentrated late in the sequence.
        min_group_frames: Minimum number of frames per group. Values below 1
            are treated as 1. The effective group count is capped at
            ``len(grids) // min_group_frames`` so the minimum can be honoured.

    Returns:
        A list of inclusive ``(start, end)`` frame-index pairs that cover
        ``0 .. len(grids) - 1`` without gaps or overlaps. Empty input yields
        an empty list.
    """
    n = len(grids)
    if n == 0:
        return []

    # Clamp the knobs: a zero/negative group count would otherwise divide by
    # zero in the even-split path below.
    target_groups = max(1, int(target_groups))
    min_group_frames = max(1, int(min_group_frames))

    # Fewer frames than requested groups: one frame per group (only valid
    # when single-frame groups are allowed).
    if min_group_frames == 1 and n <= target_groups:
        return [(i, i) for i in range(n)]

    # Never ask for more groups than can each hold min_group_frames frames.
    target_groups = min(target_groups, max(1, n // min_group_frames))

    energies = np.array([float(g.sum()) for g in grids], dtype=np.float64)
    total = energies.sum()
    groups: List[Tuple[int, int]] = []

    if total <= 1e-8:
        # Degenerate (no usable energy): pure even split, with the last
        # group absorbing any leftover frames.
        size = max(1, n // target_groups)
        cursor = 0
        while cursor < n and len(groups) < target_groups:
            is_last = len(groups) == target_groups - 1
            end = n - 1 if is_last else min(n - 1, cursor + size - 1)
            groups.append((cursor, end))
            cursor = end + 1
        return groups

    target_per_group = total / target_groups
    start = 0
    cum = 0.0
    for i in range(n):
        cum += energies[i]
        groups_left = target_groups - len(groups) - 1
        frames_left_after = n - i - 1
        # Close this group if the energy budget is hit AND we still leave
        # room for the remaining groups (each needs >= min_group_frames).
        threshold_hit = cum >= target_per_group
        room_ok = frames_left_after >= groups_left * min_group_frames
        size_ok = (i - start + 1) >= min_group_frames
        # The final group is never closed here; it is the tail below.
        may_close = len(groups) < target_groups - 1
        if threshold_hit and room_ok and size_ok and may_close:
            groups.append((start, i))
            start = i + 1
            cum = 0.0

    # Tail group (whatever frames remain).
    if start <= n - 1:
        groups.append((start, n - 1))
    return groups
264
+
265
+
266
def grouped_topk_masks(
    grids: List[np.ndarray], total_k: int, gop: str,
) -> Tuple[List[np.ndarray], int, List[Tuple[int, int]], str]:
    """Select patches under a GOP grouping strategy.

    GOP modes (case-insensitive, surrounding whitespace ignored):
      - "global" (aliases "none", "0", ""): one group across the whole
        video; plain global top-K.
      - "<int>" (e.g. "4"/"8"/"16"): fixed group size in frames; top-K is
        picked independently within each group.
      - "dynamic": adaptive groups from build_dynamic_groups, targeting at
        most 4 groups.
      - anything else (unparseable or non-positive size) resolves to
        "global", so the reported label always matches what was done.

    Budget split: ``total_k`` is divided across groups, and the remainder is
    handed out one patch at a time to the earliest groups so no part of the
    budget is dropped. Every group still receives at least one patch, so the
    total can exceed ``total_k`` when there are more groups than patches.

    Returns:
        (per-frame masks, actual selected count,
         [(start, end), ...] inclusive groups, resolved GOP label).
    """
    n = len(grids)

    # Resolve the label first so every return path reports the mode that
    # was actually applied.
    mode = (gop or "global").strip().lower()
    group_size = 0
    if mode in ("global", "none", "0", ""):
        mode = "global"
    elif mode != "dynamic":
        try:
            group_size = int(mode)
        except ValueError:
            group_size = 0
        # Non-positive or unparseable sizes mean "no grouping".
        mode = str(group_size) if group_size >= 1 else "global"

    if n == 0:
        return [], 0, [], mode

    if mode == "global":
        masks, actual = global_topk_masks(grids, int(total_k))
        return masks, actual, [(0, n - 1)], "global"

    if mode == "dynamic":
        groups = build_dynamic_groups(grids, target_groups=min(4, max(1, n)))
    else:
        groups = [
            (first, min(n - 1, first + group_size - 1))
            for first in range(0, n, group_size)
        ]

    num_groups = max(1, len(groups))
    base, extra = divmod(max(0, int(total_k)), num_groups)

    # Initialize empty masks, then fill per-group selections.
    out_masks = [np.zeros(g.shape, dtype=np.uint8) for g in grids]
    actual_total = 0
    for idx, (s, e) in enumerate(groups):
        # The first `extra` groups absorb the division remainder; the floor
        # of 1 keeps every group represented on the timeline.
        budget = max(1, base + (1 if idx < extra else 0))
        sub_masks, sub_actual = global_topk_masks(grids[s:e + 1], budget)
        for offset, sub_mask in enumerate(sub_masks):
            out_masks[s + offset] = sub_mask
        actual_total += sub_actual
    return out_masks, actual_total, groups, mode
317
+
318
+
319
  def faded_background(frame_bgr: np.ndarray, fade: float = 0.55) -> np.ndarray:
320
  """Convert to gray-white wash: gray * (1-fade) + white * fade."""
321
  gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
 
460
  total_duration_sec: float,
461
  total_patches_budget: int,
462
  saliency_signal: str,
463
+ groups: List[Tuple[int, int]] = None,
464
+ gop_label: str = "global",
465
  ):
466
  """Two side-by-side panels comparing codec selection vs uniform sampling.
467
 
 
499
  edgecolor="#312e81", linewidth=0.4,
500
  )
501
  total_selected = sum(counts)
502
+ n_groups = len(groups) if groups else 1
503
+ gop_str = (
504
+ gop_label if gop_label in ("global", "dynamic") else f"GOP={gop_label}"
505
+ )
506
  ax1.set_title(
507
+ f"Codec selection Β· {saliency_signal} Β· {gop_str} "
508
+ f"({n_groups} groups) Β· {total_selected} patches",
509
  fontsize=10, color="#1e293b",
510
  )
511
  ax1.set_xlabel("time (s)", fontsize=9)
 
515
  ax1.grid(True, alpha=0.25, linestyle="--", axis="y")
516
  ax1.spines[["top", "right"]].set_visible(False)
517
 
518
+ # Group boundary lines (skip if there's just one big group).
519
+ if groups and len(groups) > 1 and times:
520
+ for (_, end_idx) in groups[:-1]:
521
+ if end_idx + 1 < len(times):
522
+ bx = (times[end_idx] + times[end_idx + 1]) / 2.0
523
+ else:
524
+ bx = times[end_idx] + bar_w
525
+ ax1.axvline(
526
+ bx, color="#94a3b8", linestyle=(0, (4, 3)),
527
+ alpha=0.55, linewidth=0.9,
528
+ )
529
+
530
  # ─── Right: uniform-sampling baseline at the same budget ────────────
531
  n_uniform = max(1, int(total_patches_budget // max(1, grid_size)))
532
  uniform_times = (
 
570
  score_log_scale: bool = False,
571
  bitcost_pct: float = 99.0,
572
  fade_strength: float = 0.55,
573
+ gop: str = "global",
574
  progress=gr.Progress(track_tqdm=False),
575
  ):
576
  if not video_path:
 
629
  grids = compute_score_grids(resized, int(patch_size), saliency_signal)
630
  if score_log_scale:
631
  grids = [np.log1p(np.clip(g, 0.0, None)) for g in grids]
632
+ masks, actual_selected, groups, gop_resolved = grouped_topk_masks(
633
+ grids, int(total_patches), str(gop or "global"),
634
+ )
635
  norm_scores = _normalize_scores(grids, pct=float(bitcost_pct))
636
 
637
  mode = (viz_mode or "selection").lower()
 
683
  "score_log_scale": bool(score_log_scale),
684
  "bitcost_pct": float(bitcost_pct),
685
  "fade_strength": float(fade_strength),
686
+ "gop": gop_resolved,
687
  },
688
+ "gop_groups": [
689
+ {
690
+ "start_frame_idx": int(s),
691
+ "end_frame_idx": int(e),
692
+ "n_frames": int(e - s + 1),
693
+ "selected": int(sum(int(m.sum()) for m in masks[s:e + 1])),
694
+ }
695
+ for (s, e) in groups
696
+ ],
697
  "frame_window": {
698
  "first_decoded": int(f_start),
699
  "last_decoded": int(f_end),
 
726
  chart_fig = make_charts(
727
  grids, masks, fids, fps, duration_sec,
728
  int(total_patches), saliency_signal,
729
+ groups=groups, gop_label=gop_resolved,
730
  )
731
 
732
  progress(1.0, desc="Done")
 
1014
  patch_size = gr.Radio(
1015
  PATCH_CHOICES, value=14, label="Patch size (px)",
1016
  )
1017
+ gop = gr.Radio(
1018
+ [
1019
+ ("Global β€” one budget across the whole video", "global"),
1020
+ ("GOP = 4 β€” fixed 4-frame groups", "4"),
1021
+ ("GOP = 8 β€” fixed 8-frame groups", "8"),
1022
+ ("GOP = 16 β€” fixed 16-frame groups", "16"),
1023
+ ("Dynamic β€” adaptive groups by saliency energy", "dynamic"),
1024
+ ],
1025
+ value="global",
1026
+ label="GOP (group of pictures)",
1027
+ info="Splits sampled frames into groups; the patch budget "
1028
+ "is allocated equally across groups, top-K within "
1029
+ "each. Dynamic mode mirrors codec_tools' readiness "
1030
+ "grouping (equal-energy groups).",
1031
+ )
1032
 
1033
  with gr.Group(elem_classes="ovc-card"):
1034
  gr.Markdown("### Quick presets")
 
1149
  viz_mode, heatmap_alpha,
1150
  start_sec, end_sec,
1151
  saliency_signal, score_log_scale, bitcost_pct, fade_strength,
1152
+ gop,
1153
  ],
1154
  outputs=[vis_out, canvas_out, info_out, chart_out],
1155
  )