FeilongTang committed on
Commit
257cddf
·
1 Parent(s): ff548c5

Use one IPPP canvas per GOP group

Browse files
Files changed (1) hide show
  1. app.py +72 -96
app.py CHANGED
@@ -17,8 +17,9 @@ Pipeline (mirrors codec_tools/pipeline/process_video_bitcost_readiness.py):
17
  5. Render a "selection visualization" video: kept patches stay in
18
  full color, dropped patches are faded to a gray-white wash so the
19
  viewer can see exactly which patches the codec stage chose.
20
- 6. Pack the selected patches in time-order, raster scan, into a
21
- single canvas image (the artifact LLaVA-OneVision2 consumes).
 
22
  """
23
 
24
  import json
@@ -49,7 +50,7 @@ DEMO_VIDEO_PATH = os.path.join(
49
  )
50
  DEMO_PRESET = (
51
  DEMO_VIDEO_PATH, # video_in
52
- 16, # sample_frames
53
  14, # patch_size
54
  1024, # total_patches
55
  150000, # max_pixels
@@ -61,7 +62,6 @@ DEMO_PRESET = (
61
  96.0, # bitcost_pct
62
  0.55, # fade_strength
63
  "dynamic", # gop
64
- 4, # target_canvases
65
  )
66
 
67
 
@@ -258,27 +258,40 @@ def global_topk_masks(
258
 
259
 
260
  def build_dynamic_groups(
261
- grids: List[np.ndarray], target_groups: int = 4, min_group_frames: int = 1,
 
 
 
262
  ) -> List[Tuple[int, int]]:
263
  """Adaptive temporal grouping by cumulative saliency energy.
264
 
265
- Walk sampled frames in time order, accumulate frame-level score sums,
266
- and close the current group once the running total reaches
267
- `total_energy / target_groups`. Groups end up roughly equal in
268
- *information content* rather than equal in frame count this is the
269
- same intuition as codec_tools' readiness mode, simplified for the
270
- demo (no temporal-coverage / marginal-gain refinement)."""
271
  n = len(grids)
272
  if n == 0:
273
  return []
274
- if n <= target_groups:
275
- return [(i, i) for i in range(n)]
 
 
 
 
 
 
 
 
 
 
 
 
276
 
277
  energies = np.array([float(g.sum()) for g in grids], dtype=np.float64)
278
  total = energies.sum()
279
  if total <= 1e-8:
280
- # Degenerate: pure even split.
281
- size = max(1, n // target_groups)
282
  groups: List[Tuple[int, int]] = []
283
  cursor = 0
284
  while cursor < n and len(groups) < target_groups:
@@ -295,18 +308,16 @@ def build_dynamic_groups(
295
  cum = 0.0
296
  for i in range(n):
297
  cum += energies[i]
 
298
  groups_left = target_groups - len(groups) - 1
299
  frames_left_after = n - i - 1
300
- # Close this group if energy budget hit AND we still leave room for
301
- # the remaining groups (each needs >= min_group_frames frames).
302
- threshold_hit = cum >= target_per_group
303
- room_ok = frames_left_after >= groups_left * min_group_frames
304
- size_ok = (i - start + 1) >= min_group_frames
305
- if threshold_hit and room_ok and size_ok and len(groups) < target_groups - 1:
306
  groups.append((start, i))
307
  start = i + 1
308
  cum = 0.0
309
- # Tail group (whatever frames remain).
310
  if start <= n - 1:
311
  groups.append((start, n - 1))
312
  return groups
@@ -321,8 +332,8 @@ def grouped_topk_masks(
321
  - "global": one big group across the whole video — top-K global.
322
  - "<int>" (e.g. "4"/"8"/"16"): fixed group size in frames; the
323
  budget is split equally across groups, top-K picked within each.
324
- - "dynamic": adaptive groups (see build_dynamic_groups), targeting
325
- 4 groups by default; each group gets an equal share of the budget.
326
 
327
  Returns (per-frame masks, actual selected count, [(start,end),...] groups, resolved_label).
328
  """
@@ -337,7 +348,8 @@ def grouped_topk_masks(
337
  return masks, actual, [(0, n - 1)], "global"
338
 
339
  if mode == "dynamic":
340
- groups = build_dynamic_groups(grids, target_groups=min(4, max(1, n)))
 
341
  else:
342
  try:
343
  g_size = max(1, int(mode))
@@ -349,6 +361,7 @@ def grouped_topk_masks(
349
  end = min(n - 1, cursor + g_size - 1)
350
  groups.append((cursor, end))
351
  cursor = end + 1
 
352
 
353
  num_groups = max(1, len(groups))
354
  target_k = max(0, int(total_k))
@@ -383,7 +396,7 @@ def grouped_topk_masks(
383
  for i, sm in enumerate(sub_masks):
384
  out_masks[s + i] = sm
385
  actual_total += sub_actual
386
- return out_masks, actual_total, groups, mode
387
 
388
 
389
  def faded_background(frame_bgr: np.ndarray, fade: float = 0.55) -> np.ndarray:
@@ -551,39 +564,22 @@ def _build_ippp_canvas(
551
  return canvas, n_overlays
552
 
553
 
554
- def _allocate_canvases_per_group(
555
- target_canvases: int, num_groups: int,
556
- ) -> List[int]:
557
- """Split a total target canvas count across N groups as evenly as
558
- possible; the first `remainder` groups get +1 each."""
559
- target = max(1, int(target_canvases))
560
- n = max(1, int(num_groups))
561
- base, rem = divmod(target, n)
562
- out = [base + (1 if i < rem else 0) for i in range(n)]
563
- # Floor to at least 1 canvas per group so no group is invisible.
564
- return [max(1, x) for x in out]
565
-
566
-
567
  def pack_canvases_per_group(
568
  frames: List[np.ndarray],
569
  masks: List[np.ndarray],
570
  groups: List[Tuple[int, int]],
571
  patch: int,
572
- target_canvases: int = 4,
573
  ) -> Tuple[List[np.ndarray], List[Tuple[int, int, int]], int]:
574
- """Pack exactly `target_canvases` IPPP canvases for the whole video,
575
- distributing them across GOP groups as evenly as possible.
576
 
577
- Each group's frame range [s..e] is split into K consecutive sub-ranges
578
- (K = canvases allocated to that group). Each sub-range [ss..ee] becomes
579
- one canvas:
580
- - frame ss is the I-frame: its whole image goes to the canvas top.
581
- - frames ss+1..ee are P-frames: only saliency-selected patches go
582
- below the I-frame, packed time-major in a wb-wide raster grid.
583
 
584
  Returns:
585
- canvases — list of np.ndarray, length == target_canvases
586
- (or fewer if some groups have only 1 frame).
587
  sub_ranges — list of (group_idx, sub_start, sub_end) parallel to
588
  canvases, for caption / debugging.
589
  total_selected — I-frame patches (counted as full grid) + P-frame
@@ -595,34 +591,18 @@ def pack_canvases_per_group(
595
  if not groups or not frames:
596
  return [np.full((patch, patch, 3), 255, dtype=np.uint8)], [(0, 0, 0)], 0
597
 
598
- per_group_counts = _allocate_canvases_per_group(target_canvases, len(groups))
599
-
600
  for g_idx, (s, e) in enumerate(groups):
601
  if s >= len(frames):
602
  continue
603
- group_len = e - s + 1
604
- k = max(1, min(per_group_counts[g_idx], group_len))
605
- # Split [s..e] into k consecutive sub-ranges of (almost) equal size.
606
- base, rem = divmod(group_len, k)
607
- cursor = s
608
- for sub_i in range(k):
609
- sub_len = base + (1 if sub_i < rem else 0)
610
- ss = cursor
611
- ee = min(e, cursor + sub_len - 1)
612
- cursor = ee + 1
613
- canvas, n_p_overlays = _build_ippp_canvas(
614
- frames, masks, i_idx=ss, p_range=range(ss + 1, ee + 1),
615
- patch=patch,
616
- )
617
- canvases.append(canvas)
618
- sub_ranges.append((g_idx, ss, ee))
619
- # Accounting:
620
- # - I-frame counts as the full grid (anchor, every position
621
- # starts from it).
622
- # - Each P-frame overlay is +1 (positions may be overlaid
623
- # multiple times by later P-frames; we count each hit).
624
- hb, wb = frames[ss].shape[0] // patch, frames[ss].shape[1] // patch
625
- total_selected += hb * wb + n_p_overlays
626
 
627
  if not canvases:
628
  canvases = [np.full((patch, patch, 3), 255, dtype=np.uint8)]
@@ -756,7 +736,7 @@ def make_charts(
756
  )
757
 
758
  n_groups = len(groups) if groups else 1
759
- gop_str = gop_label if gop_label in ("global", "dynamic") else f"GOP={gop_label}"
760
  ax.set_title(
761
  f"Cumulative patches selected over time · {saliency_signal} · "
762
  f"{gop_str} ({n_groups} groups)",
@@ -791,7 +771,7 @@ def process(
791
  bitcost_pct: float = 99.0,
792
  fade_strength: float = 0.55,
793
  gop: str = "global",
794
- target_canvases: int = 4,
795
  progress=gr.Progress(track_tqdm=False),
796
  ):
797
  if not video_path:
@@ -886,7 +866,7 @@ def process(
886
  progress(0.85, desc="Packing canvases (IPPP)")
887
  canvases, sub_ranges, n_selected = pack_canvases_per_group(
888
  resized, masks, groups, int(patch_size),
889
- target_canvases=int(target_canvases),
890
  )
891
  canvas_items: List[Tuple[str, str]] = []
892
  for idx, canv in enumerate(canvases):
@@ -927,7 +907,8 @@ def process(
927
  "bitcost_pct": float(bitcost_pct),
928
  "fade_strength": float(fade_strength),
929
  "gop": gop_resolved,
930
- "target_canvases": int(target_canvases),
 
931
  },
932
  "gop_groups": [
933
  {
@@ -1519,7 +1500,7 @@ with gr.Blocks(**_BLOCK_KW) as demo:
1519
  label="Visualization mode",
1520
  )
1521
  sample_frames = gr.Slider(
1522
- 4, 64, value=16, step=1, label="Sampled frames",
1523
  )
1524
  top_k = gr.Slider(
1525
  16, 16384, value=1024, step=16,
@@ -1538,22 +1519,17 @@ with gr.Blocks(**_BLOCK_KW) as demo:
1538
  ("GOP = 4 — fixed 4-frame groups", "4"),
1539
  ("GOP = 8 — fixed 8-frame groups", "8"),
1540
  ("GOP = 16 — fixed 16-frame groups", "16"),
1541
- ("Dynamic adaptive groups by saliency energy", "dynamic"),
1542
  ],
1543
  value="8",
1544
  label="GOP (group of pictures)",
1545
- info="Splits sampled frames into groups; the patch budget "
1546
- "is allocated equally across groups, top-K within "
1547
- "each. Dynamic mode mirrors codec_tools' readiness "
1548
- "grouping (equal-energy groups).",
1549
- )
1550
- target_canvases = gr.Slider(
1551
- 1, 16, value=4, step=1,
1552
- label="Target canvases (total per video)",
1553
- info="Fixed canvas count regardless of GOP. The budget is "
1554
- "split across groups; each group is further sliced "
1555
- "into sub-ranges of consecutive frames, one IPPP "
1556
- "canvas per sub-range.",
1557
  )
1558
 
1559
  with gr.Accordion("Time window", open=False):
@@ -1653,7 +1629,7 @@ with gr.Blocks(**_BLOCK_KW) as demo:
1653
  '<div id="ovc-footer">'
1654
  '<b>OneVision Encoder</b> · codec-style patch saliency demo · '
1655
  'Sobel + frame-diff stand in for the ffmpeg bitcost patch · '
1656
- 'global top-K selection across all sampled frames.'
1657
  '</div>'
1658
  )
1659
 
@@ -1664,7 +1640,7 @@ with gr.Blocks(**_BLOCK_KW) as demo:
1664
  viz_mode, heatmap_alpha,
1665
  start_sec, end_sec,
1666
  saliency_signal, score_log_scale, bitcost_pct, fade_strength,
1667
- gop, target_canvases,
1668
  ],
1669
  outputs=[vis_out, canvas_out, info_out, chart_out],
1670
  )
@@ -1676,7 +1652,7 @@ with gr.Blocks(**_BLOCK_KW) as demo:
1676
  video_in, sample_frames, patch_size, top_k, max_pixels,
1677
  viz_mode, heatmap_alpha, start_sec, end_sec,
1678
  saliency_signal, score_log_scale, bitcost_pct, fade_strength,
1679
- gop, target_canvases,
1680
  ],
1681
  )
1682
 
 
17
  5. Render a "selection visualization" video: kept patches stay in
18
  full color, dropped patches are faded to a gray-white wash so the
19
  viewer can see exactly which patches the codec stage chose.
20
+ 6. Pack one canvas per GOP group: the first frame of each group is
21
+ kept whole as the I-frame, and later frames only overwrite their
22
+ selected patches as P-frame updates.
23
  """
24
 
25
  import json
 
50
  )
51
  DEMO_PRESET = (
52
  DEMO_VIDEO_PATH, # video_in
53
+ 32, # sample_frames
54
  14, # patch_size
55
  1024, # total_patches
56
  150000, # max_pixels
 
62
  96.0, # bitcost_pct
63
  0.55, # fade_strength
64
  "dynamic", # gop
 
65
  )
66
 
67
 
 
258
 
259
 
260
  def build_dynamic_groups(
261
+ grids: List[np.ndarray],
262
+ min_group_frames: int = 8,
263
+ max_group_frames: int = 64,
264
+ preferred_group_frames: int = 32,
265
  ) -> List[Tuple[int, int]]:
266
  """Adaptive temporal grouping by cumulative saliency energy.
267
 
268
+ Groups are energy-adaptive, but constrained to a practical codec-stream
269
+ range: by default each group spans roughly 8-64 sampled frames, with a
270
+ preference around 32 frames/group. Each group later becomes exactly one
271
+ IPPP canvas whose first frame is kept whole as the I-frame."""
 
 
272
  n = len(grids)
273
  if n == 0:
274
  return []
275
+
276
+ min_len = max(1, int(min_group_frames))
277
+ max_len = max(min_len, int(max_group_frames))
278
+ preferred = min(max_len, max(min_len, int(preferred_group_frames)))
279
+
280
+ if n <= max_len:
281
+ return [(0, n - 1)]
282
+
283
+ min_groups = max(1, math.ceil(n / max_len))
284
+ max_groups = max(1, n // min_len)
285
+ target_groups = max(1, math.ceil(n / preferred))
286
+ target_groups = min(max(target_groups, min_groups), max_groups)
287
+ if target_groups <= 1:
288
+ return [(0, n - 1)]
289
 
290
  energies = np.array([float(g.sum()) for g in grids], dtype=np.float64)
291
  total = energies.sum()
292
  if total <= 1e-8:
293
+ # Degenerate: pure even split, still respecting the group-size range.
294
+ size = max(min_len, min(max_len, math.ceil(n / target_groups)))
295
  groups: List[Tuple[int, int]] = []
296
  cursor = 0
297
  while cursor < n and len(groups) < target_groups:
 
308
  cum = 0.0
309
  for i in range(n):
310
  cum += energies[i]
311
+ group_len = i - start + 1
312
  groups_left = target_groups - len(groups) - 1
313
  frames_left_after = n - i - 1
314
+ min_room_ok = frames_left_after >= groups_left * min_len
315
+ threshold_hit = cum >= target_per_group and group_len >= min_len
316
+ force_close = group_len >= max_len
317
+ if len(groups) < target_groups - 1 and min_room_ok and (threshold_hit or force_close):
 
 
318
  groups.append((start, i))
319
  start = i + 1
320
  cum = 0.0
 
321
  if start <= n - 1:
322
  groups.append((start, n - 1))
323
  return groups
 
332
  - "global": one big group across the whole video — top-K global.
333
  - "<int>" (e.g. "4"/"8"/"16"): fixed group size in frames; the
334
  budget is split equally across groups, top-K picked within each.
335
+ - "dynamic": codec-stream-style adaptive groups (see
336
+ build_dynamic_groups), defaulting to roughly 8-64 frames/group.
337
 
338
  Returns (per-frame masks, actual selected count, [(start,end),...] groups, resolved_label).
339
  """
 
348
  return masks, actual, [(0, n - 1)], "global"
349
 
350
  if mode == "dynamic":
351
+ groups = build_dynamic_groups(grids)
352
+ resolved_label = "codec-stream"
353
  else:
354
  try:
355
  g_size = max(1, int(mode))
 
361
  end = min(n - 1, cursor + g_size - 1)
362
  groups.append((cursor, end))
363
  cursor = end + 1
364
+ resolved_label = mode
365
 
366
  num_groups = max(1, len(groups))
367
  target_k = max(0, int(total_k))
 
396
  for i, sm in enumerate(sub_masks):
397
  out_masks[s + i] = sm
398
  actual_total += sub_actual
399
+ return out_masks, actual_total, groups, resolved_label
400
 
401
 
402
  def faded_background(frame_bgr: np.ndarray, fade: float = 0.55) -> np.ndarray:
 
564
  return canvas, n_overlays
565
 
566
 
 
 
 
 
 
 
 
 
 
 
 
 
 
567
  def pack_canvases_per_group(
568
  frames: List[np.ndarray],
569
  masks: List[np.ndarray],
570
  groups: List[Tuple[int, int]],
571
  patch: int,
572
+ target_canvases: int = 1,
573
  ) -> Tuple[List[np.ndarray], List[Tuple[int, int, int]], int]:
574
+ """Pack exactly one IPPP canvas per GOP group.
 
575
 
576
+ Each group's first frame is kept whole as the I-frame, and the
577
+ remaining frames in that same group contribute only their selected
578
+ patches as P-frame overlays. `target_canvases` is kept only for API
579
+ compatibility and is ignored.
 
 
580
 
581
  Returns:
582
+ canvases — list of np.ndarray, length == number of groups.
 
583
  sub_ranges — list of (group_idx, sub_start, sub_end) parallel to
584
  canvases, for caption / debugging.
585
  total_selected — I-frame patches (counted as full grid) + P-frame
 
591
  if not groups or not frames:
592
  return [np.full((patch, patch, 3), 255, dtype=np.uint8)], [(0, 0, 0)], 0
593
 
 
 
594
  for g_idx, (s, e) in enumerate(groups):
595
  if s >= len(frames):
596
  continue
597
+ ss, ee = s, e
598
+ canvas, n_p_overlays = _build_ippp_canvas(
599
+ frames, masks, i_idx=ss, p_range=range(ss + 1, ee + 1),
600
+ patch=patch,
601
+ )
602
+ canvases.append(canvas)
603
+ sub_ranges.append((g_idx, ss, ee))
604
+ hb, wb = frames[ss].shape[0] // patch, frames[ss].shape[1] // patch
605
+ total_selected += hb * wb + n_p_overlays
 
 
 
 
 
 
 
 
 
 
 
 
 
 
606
 
607
  if not canvases:
608
  canvases = [np.full((patch, patch, 3), 255, dtype=np.uint8)]
 
736
  )
737
 
738
  n_groups = len(groups) if groups else 1
739
+ gop_str = gop_label if gop_label in ("global", "codec-stream") else f"GOP={gop_label}"
740
  ax.set_title(
741
  f"Cumulative patches selected over time · {saliency_signal} · "
742
  f"{gop_str} ({n_groups} groups)",
 
771
  bitcost_pct: float = 99.0,
772
  fade_strength: float = 0.55,
773
  gop: str = "global",
774
+ target_canvases: int = 1,
775
  progress=gr.Progress(track_tqdm=False),
776
  ):
777
  if not video_path:
 
866
  progress(0.85, desc="Packing canvases (IPPP)")
867
  canvases, sub_ranges, n_selected = pack_canvases_per_group(
868
  resized, masks, groups, int(patch_size),
869
+ target_canvases=1,
870
  )
871
  canvas_items: List[Tuple[str, str]] = []
872
  for idx, canv in enumerate(canvases):
 
907
  "bitcost_pct": float(bitcost_pct),
908
  "fade_strength": float(fade_strength),
909
  "gop": gop_resolved,
910
+ "canvas_policy": "one_ippp_canvas_per_group",
911
+ "i_frame_policy": "first_frame_full_in_each_group",
912
  },
913
  "gop_groups": [
914
  {
 
1500
  label="Visualization mode",
1501
  )
1502
  sample_frames = gr.Slider(
1503
+ 4, 64, value=32, step=1, label="Sampled frames",
1504
  )
1505
  top_k = gr.Slider(
1506
  16, 16384, value=1024, step=16,
 
1519
  ("GOP = 4 — fixed 4-frame groups", "4"),
1520
  ("GOP = 8 — fixed 8-frame groups", "8"),
1521
  ("GOP = 16 — fixed 16-frame groups", "16"),
1522
+ ("Codec-stream: adaptive groups by saliency energy", "dynamic"),
1523
  ],
1524
  value="8",
1525
  label="GOP (group of pictures)",
1526
+ info="Splits sampled frames into GOP groups. Each group "
1527
+ "produces exactly one IPPP canvas: the group's first "
1528
+ "frame stays whole as the I-frame, and later frames "
1529
+ "only contribute selected patches as P-updates. With "
1530
+ "32 sampled frames and GOP=8, this yields 4 canvases. "
1531
+ "Codec-stream mode adaptively groups by saliency "
1532
+ "energy, targeting roughly 8-64 sampled frames per group.",
 
 
 
 
 
1533
  )
1534
 
1535
  with gr.Accordion("Time window", open=False):
 
1629
  '<div id="ovc-footer">'
1630
  '<b>OneVision Encoder</b> · codec-style patch saliency demo · '
1631
  'Sobel + frame-diff stand in for the ffmpeg bitcost patch · '
1632
+ 'GOP-aware top-K patch selection with one IPPP canvas per group.'
1633
  '</div>'
1634
  )
1635
 
 
1640
  viz_mode, heatmap_alpha,
1641
  start_sec, end_sec,
1642
  saliency_signal, score_log_scale, bitcost_pct, fade_strength,
1643
+ gop,
1644
  ],
1645
  outputs=[vis_out, canvas_out, info_out, chart_out],
1646
  )
 
1652
  video_in, sample_frames, patch_size, top_k, max_pixels,
1653
  viz_mode, heatmap_alpha, start_sec, end_sec,
1654
  saliency_signal, score_log_scale, bitcost_pct, fade_strength,
1655
+ gop,
1656
  ],
1657
  )
1658