Spaces:

FeilongTang
/

OneVision-Encoder-Codec-View

Running

FeilongTang committed 27 days ago

Commit

33347ae

1 Parent(s): 67e22fc

Add GOP parameter: fixed (4/8/16) and dynamic (energy-adaptive)

Mirrors codec_tools' grouping_mode {fixed, readiness} for the
patch-selection stage of the demo.

- global : one group across the whole video (current behavior).
- 4/8/16 : fixed-size groups; budget split equally, top-K within
each group. Forces patches to spread along the timeline
instead of clumping on the single most salient moment.
- dynamic : adaptive groups via build_dynamic_groups, walking sampled
frames in time order and closing each group when its
cumulative score sum reaches total_energy / target_groups.
This is the simplified readiness rule from
codec_tools/pipeline/process_video_bitcost_mv_mask_collage.py
(equal-energy groups, no coverage_bins / delta-ratio
refinement — this is a demo).

The codec-vs-uniform chart now shows group boundaries as dashed
vertical lines on the codec panel, and the title carries the
GOP label and group count. Run info JSON gains:
- params.gop (resolved label)
- gop_groups[] ({start, end, n_frames, selected})

Files changed (1) hide show

app.py +159 -2

app.py CHANGED Viewed

@@ -208,6 +208,114 @@ def global_topk_masks(
     return [bool_mask[i].astype(np.uint8) for i in range(N)], actual
 def faded_background(frame_bgr: np.ndarray, fade: float = 0.55) -> np.ndarray:
     """Convert to gray-white wash: gray * (1-fade) + white * fade."""
     gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
@@ -352,6 +460,8 @@ def make_charts(
     total_duration_sec: float,
     total_patches_budget: int,
     saliency_signal: str,
 ):
     """Two side-by-side panels comparing codec selection vs uniform sampling.
@@ -389,8 +499,13 @@ def make_charts(
         edgecolor="#312e81", linewidth=0.4,
     )
     total_selected = sum(counts)
     ax1.set_title(
-        f"Codec selection · {saliency_signal} · {total_selected} patches",
         fontsize=10, color="#1e293b",
     )
     ax1.set_xlabel("time (s)", fontsize=9)
@@ -400,6 +515,18 @@ def make_charts(
     ax1.grid(True, alpha=0.25, linestyle="--", axis="y")
     ax1.spines[["top", "right"]].set_visible(False)
     # ─── Right: uniform-sampling baseline at the same budget ────────────
     n_uniform = max(1, int(total_patches_budget // max(1, grid_size)))
     uniform_times = (
@@ -443,6 +570,7 @@ def process(
     score_log_scale: bool = False,
     bitcost_pct: float = 99.0,
     fade_strength: float = 0.55,
     progress=gr.Progress(track_tqdm=False),
 ):
     if not video_path:
@@ -501,7 +629,9 @@ def process(
     grids = compute_score_grids(resized, int(patch_size), saliency_signal)
     if score_log_scale:
         grids = [np.log1p(np.clip(g, 0.0, None)) for g in grids]
-    masks, actual_selected = global_topk_masks(grids, int(total_patches))
     norm_scores = _normalize_scores(grids, pct=float(bitcost_pct))
     mode = (viz_mode or "selection").lower()
@@ -553,7 +683,17 @@ def process(
             "score_log_scale": bool(score_log_scale),
             "bitcost_pct": float(bitcost_pct),
             "fade_strength": float(fade_strength),
         },
         "frame_window": {
             "first_decoded": int(f_start),
             "last_decoded": int(f_end),
@@ -586,6 +726,7 @@ def process(
     chart_fig = make_charts(
         grids, masks, fids, fps, duration_sec,
         int(total_patches), saliency_signal,
     )
     progress(1.0, desc="Done")
@@ -873,6 +1014,21 @@ with gr.Blocks(**_BLOCK_KW) as demo:
                 patch_size = gr.Radio(
                     PATCH_CHOICES, value=14, label="Patch size (px)",
                 )
             with gr.Group(elem_classes="ovc-card"):
                 gr.Markdown("### Quick presets")
@@ -993,6 +1149,7 @@ with gr.Blocks(**_BLOCK_KW) as demo:
             viz_mode, heatmap_alpha,
             start_sec, end_sec,
             saliency_signal, score_log_scale, bitcost_pct, fade_strength,
         ],
         outputs=[vis_out, canvas_out, info_out, chart_out],
     )

     return [bool_mask[i].astype(np.uint8) for i in range(N)], actual
def build_dynamic_groups(
    grids: List[np.ndarray], target_groups: int = 4, min_group_frames: int = 1,
) -> List[Tuple[int, int]]:
    """Partition frames into contiguous groups of roughly equal saliency energy.

    Frames are walked in order while their per-frame score sums are
    accumulated. The current group is closed once its own running sum
    reaches ``total_energy / target_groups``, provided enough frames remain
    for the groups still to be opened. Groups therefore tend to be equal in
    score mass rather than in frame count. Because any overshoot is dropped
    when a group closes, fewer than ``target_groups`` groups may be returned.

    Args:
        grids: Per-frame score arrays, in time order. Only ``g.sum()`` of
            each array is used.
        target_groups: Desired number of groups. Values below 1 are treated
            as 1 (a single group) instead of dividing by zero.
        min_group_frames: Minimum number of frames a group must contain
            before it may be closed; also the room reserved for every group
            that still has to be opened.

    Returns:
        A list of inclusive ``(start_index, end_index)`` pairs that cover
        ``0 .. len(grids) - 1`` without gaps or overlap. Empty input yields
        an empty list; when there are no more frames than requested groups,
        every frame becomes its own group.
    """
    n = len(grids)
    if n == 0:
        return []
    # A zero or negative request would otherwise hit a division by zero
    # below; collapsing it to a single group is the only sensible reading.
    target_groups = max(1, int(target_groups))
    if n <= target_groups:
        return [(i, i) for i in range(n)]

    energies = np.array([float(g.sum()) for g in grids], dtype=np.float64)
    total = energies.sum()

    if total <= 1e-8:
        # Degenerate (no usable energy): fall back to an even split by frame
        # count. The remainder is spread one frame at a time over the
        # leading groups so no single group absorbs all leftover frames.
        base, extra = divmod(n, target_groups)
        even_groups: List[Tuple[int, int]] = []
        cursor = 0
        for k in range(target_groups):
            size = base + (1 if k < extra else 0)
            even_groups.append((cursor, cursor + size - 1))
            cursor += size
        return even_groups

    target_per_group = total / target_groups
    groups: List[Tuple[int, int]] = []
    start = 0
    cum = 0.0
    for i in range(n):
        cum += energies[i]
        # Groups that must still be opened after the current one.
        groups_left = target_groups - len(groups) - 1
        frames_left_after = n - i - 1
        # Close this group only if the energy budget is hit AND the frames
        # after it can still host the remaining groups AND the group itself
        # is long enough. The final group is never closed here: it is the
        # tail appended after the loop.
        threshold_hit = cum >= target_per_group
        room_ok = frames_left_after >= groups_left * min_group_frames
        size_ok = (i - start + 1) >= min_group_frames
        if threshold_hit and room_ok and size_ok and len(groups) < target_groups - 1:
            groups.append((start, i))
            start = i + 1
            cum = 0.0
    # Tail group (whatever frames remain).
    if start <= n - 1:
        groups.append((start, n - 1))
    return groups
def grouped_topk_masks(
    grids: List[np.ndarray], total_k: int, gop: str,
) -> Tuple[List[np.ndarray], int, List[Tuple[int, int]], str]:
    """Select patches under a GOP (group-of-pictures) grouping strategy.

    GOP modes (case-insensitive, surrounding whitespace ignored):
      - "global" / "none" / "0" / "" / None: a single group spanning every
        frame; selection is delegated to ``global_topk_masks``.
      - "<int>" (e.g. "4"/"8"/"16"): fixed group size in frames. Values that
        cannot be parsed as an integer, or that are not positive, fall back
        to the global mode.
      - "dynamic": groups from ``build_dynamic_groups``, targeting at most
        4 groups.

    In the grouped modes the budget is split as evenly as possible: each
    group receives ``total_k // num_groups`` patches and the remaining
    ``total_k % num_groups`` patches go, one each, to the groups with the
    highest summed score (ties favour earlier groups). The sum of the
    per-group quotas therefore never exceeds ``total_k``; groups whose
    quota is zero keep all-zero masks.

    Args:
        grids: Per-frame score arrays, in time order.
        total_k: Total patch budget across all frames. Negative values are
            treated as 0 in the grouped modes.
        gop: Grouping mode, as described above.

    Returns:
        ``(masks, actual_selected, groups, resolved_label)`` where ``masks``
        holds one array per frame, ``actual_selected`` is the sum of the
        counts reported by ``global_topk_masks``, ``groups`` is a list of
        inclusive ``(start, end)`` frame-index pairs, and ``resolved_label``
        is "global", "dynamic", or the group size as a decimal string. For
        empty ``grids`` the result is ``([], 0, [], gop)``.
    """
    n = len(grids)
    if n == 0:
        return [], 0, [], gop
    mode = (gop or "global").strip().lower()

    # Resolve a fixed group size; 0 means "no usable fixed size".
    g_size = 0
    if mode not in ("global", "none", "0", "", "dynamic"):
        try:
            g_size = int(mode)
        except ValueError:
            g_size = 0

    if mode != "dynamic" and g_size <= 0:
        # Explicit global request, or an unusable size: report it as what it
        # really is so downstream labels do not show a bogus GOP value.
        masks, actual = global_topk_masks(grids, int(total_k))
        return masks, actual, [(0, n - 1)], "global"

    if mode == "dynamic":
        groups = build_dynamic_groups(grids, target_groups=min(4, n))
        label = "dynamic"
    else:
        groups = [
            (first, min(n - 1, first + g_size - 1))
            for first in range(0, n, g_size)
        ]
        label = str(g_size)
    if not groups:
        groups = [(0, n - 1)]

    # Exact budget split: base share for everyone, spare patches to the
    # most salient groups. Unlike a floor-with-minimum-of-one rule this
    # never hands out more than the requested budget.
    budget = max(0, int(total_k))
    base, extra = divmod(budget, len(groups))
    quotas = [base] * len(groups)
    if extra:
        group_energy = [
            sum(float(g.sum()) for g in grids[s:e + 1]) for (s, e) in groups
        ]
        # sorted() is stable, so equal-energy groups keep time order.
        richest = sorted(range(len(groups)), key=lambda gi: -group_energy[gi])
        for gi in richest[:extra]:
            quotas[gi] += 1

    # Start from empty masks, then fill in each group's selection.
    out_masks = [np.zeros(g.shape, dtype=np.uint8) for g in grids]
    actual_total = 0
    for (s, e), quota in zip(groups, quotas):
        if quota <= 0:
            continue
        sub_masks, sub_actual = global_topk_masks(grids[s:e + 1], quota)
        for offset, sub_mask in enumerate(sub_masks):
            out_masks[s + offset] = sub_mask
        actual_total += sub_actual
    return out_masks, actual_total, groups, label
 def faded_background(frame_bgr: np.ndarray, fade: float = 0.55) -> np.ndarray:
     """Convert to gray-white wash: gray * (1-fade) + white * fade."""
     gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
     total_duration_sec: float,
     total_patches_budget: int,
     saliency_signal: str,
+    groups: List[Tuple[int, int]] = None,
+    gop_label: str = "global",
 ):
     """Two side-by-side panels comparing codec selection vs uniform sampling.
         edgecolor="#312e81", linewidth=0.4,
     )
     total_selected = sum(counts)
+    n_groups = len(groups) if groups else 1
+    gop_str = (
+        gop_label if gop_label in ("global", "dynamic") else f"GOP={gop_label}"
+    )
     ax1.set_title(
+        f"Codec selection · {saliency_signal} · {gop_str} "
+        f"({n_groups} groups) · {total_selected} patches",
         fontsize=10, color="#1e293b",
     )
     ax1.set_xlabel("time (s)", fontsize=9)
     ax1.grid(True, alpha=0.25, linestyle="--", axis="y")
     ax1.spines[["top", "right"]].set_visible(False)
+    # Group boundary lines (skip if there's just one big group).
+    if groups and len(groups) > 1 and times:
+        for (_, end_idx) in groups[:-1]:
+            if end_idx + 1 < len(times):
+                bx = (times[end_idx] + times[end_idx + 1]) / 2.0
+            else:
+                bx = times[end_idx] + bar_w
+            ax1.axvline(
+                bx, color="#94a3b8", linestyle=(0, (4, 3)),
+                alpha=0.55, linewidth=0.9,
+            )
     # ─── Right: uniform-sampling baseline at the same budget ────────────
     n_uniform = max(1, int(total_patches_budget // max(1, grid_size)))
     uniform_times = (
     score_log_scale: bool = False,
     bitcost_pct: float = 99.0,
     fade_strength: float = 0.55,
+    gop: str = "global",
     progress=gr.Progress(track_tqdm=False),
 ):
     if not video_path:
     grids = compute_score_grids(resized, int(patch_size), saliency_signal)
     if score_log_scale:
         grids = [np.log1p(np.clip(g, 0.0, None)) for g in grids]
+    masks, actual_selected, groups, gop_resolved = grouped_topk_masks(
+        grids, int(total_patches), str(gop or "global"),
+    )
     norm_scores = _normalize_scores(grids, pct=float(bitcost_pct))
     mode = (viz_mode or "selection").lower()
             "score_log_scale": bool(score_log_scale),
             "bitcost_pct": float(bitcost_pct),
             "fade_strength": float(fade_strength),
+            "gop": gop_resolved,
         },
+        "gop_groups": [
+            {
+                "start_frame_idx": int(s),
+                "end_frame_idx": int(e),
+                "n_frames": int(e - s + 1),
+                "selected": int(sum(int(m.sum()) for m in masks[s:e + 1])),
+            }
+            for (s, e) in groups
+        ],
         "frame_window": {
             "first_decoded": int(f_start),
             "last_decoded": int(f_end),
     chart_fig = make_charts(
         grids, masks, fids, fps, duration_sec,
         int(total_patches), saliency_signal,
+        groups=groups, gop_label=gop_resolved,
     )
     progress(1.0, desc="Done")
                 patch_size = gr.Radio(
                     PATCH_CHOICES, value=14, label="Patch size (px)",
                 )
+                gop = gr.Radio(
+                    [
+                        ("Global — one budget across the whole video",   "global"),
+                        ("GOP = 4 — fixed 4-frame groups",                "4"),
+                        ("GOP = 8 — fixed 8-frame groups",                "8"),
+                        ("GOP = 16 — fixed 16-frame groups",              "16"),
+                        ("Dynamic — adaptive groups by saliency energy",  "dynamic"),
+                    ],
+                    value="global",
+                    label="GOP (group of pictures)",
+                    info="Splits sampled frames into groups; the patch budget "
+                         "is allocated equally across groups, top-K within "
+                         "each. Dynamic mode mirrors codec_tools' readiness "
+                         "grouping (equal-energy groups).",
+                )
             with gr.Group(elem_classes="ovc-card"):
                 gr.Markdown("### Quick presets")
             viz_mode, heatmap_alpha,
             start_sec, end_sec,
             saliency_signal, score_log_scale, bitcost_pct, fade_strength,
+            gop,
         ],
         outputs=[vis_out, canvas_out, info_out, chart_out],
     )