Spaces:

FeilongTang
/

OneVision-Encoder-Codec-View

Running

App Files Community

FeilongTang committed 24 days ago

Commit

257cddf

1 Parent(s): ff548c5

Use one IPPP canvas per GOP group

Browse files

Files changed (1) hide show

app.py +72 -96

app.py CHANGED Viewed

@@ -17,8 +17,9 @@ Pipeline (mirrors codec_tools/pipeline/process_video_bitcost_readiness.py):
     5. Render a "selection visualization" video: kept patches stay in
        full color, dropped patches are faded to a gray-white wash so the
        viewer can see exactly which patches the codec stage chose.
-    6. Pack the selected patches in time-order, raster scan, into a
-       single canvas image (the artifact LLaVA-OneVision2 consumes).
 """
 import json
@@ -49,7 +50,7 @@ DEMO_VIDEO_PATH = os.path.join(
 )
 DEMO_PRESET = (
     DEMO_VIDEO_PATH,  # video_in
-    16,               # sample_frames
     14,               # patch_size
     1024,             # total_patches
     150000,           # max_pixels
@@ -61,7 +62,6 @@ DEMO_PRESET = (
     96.0,             # bitcost_pct
     0.55,             # fade_strength
     "dynamic",        # gop
-    4,                # target_canvases
 )
@@ -258,27 +258,40 @@ def global_topk_masks(
 def build_dynamic_groups(
-    grids: List[np.ndarray], target_groups: int = 4, min_group_frames: int = 1,
 ) -> List[Tuple[int, int]]:
     """Adaptive temporal grouping by cumulative saliency energy.
-    Walk sampled frames in time order, accumulate frame-level score sums,
-    and close the current group once the running total reaches
-    `total_energy / target_groups`. Groups end up roughly equal in
-    *information content* rather than equal in frame count — this is the
-    same intuition as codec_tools' readiness mode, simplified for the
-    demo (no temporal-coverage / marginal-gain refinement)."""
     n = len(grids)
     if n == 0:
         return []
-    if n <= target_groups:
-        return [(i, i) for i in range(n)]
     energies = np.array([float(g.sum()) for g in grids], dtype=np.float64)
     total = energies.sum()
     if total <= 1e-8:
-        # Degenerate: pure even split.
-        size = max(1, n // target_groups)
         groups: List[Tuple[int, int]] = []
         cursor = 0
         while cursor < n and len(groups) < target_groups:
@@ -295,18 +308,16 @@ def build_dynamic_groups(
     cum = 0.0
     for i in range(n):
         cum += energies[i]
         groups_left = target_groups - len(groups) - 1
         frames_left_after = n - i - 1
-        # Close this group if energy budget hit AND we still leave room for
-        # the remaining groups (each needs >= min_group_frames frames).
-        threshold_hit = cum >= target_per_group
-        room_ok = frames_left_after >= groups_left * min_group_frames
-        size_ok = (i - start + 1) >= min_group_frames
-        if threshold_hit and room_ok and size_ok and len(groups) < target_groups - 1:
             groups.append((start, i))
             start = i + 1
             cum = 0.0
-    # Tail group (whatever frames remain).
     if start <= n - 1:
         groups.append((start, n - 1))
     return groups
@@ -321,8 +332,8 @@ def grouped_topk_masks(
       - "global": one big group across the whole video — top-K global.
       - "<int>" (e.g. "4"/"8"/"16"): fixed group size in frames; the
         budget is split equally across groups, top-K picked within each.
-      - "dynamic": adaptive groups (see build_dynamic_groups), targeting
-        4 groups by default; each group gets an equal share of the budget.
     Returns (per-frame masks, actual selected count, [(start,end),...] groups, resolved_label).
     """
@@ -337,7 +348,8 @@ def grouped_topk_masks(
         return masks, actual, [(0, n - 1)], "global"
     if mode == "dynamic":
-        groups = build_dynamic_groups(grids, target_groups=min(4, max(1, n)))
     else:
         try:
             g_size = max(1, int(mode))
@@ -349,6 +361,7 @@ def grouped_topk_masks(
             end = min(n - 1, cursor + g_size - 1)
             groups.append((cursor, end))
             cursor = end + 1
     num_groups = max(1, len(groups))
     target_k = max(0, int(total_k))
@@ -383,7 +396,7 @@ def grouped_topk_masks(
         for i, sm in enumerate(sub_masks):
             out_masks[s + i] = sm
         actual_total += sub_actual
-    return out_masks, actual_total, groups, mode
 def faded_background(frame_bgr: np.ndarray, fade: float = 0.55) -> np.ndarray:
@@ -551,39 +564,22 @@ def _build_ippp_canvas(
     return canvas, n_overlays
-def _allocate_canvases_per_group(
-    target_canvases: int, num_groups: int,
-) -> List[int]:
-    """Split a total target canvas count across N groups as evenly as
-    possible; the first `remainder` groups get +1 each."""
-    target = max(1, int(target_canvases))
-    n = max(1, int(num_groups))
-    base, rem = divmod(target, n)
-    out = [base + (1 if i < rem else 0) for i in range(n)]
-    # Floor to at least 1 canvas per group so no group is invisible.
-    return [max(1, x) for x in out]
 def pack_canvases_per_group(
     frames: List[np.ndarray],
     masks: List[np.ndarray],
     groups: List[Tuple[int, int]],
     patch: int,
-    target_canvases: int = 4,
 ) -> Tuple[List[np.ndarray], List[Tuple[int, int, int]], int]:
-    """Pack exactly `target_canvases` IPPP canvases for the whole video,
-    distributing them across GOP groups as evenly as possible.
-    Each group's frame range [s..e] is split into K consecutive sub-ranges
-    (K = canvases allocated to that group). Each sub-range [ss..ee] becomes
-    one canvas:
-      - frame ss is the I-frame: its whole image goes to the canvas top.
-      - frames ss+1..ee are P-frames: only saliency-selected patches go
-        below the I-frame, packed time-major in a wb-wide raster grid.
     Returns:
-      canvases       — list of np.ndarray, length == target_canvases
-                       (or fewer if some groups have only 1 frame).
       sub_ranges     — list of (group_idx, sub_start, sub_end) parallel to
                        canvases, for caption / debugging.
       total_selected — I-frame patches (counted as full grid) + P-frame
@@ -595,34 +591,18 @@ def pack_canvases_per_group(
     if not groups or not frames:
         return [np.full((patch, patch, 3), 255, dtype=np.uint8)], [(0, 0, 0)], 0
-    per_group_counts = _allocate_canvases_per_group(target_canvases, len(groups))
     for g_idx, (s, e) in enumerate(groups):
         if s >= len(frames):
             continue
-        group_len = e - s + 1
-        k = max(1, min(per_group_counts[g_idx], group_len))
-        # Split [s..e] into k consecutive sub-ranges of (almost) equal size.
-        base, rem = divmod(group_len, k)
-        cursor = s
-        for sub_i in range(k):
-            sub_len = base + (1 if sub_i < rem else 0)
-            ss = cursor
-            ee = min(e, cursor + sub_len - 1)
-            cursor = ee + 1
-            canvas, n_p_overlays = _build_ippp_canvas(
-                frames, masks, i_idx=ss, p_range=range(ss + 1, ee + 1),
-                patch=patch,
-            )
-            canvases.append(canvas)
-            sub_ranges.append((g_idx, ss, ee))
-            # Accounting:
-            #   - I-frame counts as the full grid (anchor, every position
-            #     starts from it).
-            #   - Each P-frame overlay is +1 (positions may be overlaid
-            #     multiple times by later P-frames; we count each hit).
-            hb, wb = frames[ss].shape[0] // patch, frames[ss].shape[1] // patch
-            total_selected += hb * wb + n_p_overlays
     if not canvases:
         canvases = [np.full((patch, patch, 3), 255, dtype=np.uint8)]
@@ -756,7 +736,7 @@ def make_charts(
             )
     n_groups = len(groups) if groups else 1
-    gop_str = gop_label if gop_label in ("global", "dynamic") else f"GOP={gop_label}"
     ax.set_title(
         f"Cumulative patches selected over time · {saliency_signal} · "
         f"{gop_str} ({n_groups} groups)",
@@ -791,7 +771,7 @@ def process(
     bitcost_pct: float = 99.0,
     fade_strength: float = 0.55,
     gop: str = "global",
-    target_canvases: int = 4,
     progress=gr.Progress(track_tqdm=False),
 ):
     if not video_path:
@@ -886,7 +866,7 @@ def process(
     progress(0.85, desc="Packing canvases (IPPP)")
     canvases, sub_ranges, n_selected = pack_canvases_per_group(
         resized, masks, groups, int(patch_size),
-        target_canvases=int(target_canvases),
     )
     canvas_items: List[Tuple[str, str]] = []
     for idx, canv in enumerate(canvases):
@@ -927,7 +907,8 @@ def process(
             "bitcost_pct": float(bitcost_pct),
             "fade_strength": float(fade_strength),
             "gop": gop_resolved,
-            "target_canvases": int(target_canvases),
         },
         "gop_groups": [
             {
@@ -1519,7 +1500,7 @@ with gr.Blocks(**_BLOCK_KW) as demo:
                     label="Visualization mode",
                 )
                 sample_frames = gr.Slider(
-                    4, 64, value=16, step=1, label="Sampled frames",
                 )
                 top_k = gr.Slider(
                     16, 16384, value=1024, step=16,
@@ -1538,22 +1519,17 @@ with gr.Blocks(**_BLOCK_KW) as demo:
                         ("GOP = 4 — fixed 4-frame groups",                "4"),
                         ("GOP = 8 — fixed 8-frame groups",                "8"),
                         ("GOP = 16 — fixed 16-frame groups",              "16"),
-                        ("Dynamic — adaptive groups by saliency energy",  "dynamic"),
                     ],
                     value="8",
                     label="GOP (group of pictures)",
-                    info="Splits sampled frames into groups; the patch budget "
-                         "is allocated equally across groups, top-K within "
-                         "each. Dynamic mode mirrors codec_tools' readiness "
-                         "grouping (equal-energy groups).",
-                )
-                target_canvases = gr.Slider(
-                    1, 16, value=4, step=1,
-                    label="Target canvases (total per video)",
-                    info="Fixed canvas count regardless of GOP. The budget is "
-                         "split across groups; each group is further sliced "
-                         "into sub-ranges of consecutive frames, one IPPP "
-                         "canvas per sub-range.",
                 )
             with gr.Accordion("Time window", open=False):
@@ -1653,7 +1629,7 @@ with gr.Blocks(**_BLOCK_KW) as demo:
         '<div id="ovc-footer">'
         '<b>OneVision Encoder</b> · codec-style patch saliency demo · '
         'Sobel + frame-diff stand in for the ffmpeg bitcost patch · '
-        'global top-K selection across all sampled frames.'
         '</div>'
     )
@@ -1664,7 +1640,7 @@ with gr.Blocks(**_BLOCK_KW) as demo:
             viz_mode, heatmap_alpha,
             start_sec, end_sec,
             saliency_signal, score_log_scale, bitcost_pct, fade_strength,
-            gop, target_canvases,
         ],
         outputs=[vis_out, canvas_out, info_out, chart_out],
     )
@@ -1676,7 +1652,7 @@ with gr.Blocks(**_BLOCK_KW) as demo:
             video_in, sample_frames, patch_size, top_k, max_pixels,
             viz_mode, heatmap_alpha, start_sec, end_sec,
             saliency_signal, score_log_scale, bitcost_pct, fade_strength,
-            gop, target_canvases,
         ],
     )

     5. Render a "selection visualization" video: kept patches stay in
        full color, dropped patches are faded to a gray-white wash so the
        viewer can see exactly which patches the codec stage chose.
+    6. Pack one canvas per GOP group: the first frame of each group is
+       kept whole as the I-frame, and later frames only overwrite their
+       selected patches as P-frame updates.
 """
 import json
 )
 DEMO_PRESET = (
     DEMO_VIDEO_PATH,  # video_in
+    32,               # sample_frames
     14,               # patch_size
     1024,             # total_patches
     150000,           # max_pixels
     96.0,             # bitcost_pct
     0.55,             # fade_strength
     "dynamic",        # gop
 )
 def build_dynamic_groups(
+    grids: List[np.ndarray],
+    min_group_frames: int = 8,
+    max_group_frames: int = 64,
+    preferred_group_frames: int = 32,
 ) -> List[Tuple[int, int]]:
     """Adaptive temporal grouping by cumulative saliency energy.
+    Groups are energy-adaptive, but constrained to a practical codec-stream
+    range: by default each group spans roughly 8-64 sampled frames, with a
+    preference around 32 frames/group. Each group later becomes exactly one
+    IPPP canvas whose first frame is kept whole as the I-frame."""
     n = len(grids)
     if n == 0:
         return []
+    min_len = max(1, int(min_group_frames))
+    max_len = max(min_len, int(max_group_frames))
+    preferred = min(max_len, max(min_len, int(preferred_group_frames)))
+    if n <= max_len:
+        return [(0, n - 1)]
+    min_groups = max(1, math.ceil(n / max_len))
+    max_groups = max(1, n // min_len)
+    target_groups = max(1, math.ceil(n / preferred))
+    target_groups = min(max(target_groups, min_groups), max_groups)
+    if target_groups <= 1:
+        return [(0, n - 1)]
     energies = np.array([float(g.sum()) for g in grids], dtype=np.float64)
     total = energies.sum()
     if total <= 1e-8:
+        # Degenerate: pure even split, still respecting the group-size range.
+        size = max(min_len, min(max_len, math.ceil(n / target_groups)))
         groups: List[Tuple[int, int]] = []
         cursor = 0
         while cursor < n and len(groups) < target_groups:
     cum = 0.0
     for i in range(n):
         cum += energies[i]
+        group_len = i - start + 1
         groups_left = target_groups - len(groups) - 1
         frames_left_after = n - i - 1
+        min_room_ok = frames_left_after >= groups_left * min_len
+        threshold_hit = cum >= target_per_group and group_len >= min_len
+        force_close = group_len >= max_len
+        if len(groups) < target_groups - 1 and min_room_ok and (threshold_hit or force_close):
             groups.append((start, i))
             start = i + 1
             cum = 0.0
     if start <= n - 1:
         groups.append((start, n - 1))
     return groups
       - "global": one big group across the whole video — top-K global.
       - "<int>" (e.g. "4"/"8"/"16"): fixed group size in frames; the
         budget is split equally across groups, top-K picked within each.
+      - "dynamic": codec-stream-style adaptive groups (see
+        build_dynamic_groups), defaulting to roughly 8-64 frames/group.
     Returns (per-frame masks, actual selected count, [(start,end),...] groups, resolved_label).
     """
         return masks, actual, [(0, n - 1)], "global"
     if mode == "dynamic":
+        groups = build_dynamic_groups(grids)
+        resolved_label = "codec-stream"
     else:
         try:
             g_size = max(1, int(mode))
             end = min(n - 1, cursor + g_size - 1)
             groups.append((cursor, end))
             cursor = end + 1
+        resolved_label = mode
     num_groups = max(1, len(groups))
     target_k = max(0, int(total_k))
         for i, sm in enumerate(sub_masks):
             out_masks[s + i] = sm
         actual_total += sub_actual
+    return out_masks, actual_total, groups, resolved_label
 def faded_background(frame_bgr: np.ndarray, fade: float = 0.55) -> np.ndarray:
     return canvas, n_overlays
 def pack_canvases_per_group(
     frames: List[np.ndarray],
     masks: List[np.ndarray],
     groups: List[Tuple[int, int]],
     patch: int,
+    target_canvases: int = 1,
 ) -> Tuple[List[np.ndarray], List[Tuple[int, int, int]], int]:
+    """Pack exactly one IPPP canvas per GOP group.
+    Each group's first frame is kept whole as the I-frame, and the
+    remaining frames in that same group contribute only their selected
+    patches as P-frame overlays. `target_canvases` is kept only for API
+    compatibility and is ignored.
     Returns:
+      canvases       — list of np.ndarray, length == number of groups.
       sub_ranges     — list of (group_idx, sub_start, sub_end) parallel to
                        canvases, for caption / debugging.
       total_selected — I-frame patches (counted as full grid) + P-frame
     if not groups or not frames:
         return [np.full((patch, patch, 3), 255, dtype=np.uint8)], [(0, 0, 0)], 0
     for g_idx, (s, e) in enumerate(groups):
         if s >= len(frames):
             continue
+        ss, ee = s, e
+        canvas, n_p_overlays = _build_ippp_canvas(
+            frames, masks, i_idx=ss, p_range=range(ss + 1, ee + 1),
+            patch=patch,
+        )
+        canvases.append(canvas)
+        sub_ranges.append((g_idx, ss, ee))
+        hb, wb = frames[ss].shape[0] // patch, frames[ss].shape[1] // patch
+        total_selected += hb * wb + n_p_overlays
     if not canvases:
         canvases = [np.full((patch, patch, 3), 255, dtype=np.uint8)]
             )
     n_groups = len(groups) if groups else 1
+    gop_str = gop_label if gop_label in ("global", "codec-stream") else f"GOP={gop_label}"
     ax.set_title(
         f"Cumulative patches selected over time · {saliency_signal} · "
         f"{gop_str} ({n_groups} groups)",
     bitcost_pct: float = 99.0,
     fade_strength: float = 0.55,
     gop: str = "global",
+    target_canvases: int = 1,
     progress=gr.Progress(track_tqdm=False),
 ):
     if not video_path:
     progress(0.85, desc="Packing canvases (IPPP)")
     canvases, sub_ranges, n_selected = pack_canvases_per_group(
         resized, masks, groups, int(patch_size),
+        target_canvases=1,
     )
     canvas_items: List[Tuple[str, str]] = []
     for idx, canv in enumerate(canvases):
             "bitcost_pct": float(bitcost_pct),
             "fade_strength": float(fade_strength),
             "gop": gop_resolved,
+            "canvas_policy": "one_ippp_canvas_per_group",
+            "i_frame_policy": "first_frame_full_in_each_group",
         },
         "gop_groups": [
             {
                     label="Visualization mode",
                 )
                 sample_frames = gr.Slider(
+                    4, 64, value=32, step=1, label="Sampled frames",
                 )
                 top_k = gr.Slider(
                     16, 16384, value=1024, step=16,
                         ("GOP = 4 — fixed 4-frame groups",                "4"),
                         ("GOP = 8 — fixed 8-frame groups",                "8"),
                         ("GOP = 16 — fixed 16-frame groups",              "16"),
+                        ("Codec-stream: adaptive groups by saliency energy", "dynamic"),
                     ],
                     value="8",
                     label="GOP (group of pictures)",
+                    info="Splits sampled frames into GOP groups. Each group "
+                         "produces exactly one IPPP canvas: the group's first "
+                         "frame stays whole as the I-frame, and later frames "
+                         "only contribute selected patches as P-updates. With "
+                         "32 sampled frames and GOP=8, this yields 4 canvases. "
+                         "Codec-stream mode adaptively groups by saliency "
+                         "energy, targeting roughly 8-64 sampled frames per group.",
                 )
             with gr.Accordion("Time window", open=False):
         '<div id="ovc-footer">'
         '<b>OneVision Encoder</b> · codec-style patch saliency demo · '
         'Sobel + frame-diff stand in for the ffmpeg bitcost patch · '
+        'GOP-aware top-K patch selection with one IPPP canvas per group.'
         '</div>'
     )
             viz_mode, heatmap_alpha,
             start_sec, end_sec,
             saliency_signal, score_log_scale, bitcost_pct, fade_strength,
+            gop,
         ],
         outputs=[vis_out, canvas_out, info_out, chart_out],
     )
             video_in, sample_frames, patch_size, top_k, max_pixels,
             viz_mode, heatmap_alpha, start_sec, end_sec,
             saliency_signal, score_log_scale, bitcost_pct, fade_strength,
+            gop,
         ],
     )