Spaces:

FeilongTang
/

OneVision-Encoder-Codec-View

Running

FeilongTang committed 25 days ago

Commit

047a9df

1 Parent(s): 901e5ca

Add 'Target canvases (total per video)' slider, default 4

Decouples the canvas count from the GOP grouping. The total is split
across GOP groups as evenly as possible (the first `remainder` groups
get +1 each, with a floor of 1 per group so nothing is invisible; when
there are more groups than target canvases, this floor pushes the actual
total above the target). Each group's frame range is then sliced into k
consecutive sub-ranges (k is capped at the group's frame count), one
IPPP canvas per sub-range.

Examples (16 sampled frames, target=4):
GOP=4 -> 4 groups × 1 canvas = 4
GOP=8 -> 2 groups × 2 canvases = 4
GOP=16 -> 1 group × 4 canvases = 4
Dynamic -> 4 groups × 1 canvas = 4

API
- pack_canvases_per_group() now returns (canvases, sub_ranges,
n_selected) and takes target_canvases.
- Caption switches from 'Group K/N' to 'Canvas K/N · group G · I@#s
+ p P-frames' so the sub-range origin is visible.
- Run info JSON 'canvases' entries get 'sub_range', and the params
block gains 'target_canvases'.
- DEMO_PRESET extended by one value.

Files changed (1) hide show

app.py +121 -58

app.py CHANGED Viewed

@@ -58,6 +58,7 @@ DEMO_PRESET = (
     96.0,             # bitcost_pct
     0.55,             # fade_strength
     "dynamic",        # gop
 )
@@ -447,68 +448,118 @@ def write_mp4(frames: List[np.ndarray], path: str, fps: float) -> None:
             proc.kill()
 def pack_canvases_per_group(
     frames: List[np.ndarray],
     masks: List[np.ndarray],
     groups: List[Tuple[int, int]],
     patch: int,
-) -> Tuple[List[np.ndarray], int]:
-    """One canvas per GOP group, structured as I-frame + P-frame patches
-    (IPPP order, matching the codec convention).
-    Within each group [s..e]:
-      - frame s is the I-frame: its WHOLE image is laid down as the top
-        of the canvas (the anchor / keyframe).
-      - frames s+1..e are P-frames: only their selected patches go below
-        the I-frame, packed in time-order, raster scan, in a wb-wide grid.
-    The canvas width is locked to the frame's patch-grid width so the
-    I-frame slots in cleanly and the P-grid below aligns block-for-block.
     """
     canvases: List[np.ndarray] = []
     total_selected = 0
-    for (s, e) in groups:
-        if s >= len(frames):
-            continue
-        i_frame = frames[s]
-        h, w = i_frame.shape[:2]
-        hb, wb = h // patch, w // patch
-        canvas_w = wb * patch
-        # I-frame block (already a multiple of patch from smart_resize).
-        i_block = i_frame[: hb * patch, : canvas_w].copy()
-        total_selected += hb * wb  # I-frame counts as fully kept.
-        # Collect selected patches from P-frames (s+1..e), time-major.
-        p_patches: List[np.ndarray] = []
-        for k in range(s + 1, e + 1):
-            if k >= len(frames):
-                break
-            f, m = frames[k], masks[k]
-            for i in range(m.shape[0]):
-                for j in range(m.shape[1]):
-                    if m[i, j]:
-                        p_patches.append(
-                            f[i * patch:(i + 1) * patch, j * patch:(j + 1) * patch].copy()
-                        )
-        total_selected += len(p_patches)
-        if not p_patches:
-            canvases.append(i_block)
-            continue
-        # Lay P-patches in a wb-wide grid below the I-frame.
-        rows = (len(p_patches) + wb - 1) // wb
-        p_grid = np.full((rows * patch, canvas_w, 3), 255, dtype=np.uint8)
-        for idx, p in enumerate(p_patches):
-            r, c = divmod(idx, wb)
-            p_grid[r * patch:(r + 1) * patch, c * patch:(c + 1) * patch] = p
-        canvas = np.vstack([i_block, p_grid])
-        canvases.append(canvas)
     if not canvases:
         canvases = [np.full((patch, patch, 3), 255, dtype=np.uint8)]
-    return canvases, total_selected
 def make_charts(
@@ -630,6 +681,7 @@ def process(
     bitcost_pct: float = 99.0,
     fade_strength: float = 0.55,
     gop: str = "global",
     progress=gr.Progress(track_tqdm=False),
 ):
     if not video_path:
@@ -721,19 +773,20 @@ def process(
     vis_fps = max(2.0, min(8.0, (meta.get("fps") or 25.0) / 4.0))
     write_mp4(vis, vis_path, vis_fps)
-    progress(0.85, desc="Packing canvases (one per GOP group)")
-    canvases, n_selected = pack_canvases_per_group(
         resized, masks, groups, int(patch_size),
     )
     canvas_items: List[Tuple[str, str]] = []
     for idx, canv in enumerate(canvases):
         cp = os.path.join(out_dir, f"canvas_{idx:03d}.png")
         cv2.imwrite(cp, canv)
-        s_idx, e_idx = groups[idx] if idx < len(groups) else (idx, idx)
-        n_p = max(0, e_idx - s_idx)  # number of P-frames in this group
         caption = (
-            f"Group {idx + 1}/{len(canvases)} · I-frame @ sampled #{s_idx} "
-            f"+ {n_p} P-frame{'s' if n_p != 1 else ''}"
         )
         canvas_items.append((cp, caption))
@@ -754,6 +807,7 @@ def process(
             "bitcost_pct": float(bitcost_pct),
             "fade_strength": float(fade_strength),
             "gop": gop_resolved,
         },
         "gop_groups": [
             {
@@ -787,7 +841,8 @@ def process(
             {
                 "index": i,
                 "size": f"{canvases[i].shape[1]}x{canvases[i].shape[0]}",
-                "group": list(groups[i]) if i < len(groups) else None,
                 "structure": "IPPP — first frame full (I), rest contribute "
                              "only their selected patches (P).",
             }
@@ -1249,6 +1304,14 @@ with gr.Blocks(**_BLOCK_KW) as demo:
                          "each. Dynamic mode mirrors codec_tools' readiness "
                          "grouping (equal-energy groups).",
                 )
             with gr.Accordion("Time window", open=False):
                 with gr.Row():
@@ -1355,7 +1418,7 @@ with gr.Blocks(**_BLOCK_KW) as demo:
             viz_mode, heatmap_alpha,
             start_sec, end_sec,
             saliency_signal, score_log_scale, bitcost_pct, fade_strength,
-            gop,
         ],
         outputs=[vis_out, canvas_out, info_out, chart_out],
     )
@@ -1367,7 +1430,7 @@ with gr.Blocks(**_BLOCK_KW) as demo:
             video_in, sample_frames, patch_size, top_k, max_pixels,
             viz_mode, heatmap_alpha, start_sec, end_sec,
             saliency_signal, score_log_scale, bitcost_pct, fade_strength,
-            gop,
         ],
     )

     96.0,             # bitcost_pct
     0.55,             # fade_strength
     "dynamic",        # gop
+    4,                # target_canvases
 )
             proc.kill()
def _build_ippp_canvas(
    frames: List[np.ndarray], masks: List[np.ndarray],
    i_idx: int, p_range: range, patch: int,
) -> np.ndarray:
    """Build one IPPP canvas: full I-frame on top, selected P-frame patches
    in a wb-wide raster grid below.

    Args:
        frames: Frame images; ``frames[i_idx]`` is the I-frame. The P-grid
            is allocated as 3-channel uint8, so frames are presumably
            H x W x 3 uint8 images -- confirm against the caller.
        masks: Per-frame 2-D patch-selection masks, parallel to ``frames``.
            A truthy cell ``[i, j]`` selects the patch at patch-row ``i``,
            patch-column ``j`` of the same-index frame.
        i_idx: Index of the I-frame in ``frames``.
        p_range: Indices of the frames that contribute P-patches, in the
            order they should be packed.
        patch: Patch edge length in pixels.

    Returns:
        The I-frame cropped to a whole number of patches. When at least one
        P-patch is selected, that block is stacked above a grid that is
        ``wb`` patches wide, filled frame by frame in raster order, with
        unused cells left white (255).
    """
    i_frame = frames[i_idx]
    h, w = i_frame.shape[:2]
    hb, wb = h // patch, w // patch
    canvas_w = wb * patch
    i_block = i_frame[: hb * patch, :canvas_w].copy()

    # A P-frame needs both its image and its mask, so stop at the shorter of
    # the two lists instead of raising IndexError when masks is shorter.
    usable = min(len(frames), len(masks))
    p_patches: List[np.ndarray] = []
    for k in p_range:
        if k >= usable:
            break
        f, m = frames[k], masks[k]
        for i, mask_row in enumerate(m):
            for j, keep in enumerate(mask_row):
                if keep:
                    p_patches.append(
                        f[i * patch:(i + 1) * patch, j * patch:(j + 1) * patch].copy()
                    )
    if not p_patches:
        return i_block

    # Ceil-divide so a partially filled last row still gets allocated.
    rows = (len(p_patches) + wb - 1) // wb
    p_grid = np.full((rows * patch, canvas_w, 3), 255, dtype=np.uint8)
    for idx, p in enumerate(p_patches):
        r, c = divmod(idx, wb)
        p_grid[r * patch:(r + 1) * patch, c * patch:(c + 1) * patch] = p
    return np.vstack([i_block, p_grid])
def _allocate_canvases_per_group(
    target_canvases: int, num_groups: int,
) -> List[int]:
    """Share a total canvas budget across the groups as evenly as possible.

    Both arguments are coerced to ``int`` and raised to at least 1. The
    budget is divided evenly and the leading ``remainder`` groups receive
    one extra canvas. Every group then gets at least one canvas, so when
    there are more groups than budget the returned counts sum to more than
    the requested target.

    Returns:
        One canvas count per group, in group order.
    """
    budget = max(1, int(target_canvases))
    group_count = max(1, int(num_groups))
    share, extra = divmod(budget, group_count)
    counts: List[int] = []
    for position in range(group_count):
        quota = share + 1 if position < extra else share
        # Never leave a group without a canvas.
        counts.append(max(1, quota))
    return counts
 def pack_canvases_per_group(
     frames: List[np.ndarray],
     masks: List[np.ndarray],
     groups: List[Tuple[int, int]],
     patch: int,
     target_canvases: int = 4,
 ) -> Tuple[List[np.ndarray], List[Tuple[int, int, int]], int]:
     """Pack roughly `target_canvases` IPPP canvases for the whole video,
     distributing them across GOP groups as evenly as possible.

     Each group's frame range [s..e] (inclusive, clamped to the available
     frames) is split into K consecutive sub-ranges, where K is the number
     of canvases allocated to that group, capped at the group's frame
     count. Each sub-range [ss..ee] becomes one canvas:
       - frame ss is the I-frame: its whole image goes to the canvas top.
       - frames ss+1..ee are P-frames: only their mask-selected patches go
         below the I-frame, packed time-major in a wb-wide raster grid.

     Args:
         frames: Sampled frames, indexed by the positions used in `groups`.
         masks: Per-frame patch-selection masks, parallel to `frames`.
         groups: Inclusive (start, end) frame-index pairs, one per GOP group.
         patch: Patch edge length in pixels.
         target_canvases: Requested total number of canvases.

     Returns:
       canvases       -- list of np.ndarray. Usually `target_canvases` long;
                         shorter when groups have fewer frames than their
                         allocation or start past the last frame, longer
                         when there are more groups than target canvases
                         (every group keeps at least one canvas). Never
                         empty: a single white patch is returned when
                         nothing could be packed.
       sub_ranges     -- list of (group_idx, sub_start, sub_end) parallel to
                         canvases, for caption / debugging.
       total_selected -- I-frame patches (counted as full grid) + P-frame
                         selected patches across all canvases.
     """
     canvases: List[np.ndarray] = []
     sub_ranges: List[Tuple[int, int, int]] = []
     total_selected = 0
     if not groups or not frames:
         return [np.full((patch, patch, 3), 255, dtype=np.uint8)], [(0, 0, 0)], 0
     per_group_counts = _allocate_canvases_per_group(target_canvases, len(groups))
     last_frame = len(frames) - 1
     for g_idx, (s, e) in enumerate(groups):
         if s > last_frame:
             continue
         # Clamp the group end to the frames that exist; otherwise a later
         # sub-range could start past the end and frames[ss] would raise.
         e = min(e, last_frame)
         group_len = e - s + 1
         k = max(1, min(per_group_counts[g_idx], group_len))
         # Split [s..e] into k consecutive sub-ranges of (almost) equal size.
         base, rem = divmod(group_len, k)
         cursor = s
         for sub_i in range(k):
             sub_len = base + (1 if sub_i < rem else 0)
             ss = cursor
             ee = min(e, cursor + sub_len - 1)
             cursor = ee + 1
             canvas = _build_ippp_canvas(
                 frames, masks, i_idx=ss, p_range=range(ss + 1, ee + 1),
                 patch=patch,
             )
             canvases.append(canvas)
             sub_ranges.append((g_idx, ss, ee))
             # Accounting
             hb, wb = frames[ss].shape[0] // patch, frames[ss].shape[1] // patch
             total_selected += hb * wb  # I-frame counts as fully kept.
             for kk in range(ss + 1, ee + 1):
                 if kk < len(masks):
                     # Count truthy cells, mirroring how patches are selected.
                     total_selected += int(np.count_nonzero(masks[kk]))
     if not canvases:
         canvases = [np.full((patch, patch, 3), 255, dtype=np.uint8)]
         sub_ranges = [(0, 0, 0)]
     return canvases, sub_ranges, total_selected
 def make_charts(
     bitcost_pct: float = 99.0,
     fade_strength: float = 0.55,
     gop: str = "global",
+    target_canvases: int = 4,
     progress=gr.Progress(track_tqdm=False),
 ):
     if not video_path:
     vis_fps = max(2.0, min(8.0, (meta.get("fps") or 25.0) / 4.0))
     write_mp4(vis, vis_path, vis_fps)
+    progress(0.85, desc="Packing canvases (IPPP)")
+    canvases, sub_ranges, n_selected = pack_canvases_per_group(
         resized, masks, groups, int(patch_size),
+        target_canvases=int(target_canvases),
     )
     canvas_items: List[Tuple[str, str]] = []
     for idx, canv in enumerate(canvases):
         cp = os.path.join(out_dir, f"canvas_{idx:03d}.png")
         cv2.imwrite(cp, canv)
+        g_idx, ss, ee = sub_ranges[idx] if idx < len(sub_ranges) else (0, idx, idx)
+        n_p = max(0, ee - ss)
         caption = (
+            f"Canvas {idx + 1}/{len(canvases)} · group {g_idx + 1} · "
+            f"I@#{ss} + {n_p} P-frame{'s' if n_p != 1 else ''}"
         )
         canvas_items.append((cp, caption))
             "bitcost_pct": float(bitcost_pct),
             "fade_strength": float(fade_strength),
             "gop": gop_resolved,
+            "target_canvases": int(target_canvases),
         },
         "gop_groups": [
             {
             {
                 "index": i,
                 "size": f"{canvases[i].shape[1]}x{canvases[i].shape[0]}",
+                "group": int(sub_ranges[i][0]) if i < len(sub_ranges) else None,
+                "sub_range": list(sub_ranges[i][1:3]) if i < len(sub_ranges) else None,
                 "structure": "IPPP — first frame full (I), rest contribute "
                              "only their selected patches (P).",
             }
                          "each. Dynamic mode mirrors codec_tools' readiness "
                          "grouping (equal-energy groups).",
                 )
+                target_canvases = gr.Slider(
+                    1, 16, value=4, step=1,
+                    label="Target canvases (total per video)",
+                    info="Fixed canvas count regardless of GOP. The budget is "
+                         "split across groups; each group is further sliced "
+                         "into sub-ranges of consecutive frames, one IPPP "
+                         "canvas per sub-range.",
+                )
             with gr.Accordion("Time window", open=False):
                 with gr.Row():
             viz_mode, heatmap_alpha,
             start_sec, end_sec,
             saliency_signal, score_log_scale, bitcost_pct, fade_strength,
+            gop, target_canvases,
         ],
         outputs=[vis_out, canvas_out, info_out, chart_out],
     )
             video_in, sample_frames, patch_size, top_k, max_pixels,
             viz_mode, heatmap_alpha, start_sec, end_sec,
             saliency_signal, score_log_scale, bitcost_pct, fade_strength,
+            gop, target_canvases,
         ],
     )