Spaces:

FeilongTang
/

OneVision-Encoder-Codec-View

Running

App Files Files Community

FeilongTang committed 24 days ago

Commit

5a9b121

1 Parent(s): 210c709

Show GOP canvases as explicit I/P sections

Browse files

Files changed (1) hide show

app.py +59 -38

app.py CHANGED Viewed

@@ -525,12 +525,12 @@ def _build_ippp_canvas(
     frames: List[np.ndarray], masks: List[np.ndarray],
     i_idx: int, p_range: range, patch: int,
 ) -> Tuple[np.ndarray, int]:
-    """Build one IPPP canvas with the I-frame on top and packed P patches below.
     Layout:
       1. The group's first frame is copied whole as the I-frame.
-      2. Every selected patch from later P-frames is appended below the
-         I-frame in time-major raster order.
     Returns (canvas, n_patches) where n_patches is the number of selected
     P-frame patches packed under the I-frame."""
@@ -540,11 +540,14 @@ def _build_ippp_canvas(
     frame_h, frame_w = hb * patch, wb * patch
     i_crop = i_frame[:frame_h, :frame_w].copy()
-    packed_patches: List[np.ndarray] = []
     for k in p_range:
         if k >= len(frames):
             break
         f, m = frames[k], masks[k]
         for i in range(m.shape[0]):
             for j in range(m.shape[1]):
                 if m[i, j]:
@@ -554,28 +557,29 @@ def _build_ippp_canvas(
                             j * patch:(j + 1) * patch,
                         ].copy()
                     )
-    n_patches = len(packed_patches)
-    packed_rows = int(math.ceil(n_patches / max(1, wb))) if n_patches else 0
-    packed_h = packed_rows * patch
-    canvas = np.full((frame_h + packed_h, frame_w, 3), 250, dtype=np.uint8)
-    canvas[:frame_h, :frame_w] = i_crop
-    if packed_h > 0:
-        cv2.line(
-            canvas,
-            (0, frame_h - 1),
-            (frame_w - 1, frame_h - 1),
-            (99, 102, 241),
-            2,
-            lineType=cv2.LINE_AA,
-        )
         for idx, tile in enumerate(packed_patches):
             row = idx // wb
             col = idx % wb
-            y0 = frame_h + row * patch
             x0 = col * patch
-            canvas[y0:y0 + patch, x0:x0 + patch] = tile
     return canvas, n_patches
@@ -589,10 +593,9 @@ def pack_canvases_per_group(
 ) -> Tuple[List[np.ndarray], List[Tuple[int, int, int]], int]:
     """Pack exactly one IPPP canvas per GOP group.
-    Each group's first frame is kept whole as the I-frame, and the
-    remaining frames in that same group contribute only their selected
-    patches as P-frame overlays. `target_canvases` is kept only for API
-    compatibility and is ignored.
     Returns:
       canvases       — list of np.ndarray, length == number of groups.
@@ -892,10 +895,11 @@ def process(
         src_start = int(fids[ss]) if ss < len(fids) else None
         src_end = int(fids[ee]) if ee < len(fids) else None
         p_frame_count = max(0, ee - ss)
         p_patch_count = int(sum(int(m.sum()) for m in masks[ss + 1:ee + 1]))
         caption = (
             f"Canvas {idx + 1}/{len(canvases)} · group {g_idx + 1} · "
-            f"sampled #{ss}-{ee} · src {src_start}-{src_end} · "
             f"I src#{src_start} + {p_patch_count} P patches from "
             f"{p_frame_count} frame{'s' if p_frame_count != 1 else ''}"
         )
@@ -928,7 +932,7 @@ def process(
             "bitcost_pct": float(bitcost_pct),
             "fade_strength": float(fade_strength),
             "gop": gop_resolved,
-            "canvas_policy": "one_ippp_canvas_per_group",
             "i_frame_policy": "first_frame_full_in_each_group",
         },
         "gop_groups": [
@@ -941,8 +945,11 @@ def process(
                 "end_source_frame_id": int(fids[e]) if e < len(fids) else None,
                 "source_frame_ids": [int(fids[i]) for i in range(s, e + 1)],
                 "n_frames": int(e - s + 1),
                 "i_frame_source_id": int(fids[s]) if s < len(fids) else None,
                 "p_frame_count": int(max(0, e - s)),
                 "p_frame_selected_patches": int(sum(int(m.sum()) for m in masks[s + 1:e + 1])),
                 "selected": int(sum(int(m.sum()) for m in masks[s:e + 1])),
             }
@@ -992,19 +999,31 @@ def process(
                     [int(fids[x]) for x in range(sub_ranges[i][1], sub_ranges[i][2] + 1)]
                     if i < len(sub_ranges) else []
                 ),
                 "i_frame_source_id": (
                     int(fids[sub_ranges[i][1]]) if i < len(sub_ranges) else None
                 ),
                 "p_frame_count": (
                     int(max(0, sub_ranges[i][2] - sub_ranges[i][1]))
                     if i < len(sub_ranges) else 0
                 ),
                 "p_frame_selected_patches": (
                     int(sum(int(m.sum()) for m in masks[sub_ranges[i][1] + 1:sub_ranges[i][2] + 1]))
                     if i < len(sub_ranges) else 0
                 ),
-                "structure": "IPPP — full I-frame on top, selected P patches "
-                             "packed below in time-major order.",
             }
             for i in range(len(canvases))
         ],
@@ -1565,6 +1584,7 @@ with gr.Blocks(**_BLOCK_KW) as demo:
                 gop = gr.Radio(
                     [
                         ("GOP = 4 — fixed 4-frame groups",                "4"),
                         ("GOP = 8 — fixed 8-frame groups",                "8"),
                         ("GOP = 16 — fixed 16-frame groups",              "16"),
                         ("Codec-stream: adaptive groups by saliency energy", "dynamic"),
@@ -1572,10 +1592,11 @@ with gr.Blocks(**_BLOCK_KW) as demo:
                     value="8",
                     label="GOP (group of pictures)",
                     info="Splits sampled frames into GOP groups. Each group "
-                         "produces exactly one IPPP canvas: the group's first "
-                         "frame stays whole as the I-frame, and later frames "
-                         "only contribute selected patches as P-updates. With "
-                         "32 sampled frames and GOP=8, this yields 4 canvases. "
                          "Codec-stream mode adaptively groups by saliency "
                          "energy, targeting roughly 8-64 sampled frames per group.",
                 )
@@ -1650,10 +1671,10 @@ with gr.Blocks(**_BLOCK_KW) as demo:
                         gr.Markdown("### Packed canvases (one per GOP group)")
                         gr.Markdown(
                             "<small>Each canvas is one GOP group rendered in "
-                            "<b>IPPP order</b>: the group's first frame is the "
-                            "<b>I-frame</b> kept whole on top, followed by the "
-                            "<b>P-frame</b> selected patches packed below in "
-                            "time order.</small>"
                         )
                         canvas_out = gr.Gallery(
                             label="", show_label=False,

     frames: List[np.ndarray], masks: List[np.ndarray],
     i_idx: int, p_range: range, patch: int,
 ) -> Tuple[np.ndarray, int]:
+    """Build one GOP canvas with explicit I/P sections.
     Layout:
       1. The group's first frame is copied whole as the I-frame.
+      2. Each later P-frame gets its own packed section below, in time order.
+         So GOP=4 becomes I|P|P|P, GOP=5 becomes I|P|P|P|P, etc.
     Returns (canvas, n_patches) where n_patches is the number of selected
     P-frame patches packed under the I-frame."""
     frame_h, frame_w = hb * patch, wb * patch
     i_crop = i_frame[:frame_h, :frame_w].copy()
+    divider_h = 2
+    p_sections: List[np.ndarray] = []
+    n_patches = 0
     for k in p_range:
         if k >= len(frames):
             break
         f, m = frames[k], masks[k]
+        packed_patches: List[np.ndarray] = []
         for i in range(m.shape[0]):
             for j in range(m.shape[1]):
                 if m[i, j]:
                             j * patch:(j + 1) * patch,
                         ].copy()
                     )
+        n_patches += len(packed_patches)
+        packed_rows = max(1, int(math.ceil(len(packed_patches) / max(1, wb))))
+        packed_h = packed_rows * patch
+        section_bg = np.full((packed_h, frame_w, 3), 246, dtype=np.uint8)
         for idx, tile in enumerate(packed_patches):
             row = idx // wb
             col = idx % wb
+            y0 = row * patch
             x0 = col * patch
+            section_bg[y0:y0 + patch, x0:x0 + patch] = tile
+        p_sections.append(section_bg)
+    total_h = frame_h + sum(divider_h + sec.shape[0] for sec in p_sections)
+    canvas = np.full((total_h, frame_w, 3), 250, dtype=np.uint8)
+    canvas[:frame_h, :frame_w] = i_crop
+    y = frame_h
+    for section in p_sections:
+        canvas[y:y + divider_h, :] = (99, 102, 241)
+        y += divider_h
+        sec_h = section.shape[0]
+        canvas[y:y + sec_h, :frame_w] = section
+        y += sec_h
     return canvas, n_patches
 ) -> Tuple[List[np.ndarray], List[Tuple[int, int, int]], int]:
     """Pack exactly one IPPP canvas per GOP group.
+    Each group's first frame is kept whole as the I-frame, and every
+    later frame gets its own packed P section below it. `target_canvases`
+    is kept only for API compatibility and is ignored.
     Returns:
       canvases       — list of np.ndarray, length == number of groups.
         src_start = int(fids[ss]) if ss < len(fids) else None
         src_end = int(fids[ee]) if ee < len(fids) else None
         p_frame_count = max(0, ee - ss)
+        structure_label = " ".join(["I"] + ["P"] * p_frame_count)
         p_patch_count = int(sum(int(m.sum()) for m in masks[ss + 1:ee + 1]))
         caption = (
             f"Canvas {idx + 1}/{len(canvases)} · group {g_idx + 1} · "
+            f"{structure_label} · sampled #{ss}-{ee} · src {src_start}-{src_end} · "
             f"I src#{src_start} + {p_patch_count} P patches from "
             f"{p_frame_count} frame{'s' if p_frame_count != 1 else ''}"
         )
             "bitcost_pct": float(bitcost_pct),
             "fade_strength": float(fade_strength),
             "gop": gop_resolved,
+            "canvas_policy": "one_canvas_per_group_with_per_frame_p_sections",
             "i_frame_policy": "first_frame_full_in_each_group",
         },
         "gop_groups": [
                 "end_source_frame_id": int(fids[e]) if e < len(fids) else None,
                 "source_frame_ids": [int(fids[i]) for i in range(s, e + 1)],
                 "n_frames": int(e - s + 1),
+                "structure_label": " ".join(["I"] + ["P"] * max(0, e - s)),
                 "i_frame_source_id": int(fids[s]) if s < len(fids) else None,
+                "p_source_frame_ids": [int(fids[i]) for i in range(s + 1, e + 1)],
                 "p_frame_count": int(max(0, e - s)),
+                "p_frame_patch_counts": [int(masks[i].sum()) for i in range(s + 1, e + 1)],
                 "p_frame_selected_patches": int(sum(int(m.sum()) for m in masks[s + 1:e + 1])),
                 "selected": int(sum(int(m.sum()) for m in masks[s:e + 1])),
             }
                     [int(fids[x]) for x in range(sub_ranges[i][1], sub_ranges[i][2] + 1)]
                     if i < len(sub_ranges) else []
                 ),
+                "structure_label": (
+                    " ".join(["I"] + ["P"] * max(0, sub_ranges[i][2] - sub_ranges[i][1]))
+                    if i < len(sub_ranges) else "I"
+                ),
                 "i_frame_source_id": (
                     int(fids[sub_ranges[i][1]]) if i < len(sub_ranges) else None
                 ),
+                "p_source_frame_ids": (
+                    [int(fids[x]) for x in range(sub_ranges[i][1] + 1, sub_ranges[i][2] + 1)]
+                    if i < len(sub_ranges) else []
+                ),
                 "p_frame_count": (
                     int(max(0, sub_ranges[i][2] - sub_ranges[i][1]))
                     if i < len(sub_ranges) else 0
                 ),
+                "p_frame_patch_counts": (
+                    [int(masks[x].sum()) for x in range(sub_ranges[i][1] + 1, sub_ranges[i][2] + 1)]
+                    if i < len(sub_ranges) else []
+                ),
                 "p_frame_selected_patches": (
                     int(sum(int(m.sum()) for m in masks[sub_ranges[i][1] + 1:sub_ranges[i][2] + 1]))
                     if i < len(sub_ranges) else 0
                 ),
+                "structure": "Full I-frame on top; one packed P section per "
+                             "later frame, in time order.",
             }
             for i in range(len(canvases))
         ],
                 gop = gr.Radio(
                     [
                         ("GOP = 4 — fixed 4-frame groups",                "4"),
+                        ("GOP = 5 — fixed 5-frame groups",                "5"),
                         ("GOP = 8 — fixed 8-frame groups",                "8"),
                         ("GOP = 16 — fixed 16-frame groups",              "16"),
                         ("Codec-stream: adaptive groups by saliency energy", "dynamic"),
                     value="8",
                     label="GOP (group of pictures)",
                     info="Splits sampled frames into GOP groups. Each group "
+                         "produces exactly one GOP canvas: the group's first "
+                         "frame stays whole as the I-frame, and each later "
+                         "frame gets its own P section below it. So GOP=4 "
+                         "becomes I P P P, GOP=5 becomes I P P P P. With 32 "
+                         "sampled frames and GOP=8, this yields 4 canvases. "
                          "Codec-stream mode adaptively groups by saliency "
                          "energy, targeting roughly 8-64 sampled frames per group.",
                 )
                         gr.Markdown("### Packed canvases (one per GOP group)")
                         gr.Markdown(
                             "<small>Each canvas is one GOP group rendered in "
+                            "<b>I/P structure</b>: the group's first frame is "
+                            "the <b>I-frame</b> kept whole on top, and each "
+                            "later frame gets its own packed <b>P-frame</b> "
+                            "section below in time order.</small>"
                         )
                         canvas_out = gr.Gallery(
                             label="", show_label=False,