Spaces:

FeilongTang
/

OneVision-Encoder-Codec-View

Running

FeilongTang committed 25 days ago

Commit

e3d4cd8

1 Parent(s): 047a9df

IPPP canvas: overlay P-frame patches onto I-frame at same dimensions

Replaces the I-on-top + P-grid-below layout with the proper codec
semantics: every frame in a group shares the picture size; a P-frame
only encodes the macroblocks that need to change.

Per sub-range:
1. Start canvas = I-frame's full image (canvas_h × canvas_w =
hb*patch × wb*patch).
2. For each P-frame in time order, for every saliency-selected
patch position (i, j), overwrite canvas[i*patch:(i+1)*patch,
j*patch:(j+1)*patch] with the P-frame's pixels at that position.
3. Final canvas is what the encoder would have reconstructed at the
end of this sub-range. Moving subjects produce visible 'ghost'
trails because successive P-frame overlays at overlapping
positions show the motion.

Every canvas now ends up with exactly the I-frame's dimensions, so they all
match each other regardless of how many P-frames the sub-range had.

Accounting: total_selected_patches_incl_i_frames now sums the I-frame
grid (hb*wb) plus the count of P-frame overlay hits (the same position can
be overlaid by multiple P-frames; each hit counts).

Files changed (1) hide show

app.py +34 -26

app.py CHANGED Viewed

@@ -451,16 +451,28 @@ def write_mp4(frames: List[np.ndarray], path: str, fps: float) -> None:
 def _build_ippp_canvas(
     frames: List[np.ndarray], masks: List[np.ndarray],
     i_idx: int, p_range: range, patch: int,
-) -> np.ndarray:
-    """Build one IPPP canvas: full I-frame on top, P-frame selected patches
-    in a wb-wide raster grid below."""
     i_frame = frames[i_idx]
     h, w = i_frame.shape[:2]
     hb, wb = h // patch, w // patch
-    canvas_w = wb * patch
-    i_block = i_frame[: hb * patch, : canvas_w].copy()
-    p_patches: List[np.ndarray] = []
     for k in p_range:
         if k >= len(frames):
             break
@@ -468,19 +480,15 @@ def _build_ippp_canvas(
         for i in range(m.shape[0]):
             for j in range(m.shape[1]):
                 if m[i, j]:
-                    p_patches.append(
-                        f[i * patch:(i + 1) * patch, j * patch:(j + 1) * patch].copy()
-                    )
-    if not p_patches:
-        return i_block
-    rows = (len(p_patches) + wb - 1) // wb
-    p_grid = np.full((rows * patch, canvas_w, 3), 255, dtype=np.uint8)
-    for idx, p in enumerate(p_patches):
-        r, c = divmod(idx, wb)
-        p_grid[r * patch:(r + 1) * patch, c * patch:(c + 1) * patch] = p
-    return np.vstack([i_block, p_grid])
 def _allocate_canvases_per_group(
@@ -542,19 +550,19 @@ def pack_canvases_per_group(
             ss = cursor
             ee = min(e, cursor + sub_len - 1)
             cursor = ee + 1
-            canvas = _build_ippp_canvas(
                 frames, masks, i_idx=ss, p_range=range(ss + 1, ee + 1),
                 patch=patch,
             )
             canvases.append(canvas)
             sub_ranges.append((g_idx, ss, ee))
-            # Accounting
-            i_h, i_w = canvas.shape[:2]
             hb, wb = frames[ss].shape[0] // patch, frames[ss].shape[1] // patch
-            total_selected += hb * wb  # I-frame counts as fully kept.
-            for kk in range(ss + 1, ee + 1):
-                if kk < len(masks):
-                    total_selected += int(masks[kk].sum())
     if not canvases:
         canvases = [np.full((patch, patch, 3), 255, dtype=np.uint8)]

 def _build_ippp_canvas(
     frames: List[np.ndarray], masks: List[np.ndarray],
     i_idx: int, p_range: range, patch: int,
+) -> Tuple[np.ndarray, int]:
+    """Build one IPPP canvas at the *same dimensions as the I-frame*.
+    Codec convention: every frame in a group shares the picture size; a
+    P-frame only encodes the macroblocks that need to change. So:
+      1. Initialise the canvas to the I-frame's full image.
+      2. For each P-frame in time order, replace each saliency-selected
+         patch position with the P-frame's pixels at that position.
+      3. The canvas now reads as 'what the encoder would have reconstructed
+         at the end of this group' — same shape as the I-frame, with the
+         high-energy regions updated by later P-frames.
+    Returns (canvas, n_overlays) where n_overlays is the count of P-frame
+    patches that overwrote a position (a position may be hit multiple
+    times by different P-frames; we count each hit)."""
     i_frame = frames[i_idx]
     h, w = i_frame.shape[:2]
     hb, wb = h // patch, w // patch
+    canvas_h, canvas_w = hb * patch, wb * patch
+    canvas = i_frame[:canvas_h, :canvas_w].copy()
+    n_overlays = 0
     for k in p_range:
         if k >= len(frames):
             break
         for i in range(m.shape[0]):
             for j in range(m.shape[1]):
                 if m[i, j]:
+                    canvas[
+                        i * patch:(i + 1) * patch,
+                        j * patch:(j + 1) * patch,
+                    ] = f[
+                        i * patch:(i + 1) * patch,
+                        j * patch:(j + 1) * patch,
+                    ]
+                    n_overlays += 1
+    return canvas, n_overlays
 def _allocate_canvases_per_group(
             ss = cursor
             ee = min(e, cursor + sub_len - 1)
             cursor = ee + 1
+            canvas, n_p_overlays = _build_ippp_canvas(
                 frames, masks, i_idx=ss, p_range=range(ss + 1, ee + 1),
                 patch=patch,
             )
             canvases.append(canvas)
             sub_ranges.append((g_idx, ss, ee))
+            # Accounting:
+            #   - I-frame counts as the full grid (anchor, every position
+            #     starts from it).
+            #   - Each P-frame overlay is +1 (positions may be overlaid
+            #     multiple times by later P-frames; we count each hit).
             hb, wb = frames[ss].shape[0] // patch, frames[ss].shape[1] // patch
+            total_selected += hb * wb + n_p_overlays
     if not canvases:
         canvases = [np.full((patch, patch, 3), 255, dtype=np.uint8)]