CompressedGemma
/

ContourFuse

Model card Files Files and versions

xet

Community

CompressedGemma committed on about 23 hours ago

Commit

b53cacf

verified ·

1 Parent(s): fb98ac2

Optimize RAM usage

Browse files

Files changed (1) hide show

applyweights.py +29 -12

applyweights.py CHANGED Viewed

@@ -21,7 +21,7 @@ from pathlib import Path
 import numpy as np
 import torch
-from safetensors.torch import load, save_file
 PROJ_KEYS = ("gate_proj.weight", "up_proj.weight", "down_proj.weight")
@@ -133,8 +133,8 @@ def apply_single_file(model_path: Path, output_dir: Path, layer_files: dict, arg
     dry_run = args.dry_run
     print(f"\n[model] Processing file: {model_path.name}")
-    with open(model_path, "rb") as f:
-        tensors = load(f.read())
     fused   = 0
     skipped = 0
@@ -142,8 +142,7 @@ def apply_single_file(model_path: Path, output_dir: Path, layer_files: dict, arg
     for layer_idx, layer_path in sorted(layer_files.items()):
         layer_type = "global" if is_global_attention_layer(layer_idx) else "swa"
-        with open(layer_path, "rb") as f:
-            new_weights = load(f.read())
         if not any(k in new_weights for k in PROJ_KEYS):
             print(f"  [skip] Layer {layer_idx}: none of {PROJ_KEYS} found. "
@@ -227,8 +226,7 @@ def apply_sharded(model_dir: Path, output_dir: Path, layer_files: dict, args) ->
     for layer_idx, layer_path in sorted(layer_files.items()):
         layer_type = "global" if is_global_attention_layer(layer_idx) else "swa"
-        with open(layer_path, "rb") as f:
-            new_weights = load(f.read())
         if not any(k in new_weights for k in PROJ_KEYS):
             print(f"  [skip] Layer {layer_idx}: none of {PROJ_KEYS} found. "
@@ -260,10 +258,28 @@ def apply_sharded(model_dir: Path, output_dir: Path, layer_files: dict, args) ->
             f"No layers matched in weight_map. Sample keys: {sample}"
         )
     if not dry_run:
-        if output_dir.exists():
-            shutil.rmtree(output_dir)
-        shutil.copytree(model_dir, output_dir)
     fused_layer_idxs: set = set()
@@ -271,8 +287,8 @@ def apply_sharded(model_dir: Path, output_dir: Path, layer_files: dict, args) ->
         shard_src = model_dir / shard_name
         shard_dst = output_dir / shard_name
-        with open(shard_src, "rb") as f:
-            tensors = load(f.read())
         # Re-group by layer so fuse_layer_deltas is called once per layer per shard.
         by_layer: dict = {}
@@ -320,6 +336,7 @@ def apply_sharded(model_dir: Path, output_dir: Path, layer_files: dict, args) ->
         if not dry_run:
             save_file(tensors, str(shard_dst))
             print(f"  [ok] Saved shard {shard_name} ({len(by_layer)} layer(s))")
     if skipped > 0:
         print(f"  [warn] {skipped} layer(s) fully skipped, "

 import numpy as np
 import torch
+from safetensors.torch import load, load_file, save_file
 PROJ_KEYS = ("gate_proj.weight", "up_proj.weight", "down_proj.weight")
     dry_run = args.dry_run
     print(f"\n[model] Processing file: {model_path.name}")
+    # load_file uses memory-mapping — avoids reading the whole file into RAM twice
+    tensors = load_file(str(model_path))
     fused   = 0
     skipped = 0
     for layer_idx, layer_path in sorted(layer_files.items()):
         layer_type = "global" if is_global_attention_layer(layer_idx) else "swa"
+        new_weights = load_file(str(layer_path))
         if not any(k in new_weights for k in PROJ_KEYS):
             print(f"  [skip] Layer {layer_idx}: none of {PROJ_KEYS} found. "
     for layer_idx, layer_path in sorted(layer_files.items()):
         layer_type = "global" if is_global_attention_layer(layer_idx) else "swa"
+        new_weights = load_file(str(layer_path))
         if not any(k in new_weights for k in PROJ_KEYS):
             print(f"  [skip] Layer {layer_idx}: none of {PROJ_KEYS} found. "
             f"No layers matched in weight_map. Sample keys: {sample}"
         )
+    # Identify which shards will be modified; every other file is copied to the output unchanged.
+    modified_shards = set(fusion_plan.keys())
     if not dry_run:
+        output_dir.mkdir(parents=True, exist_ok=True)
+        # Copy all non-shard files (config, tokenizer, index, etc.) eagerly.
+        # Shard files are copied individually just before they are modified,
+        # avoiding a full model copy upfront that can exhaust RAM and disk I/O.
+        for src_file in model_dir.iterdir():
+            dst_file = output_dir / src_file.name
+            if src_file.name not in modified_shards:
+                if src_file.is_dir():
+                    shutil.copytree(src_file, dst_file, dirs_exist_ok=True)
+                else:
+                    shutil.copy2(src_file, dst_file)
+        # Copy unmodified shards (they just need to be present in the output).
+        all_shards = {v for v in weight_map.values()}
+        for shard_name in all_shards - modified_shards:
+            src = model_dir / shard_name
+            dst = output_dir / shard_name
+            if src.exists() and not dst.exists():
+                shutil.copy2(src, dst)
     fused_layer_idxs: set = set()
         shard_src = model_dir / shard_name
         shard_dst = output_dir / shard_name
+        # load_file uses memory-mapped I/O — no full f.read() into RAM
+        tensors = load_file(str(shard_src))
         # Re-group by layer so fuse_layer_deltas is called once per layer per shard.
         by_layer: dict = {}
         if not dry_run:
             save_file(tensors, str(shard_dst))
             print(f"  [ok] Saved shard {shard_name} ({len(by_layer)} layer(s))")
+        del tensors  # free RAM before loading next shard
     if skipped > 0:
         print(f"  [warn] {skipped} layer(s) fully skipped, "