CompressedGemma
/

HPC-Quantize

Model card Files Files and versions

xet

Community

CompressedGemma committed 26 days ago

Commit

414e1de

verified ·

1 Parent(s): 7d55b19

Upload 2 files

Browse files

Files changed (2) hide show

generate_imatrix.py +102 -11
hpc_forward_merged.c +48 -87

generate_imatrix.py CHANGED Viewed

@@ -132,7 +132,12 @@ class GGUFModel:
         self.data_offset = align_offset(f.tell())
     def get_arch(self):
-        arch = self.kv.get('general.architecture', 'gemma2')
         return arch
     def get_config(self):
@@ -177,7 +182,11 @@ class GGUFModel:
         ti = self.tensor_infos[name]
         abs_offset = self.data_offset + ti['offset']
         raw = bytes(self._mm[abs_offset:abs_offset + ti['data_size']])
-        return dequantize(raw, ti['type'], ti['n_elements'])
     def get_tensor_shape(self, name):
         """Return the shape of a tensor (GGUF stores reversed dims)."""
@@ -226,7 +235,8 @@ def dequant_q4_0(raw, n_elements):
     qs = data[:, 2:18]  # 16 bytes = 32 nibbles
     lo = (qs & 0xF).astype(np.float32) - 8.0
     hi = (qs >> 4).astype(np.float32) - 8.0
-    x = np.concatenate([lo, hi], axis=1)  # [n_blocks, 32]
     return (d * x).reshape(-1)[:n_elements]
 def dequant_q2k(raw, n_elements):
@@ -243,7 +253,7 @@ def dequant_q2k(raw, n_elements):
         dmin = dmin_fp16[blk]
         for half in range(2):
             for sub in range(4):
-                j = half * 8 + sub
                 sc = int(scales_packed[blk, j]) & 0xF
                 mn = int(scales_packed[blk, j]) >> 4
                 d_sub = d * sc
@@ -430,10 +440,13 @@ class SimpleTokenizer:
         """Encode text and split into fixed-length chunks."""
         ids = self.encode(text)
         chunks = []
-        for i in range(0, len(ids) - chunk_size, chunk_size // 2):  # 50% overlap
             chunk = ids[i:i + chunk_size]
-            if len(chunk) == chunk_size:
-                chunks.append(np.array(chunk, dtype=np.int32))
         if not chunks and ids:
             # Pad short text
             padded = ids + [self.eos_id] * (chunk_size - len(ids))
@@ -555,6 +568,8 @@ class TransformerRunner:
         n_embd = cfg['n_embd']
         n_head = cfg['n_head']
         n_head_kv = cfg['n_head_kv']
         head_dim = self.head_dim
         eps = cfg['rms_eps']
@@ -673,6 +688,7 @@ class TransformerRunner:
         # Read back importance for the tensors that WERE processed in C
         for name, arr, cnt in imp_refs:
             self.importance[name] = (arr.astype(np.float64), cnt.value)
         # Handle MoE FFN if C code skipped it
@@ -797,16 +813,17 @@ class TransformerRunner:
             imp_f32 = self.importance[name][0].astype(np.float32)
             count = ctypes.c_int64(self.importance[name][1])
             # Dummy output — we only want the importance recording
             dummy_out = np.empty((M, 1), dtype=np.float32)
-            dummy_w = np.zeros((1, K), dtype=np.float32)
             self._hpc_lib.hexstate_matmul_record(
                 x.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
-                dummy_w.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
                 dummy_out.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
                 imp_f32.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
-                ctypes.c_int64(M), ctypes.c_int64(K), ctypes.c_int64(1),
                 ctypes.byref(count))
             self.importance[name] = (imp_f32.astype(np.float64), count.value)
@@ -832,6 +849,8 @@ class TransformerRunner:
         cfg = self.cfg
         n_head = cfg['n_head']
         n_head_kv = cfg['n_head_kv']
         seq_len = hidden.shape[0]
         # ── Attention norm ──
@@ -1009,6 +1028,8 @@ class TransformerRunner:
         cfg = self.cfg
         n_head = cfg['n_head']
         n_head_kv = cfg['n_head_kv']
         head_dim = self.head_dim
         seq_len = hidden.shape[0]
@@ -1232,6 +1253,8 @@ class TransformerRunner:
         cfg = self.cfg
         n_head = cfg['n_head']
         n_head_kv = cfg['n_head_kv']
         head_dim = self.head_dim
         seq_len = hidden.shape[0]
@@ -1461,7 +1484,9 @@ def hpc_propagate_importance(importance_dict, n_layers, verbose=False):
                 n_nbr += 1
             if n_nbr > 0:
                 e_nbr /= n_nbr
-            new_mult[i] = np.exp((e_self + 0.3 * e_nbr) / temperature)
         mean_m = np.mean(new_mult)
         if mean_m > 1e-30:
@@ -1511,6 +1536,54 @@ def write_imatrix(path, importance_dict):
     return len(entries)
 # ─── Main ───────────────────────────────────────────────────────────────────
 def main():
@@ -1521,6 +1594,7 @@ def main():
     parser.add_argument('calibration', help='Calibration text file')
     parser.add_argument('-o', '--output', default='imatrix.dat',
                         help='Output imatrix file (default: imatrix.dat)')
     parser.add_argument('--chunks', type=int, default=10,
                         help='Number of token chunks to process (default: 10)')
     parser.add_argument('--chunk-size', type=int, default=4096,
@@ -1547,6 +1621,23 @@ def main():
     model = GGUFModel(args.model)
     config = model.get_config()
     print(f"  Architecture:  {config['arch']}")
     print(f"  Layers:        {config['n_layers']}")
     print(f"  Hidden:        {config['n_embd']}")

         self.data_offset = align_offset(f.tell())
     def get_arch(self):
+        arch = self.kv.get('general.architecture')
+        if not arch:
+            # Try to infer from tensor names
+            if any('attn_gate' in n for n in self.tensor_infos):
+                return 'gemma2'
+            return 'llama'
         return arch
     def get_config(self):
         ti = self.tensor_infos[name]
         abs_offset = self.data_offset + ti['offset']
         raw = bytes(self._mm[abs_offset:abs_offset + ti['data_size']])
+        try:
+            return dequantize(raw, ti['type'], ti['n_elements'])
+        except ValueError as e:
+            print(f"  Error dequantizing {name}: {e}")
+            return None
     def get_tensor_shape(self, name):
         """Return the shape of a tensor (GGUF stores reversed dims)."""
     qs = data[:, 2:18]  # 16 bytes = 32 nibbles
     lo = (qs & 0xF).astype(np.float32) - 8.0
     hi = (qs >> 4).astype(np.float32) - 8.0
+    # Correct nibble interleaving: [lo0, hi0, lo1, hi1, ...]
+    x = np.stack([lo, hi], axis=2).reshape(n_blocks, 32)
     return (d * x).reshape(-1)[:n_elements]
 def dequant_q2k(raw, n_elements):
         dmin = dmin_fp16[blk]
         for half in range(2):
             for sub in range(4):
+                j = half * 4 + sub  # Corrected index: 0-3 and 4-7
                 sc = int(scales_packed[blk, j]) & 0xF
                 mn = int(scales_packed[blk, j]) >> 4
                 d_sub = d * sc
         """Encode text and split into fixed-length chunks."""
         ids = self.encode(text)
         chunks = []
+        # Stride is 3/4 of chunk_size, so consecutive chunks overlap by 25%
+        # (down from 50%): a middle ground between coverage and speed.
+        stride = chunk_size * 3 // 4
+        for i in range(0, len(ids) - chunk_size + 1, stride):
             chunk = ids[i:i + chunk_size]
+            chunks.append(np.array(chunk, dtype=np.int32))
         if not chunks and ids:
             # Pad short text
             padded = ids + [self.eos_id] * (chunk_size - len(ids))
         n_embd = cfg['n_embd']
         n_head = cfg['n_head']
         n_head_kv = cfg['n_head_kv']
+        if isinstance(n_head_kv, list):
+            n_head_kv = n_head_kv[layer_idx]
         head_dim = self.head_dim
         eps = cfg['rms_eps']
         # Read back importance for the tensors that WERE processed in C
         for name, arr, cnt in imp_refs:
+            # Extract value from ctypes byref pointer
             self.importance[name] = (arr.astype(np.float64), cnt.value)
         # Handle MoE FFN if C code skipped it
             imp_f32 = self.importance[name][0].astype(np.float32)
             count = ctypes.c_int64(self.importance[name][1])
+            # Pass real weights to C library for importance recording
+            weight_ptr = weight.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
             # Dummy output — we only want the importance recording
             dummy_out = np.empty((M, 1), dtype=np.float32)
             self._hpc_lib.hexstate_matmul_record(
                 x.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+                weight_ptr,
                 dummy_out.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
                 imp_f32.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+                ctypes.c_int64(M), ctypes.c_int64(K), ctypes.c_int64(N),
                 ctypes.byref(count))
             self.importance[name] = (imp_f32.astype(np.float64), count.value)
         cfg = self.cfg
         n_head = cfg['n_head']
         n_head_kv = cfg['n_head_kv']
+        if isinstance(n_head_kv, list):
+            n_head_kv = n_head_kv[layer_idx]
         seq_len = hidden.shape[0]
         # ── Attention norm ──
         cfg = self.cfg
         n_head = cfg['n_head']
         n_head_kv = cfg['n_head_kv']
+        if isinstance(n_head_kv, list):
+            n_head_kv = n_head_kv[layer_idx]
         head_dim = self.head_dim
         seq_len = hidden.shape[0]
         cfg = self.cfg
         n_head = cfg['n_head']
         n_head_kv = cfg['n_head_kv']
+        if isinstance(n_head_kv, list):
+            n_head_kv = n_head_kv[layer_idx]
         head_dim = self.head_dim
         seq_len = hidden.shape[0]
                 n_nbr += 1
             if n_nbr > 0:
                 e_nbr /= n_nbr
+            # Clamp energy to prevent exponential explosion (max exp(5) ~ 148)
+            energy = np.clip((e_self + 0.3 * e_nbr) / temperature, -10, 5)
+            new_mult[i] = np.exp(energy)
         mean_m = np.mean(new_mult)
         if mean_m > 1e-30:
     return len(entries)
+def load_hf_config(config_path):
+    """Load a HuggingFace config.json and extract architecture info.
+    Maps HF keys to internal generate_imatrix.py keys:
+      hidden_size -> n_embd
+      num_hidden_layers -> n_layers
+      num_attention_heads -> n_head
+      num_key_value_heads -> n_head_kv
+      intermediate_size -> n_ff
+      vocab_size -> vocab_size
+      rms_norm_eps -> rms_eps
+      rope_theta -> rope_base
+      model_type -> arch
+    """
+    import json
+    with open(config_path, 'r') as f:
+        raw = json.load(f)
+    src = raw
+    if 'text_config' in raw and 'hidden_size' not in raw:
+        src = raw['text_config']
+    cfg = {}
+    cfg['arch'] = src.get('model_type', raw.get('model_type', 'unknown'))
+    cfg['n_embd'] = src.get('hidden_size', 0)
+    cfg['n_layers'] = src.get('num_hidden_layers', 0)
+    cfg['n_head'] = src.get('num_attention_heads', 0)
+    cfg['n_head_kv'] = src.get('num_key_value_heads', 0)
+    cfg['n_ff'] = src.get('intermediate_size', 0)
+    cfg['vocab_size'] = src.get('vocab_size', 0)
+    cfg['rms_eps'] = src.get('rms_norm_eps', 1e-6)
+    rope_params = src.get('rope_parameters', {})
+    cfg['rope_base'] = rope_params.get('rope_theta',
+                          src.get('rope_theta', 10000.0))
+    cfg['expert_count'] = src.get('num_local_experts', src.get('num_experts', 0))
+    cfg['expert_used_count'] = src.get('num_experts_per_tok', 0)
+    # head_dim fallback
+    if src.get('head_dim'):
+        cfg['head_dim'] = src['head_dim']
+    elif cfg['n_head'] > 0:
+        cfg['head_dim'] = cfg['n_embd'] // cfg['n_head']
+    return cfg
 # ─── Main ───────────────────────────────────────────────────────────────────
 def main():
     parser.add_argument('calibration', help='Calibration text file')
     parser.add_argument('-o', '--output', default='imatrix.dat',
                         help='Output imatrix file (default: imatrix.dat)')
+    parser.add_argument('--config', help='Optional HuggingFace config.json')
     parser.add_argument('--chunks', type=int, default=10,
                         help='Number of token chunks to process (default: 10)')
     parser.add_argument('--chunk-size', type=int, default=4096,
     model = GGUFModel(args.model)
     config = model.get_config()
+    # ── Load/Merge config.json ──
+    cfg_path = args.config
+    if not cfg_path:
+        # Auto-lookup in model directory
+        model_dir = os.path.dirname(os.path.abspath(args.model))
+        potential_cfg = os.path.join(model_dir, 'config.json')
+        if os.path.exists(potential_cfg):
+            cfg_path = potential_cfg
+    if cfg_path:
+        print(f"  Merging config from: {cfg_path}")
+        hf_cfg = load_hf_config(cfg_path)
+        # Override GGUF values with every HF config value that is not None.
+        # NOTE: load_hf_config defaults missing numeric keys to 0, so those zeros override too.
+        for k, v in hf_cfg.items():
+            if v is not None:
+                config[k] = v
     print(f"  Architecture:  {config['arch']}")
     print(f"  Layers:        {config['n_layers']}")
     print(f"  Hidden:        {config['n_embd']}")

hpc_forward_merged.c CHANGED Viewed

@@ -113,21 +113,30 @@ static void hpc_matmul_graph(const float *x, const float *weight, float *out,
         for (int64_t s = 0; s < n_sites - 1; s++)
             hpc_cz(g, s, s + 1);
-        /* Read importance via graph marginals */
         double fidelity = g->avg_fidelity;
         for (int64_t s = 0; s < n_sites; s++) {
             int64_t j0 = s * stride;
             int64_t j1 = (s + 1) * stride;
             if (j1 > K) j1 = K;
-            float e = col_energy[j0];
-            int phase = ((int)(e * 1e3f)) % D;
-            if (phase < 0) phase += D;
-            double marg = hpc_marginal(g, s, phase);
-            double boost = 1.0 + (marg * fidelity * D - 1.0) * 0.5;
-            if (boost < 0.5) boost = 0.5;
-            if (boost > 2.0) boost = 2.0;
-            for (int64_t j = j0; j < j1; j++)
-                importance[j] += col_energy[j] * (float)boost;
         }
         if (count) *count += M;
     }
@@ -218,6 +227,7 @@ void hexstate_forward_layer(
     const float *v_w, int64_t v_dim,
     const float *gate_w, int64_t gate_rows,
     const float *o_w, int64_t o_cols,
     /* FFN weights */
     const float *ffn_norm_w,
     const float *ffn_gate_w, const float *ffn_up_w, const float *ffn_down_w,
@@ -252,27 +262,13 @@ void hexstate_forward_layer(
         if (!qkv) { free(normed); free(attn_out); return; }
         /* Graph-based matmul: importance via HPCGraph marginals */
-        printf("matmul qkv M=%ld K=%ld N=%ld\n", (long)seq_len, (long)n_embd, (long)qkv_dim); fflush(stdout); hpc_matmul_graph(normed, qkv_w, qkv, imp_qkv, cnt_qkv,
                          seq_len, n_embd, qkv_dim, 0);
         /* Split Q, K, V */
         int64_t q_total = n_head * head_dim;
         int64_t kv_total = n_head_kv * head_dim;
-        float *Q = qkv;                              /* [seq, q_total] */
-        float *K = qkv + q_total;                    /* offset per row */
-        float *V = qkv + q_total + kv_total;         /* offset per row */
-        /* ── HPC Linear Attention: graph IS the attention ──
-         *
-         * Create HPCGraph with n_head sites.
-         * Each head is a site. K·V interaction energy → quhit amplitude.
-         * CZ edges between adjacent heads → cross-head phase coherence.
-         * hpc_marginal(h) → attention weight for head h.
-         *
-         * Running state S[h] accumulates K⊗V, weighted by coherence.
-         * This is causal linear attention where the HPC graph determines
-         * HOW MUCH each head contributes at each timestep.
-         */
         HPCGraph *attn_graph = hpc_create(n_head);
         float *S = (float *)calloc(n_head * head_dim * head_dim, sizeof(float));
         float *z_acc = (float *)calloc(n_head * head_dim, sizeof(float));
@@ -288,14 +284,13 @@ void hexstate_forward_layer(
                 /* Encode K·V energy into graph sites */
                 for (int64_t h = 0; h < n_head; h++) {
-                    int64_t kv_h = h % n_head_kv;  /* GQA mapping */
                     float *kh = kt_base + kv_h * head_dim;
                     float *vh = vt_base + kv_h * head_dim;
                     float energy = 0.0f;
                     for (int64_t d = 0; d < head_dim; d++)
                         energy += kh[d] * vh[d];
-                    /* Triality encode energy → D=6 quhit amplitude */
                     double re[D] = {0}, im[D] = {0};
                     float ae = fabsf(energy) + 1e-6f;
                     int ph = ((int)(ae * 100.0f)) % D;
@@ -306,11 +301,9 @@ void hexstate_forward_layer(
                     hpc_set_local(attn_graph, h, re, im);
                 }
-                /* CZ-couple adjacent heads: creates cross-head entanglement */
                 for (int64_t h = 0; h < n_head - 1; h++)
                     hpc_cz(attn_graph, h, h + 1);
-                /* Compute attention output per head using graph marginals */
                 #pragma omp parallel for schedule(static)
                 for (int64_t h = 0; h < n_head; h++) {
                     int64_t kv_h = h % n_head_kv;
@@ -320,7 +313,6 @@ void hexstate_forward_layer(
                     float *Sh = S + h * head_dim * head_dim;
                     float *zh = z_acc + h * head_dim;
-                    /* Get HPC marginal: phase-coherent weight for this head */
                     float ae = 0.0f;
                     for (int64_t d = 0; d < head_dim; d++)
                         ae += fabsf(kh[d] * vh[d]);
@@ -331,14 +323,14 @@ void hexstate_forward_layer(
                     if (coherence < 0.1f) coherence = 0.1f;
                     if (coherence > 3.0f) coherence = 3.0f;
-                    /* Feature map: φ(x) = max(x,0) + ε */
-                    float qf[256], kf[256];
                     for (int64_t d = 0; d < head_dim; d++) {
                         qf[d] = (qh[d] > 0 ? qh[d] : 0) + 1e-6f;
                         kf[d] = (kh[d] > 0 ? kh[d] : 0) + 1e-6f;
                     }
-                    /* Accumulate: S += coherence × outer(kf, v) */
                     for (int64_t d1 = 0; d1 < head_dim; d1++) {
                         float ks = kf[d1] * coherence;
                         for (int64_t d2 = 0; d2 < head_dim; d2++)
@@ -347,37 +339,31 @@ void hexstate_forward_layer(
                     for (int64_t d = 0; d < head_dim; d++)
                         zh[d] += kf[d] * coherence;
-                    /* Output: (qf @ S) / (qf · z) */
                     float den = 1e-8f;
                     for (int64_t d = 0; d < head_dim; d++)
                         den += qf[d] * zh[d];
                     float inv_den = 1.0f / den;
-                    /* Write to attn_inner at position [t, h*head_dim ... ] */
                     float *ao = attn_inner + t * inner_dim;
                     for (int64_t d2 = 0; d2 < head_dim; d2++) {
                         float num = 0.0f;
                         for (int64_t d1 = 0; d1 < head_dim; d1++)
                             num += qf[d1] * Sh[d1 * head_dim + d2];
-                        /* Accumulate into attn_inner (multiple heads write here) */
                         ao[h * head_dim + d2] = num * inv_den;
                     }
                 }
-                /* Compact graph edges periodically */
                 if (t > 0 && t % 64 == 0)
                     hpc_compact_edges(attn_graph);
             }
         }
-        /* Gate projection if present */
         if (gate_w && gate_rows > 0) {
-            int trans_w = (gate_rows == inner_dim) ? 1 : 0;
-            int64_t N_out = trans_w ? n_embd : gate_rows;
             float *gated = (float *)malloc(seq_len * N_out * sizeof(float));
             if (gated) {
-                printf("matmul gate M=%ld K=%ld N=%ld trans=%d\n", (long)seq_len, (long)inner_dim, (long)N_out, trans_w); fflush(stdout); hpc_matmul_graph(attn_inner, gate_w, gated, imp_gate, cnt_gate,
-                                seq_len, inner_dim, N_out, trans_w);
                 for (int64_t t = 0; t < seq_len; t++) {
                     int64_t copy_dim = N_out < n_embd ? N_out : n_embd;
                     memcpy(attn_out + t * n_embd, gated + t * N_out, copy_dim * sizeof(float));
@@ -391,7 +377,6 @@ void hexstate_forward_layer(
             }
         }
         if (attn_inner) free(attn_inner);
         if (attn_graph) hpc_destroy(attn_graph);
         free(S); free(z_acc); free(qkv);
@@ -410,7 +395,6 @@ void hexstate_forward_layer(
         hpc_matmul_graph(normed, k_w, K_buf, imp_k, cnt_k, seq_len, n_embd, k_dim, 0);
         hpc_matmul_graph(normed, v_w, V_buf, imp_v, cnt_v, seq_len, n_embd, v_dim, 0);
-        /* Same HPC attention as above but with separate Q/K/V buffers */
         int64_t hd_q = q_dim / n_head;
         int64_t hd_kv = k_dim / n_head_kv;
         int64_t inner_dim = n_head * hd_kv;
@@ -421,7 +405,6 @@ void hexstate_forward_layer(
         if (attn_graph && S && z_acc && attn_inner) {
             for (int64_t t = 0; t < seq_len; t++) {
-                /* Encode heads into graph */
                 for (int64_t h = 0; h < n_head; h++) {
                     int64_t kv_h = h % n_head_kv;
                     float *kh = K_buf + t * k_dim + kv_h * hd_kv;
@@ -449,34 +432,39 @@ void hexstate_forward_layer(
                     float *zh = z_acc + h * hd_kv;
                     int64_t feat = hd_q < hd_kv ? hd_q : hd_kv;
-                    float ae = fabsf(kh[0]*vh[0]) + 1e-6f;
                     int ph = ((int)(ae * 100.0f)) % D;
                     double coh_raw = hpc_marginal(attn_graph, h, ph);
                     float coh = (float)(coh_raw * D);
                     if (coh < 0.1f) coh = 0.1f;
                     if (coh > 3.0f) coh = 3.0f;
                     for (int64_t d1 = 0; d1 < feat; d1++) {
-                        float kf = (kh[d1] > 0 ? kh[d1] : 0) + 1e-6f;
-                        float ks = kf * coh;
                         for (int64_t d2 = 0; d2 < hd_kv; d2++)
                             Sh[d1*hd_kv+d2] += ks * vh[d2];
-                        zh[d1] += kf * coh;
                     }
                     float den = 1e-8f;
-                    for (int64_t d = 0; d < feat; d++) {
-                        float qf = (qh[d] > 0 ? qh[d] : 0) + 1e-6f;
-                        den += qf * zh[d];
-                    }
                     float inv_den = 1.0f / den;
                     float *ao = attn_inner + t * inner_dim;
                     for (int64_t d2 = 0; d2 < hd_kv; d2++) {
                         float num = 0.0f;
-                        for (int64_t d1 = 0; d1 < feat; d1++) {
-                            float qf = (qh[d1] > 0 ? qh[d1] : 0) + 1e-6f;
-                            num += qf * Sh[d1*hd_kv+d2];
-                        }
                         ao[h*hd_kv+d2] = num * inv_den;
                     }
                 }
@@ -485,31 +473,14 @@ void hexstate_forward_layer(
             }
         }
-        /* Output projection */
         if (o_w && o_cols > 0) {
-            float *proj_in = attn_inner;
-            int free_proj_in = 0;
-            if (inner_dim != o_cols) {
-                proj_in = (float *)calloc(seq_len * o_cols, sizeof(float));
-                if (proj_in) {
-                    for (int64_t t = 0; t < seq_len; t++) {
-                        int64_t copy_dim = inner_dim < o_cols ? inner_dim : o_cols;
-                        memcpy(proj_in + t * o_cols, attn_inner + t * inner_dim, copy_dim * sizeof(float));
-                    }
-                    free_proj_in = 1;
-                } else {
-                    proj_in = attn_inner;
-                }
-            }
             float *projected = (float *)calloc(seq_len * n_embd, sizeof(float));
             if (projected) {
-                hpc_matmul_graph(proj_in, o_w, projected, imp_o, cnt_o,
-                                seq_len, o_cols, n_embd, 0);
                 memcpy(attn_out, projected, seq_len * n_embd * sizeof(float));
                 free(projected);
             }
-            if (free_proj_in && proj_in != attn_inner) free(proj_in);
         } else {
             for (int64_t t = 0; t < seq_len; t++) {
                 int64_t copy_dim = inner_dim < n_embd ? inner_dim : n_embd;
@@ -517,19 +488,16 @@ void hexstate_forward_layer(
             }
         }
         if (attn_inner) free(attn_inner);
         if (attn_graph) hpc_destroy(attn_graph);
         free(S); free(z_acc);
         free(Q); free(K_buf); free(V_buf);
     }
-    /* Residual add: hidden += attn_out */
     int64_t total = seq_len * n_embd;
     #pragma omp parallel for schedule(static)
     for (int64_t i = 0; i < total; i++)
         hidden[i] += attn_out[i];
-    /* ══════════════ Phase 3: FFN ══════════════ */
     if (ffn_norm_w && ffn_gate_w && ffn_up_w && ffn_down_w && ffn_dim > 0) {
         float *normed_ff = (float *)malloc(seq_len * n_embd * sizeof(float));
         float *gate_out = (float *)malloc(seq_len * ffn_dim * sizeof(float));
@@ -537,36 +505,29 @@ void hexstate_forward_layer(
         if (normed_ff && gate_out && up_out) {
             hpc_rms_norm(hidden, ffn_norm_w, normed_ff, seq_len, n_embd, eps);
-            /* Graph-based matmul for FFN with importance */
             hpc_matmul_graph(normed_ff, ffn_gate_w, gate_out,
                             imp_ffn_gate, cnt_ffn_gate, seq_len, n_embd, ffn_dim, 0);
             hpc_matmul_graph(normed_ff, ffn_up_w, up_out,
                             imp_ffn_up, cnt_ffn_up, seq_len, n_embd, ffn_dim, 0);
-            /* SiLU(gate) * up */
             hpc_silu(gate_out, seq_len * ffn_dim);
             #pragma omp parallel for schedule(static)
             for (int64_t i = 0; i < seq_len * ffn_dim; i++)
                 gate_out[i] *= up_out[i];
-            /* Down projection: graph-based importance recording */
             float *ff_out_buf = (float *)malloc(seq_len * n_embd * sizeof(float));
             if (ff_out_buf) {
                 hpc_matmul_graph(gate_out, ffn_down_w, ff_out_buf,
                                 imp_ffn_down, cnt_ffn_down,
                                 seq_len, ffn_dim, n_embd, 0);
-                /* Residual add */
                 #pragma omp parallel for schedule(static)
                 for (int64_t i = 0; i < total; i++)
                     hidden[i] += ff_out_buf[i];
                 free(ff_out_buf);
             }
         }
         free(normed_ff); free(gate_out); free(up_out);
     }
     free(normed);
     free(attn_out);
 }

         for (int64_t s = 0; s < n_sites - 1; s++)
             hpc_cz(g, s, s + 1);
+        /* Read importance via graph marginals.
+         * The bucket marginal (marg) is computed once per stride window from
+         * col_energy[j0]; the boost depends only on marg and fidelity, so every
+         * column in the window shares it and is weighted by its own col_energy[j].
+         * NOTE: the per-column phase computed in the loop below is not used. */
         double fidelity = g->avg_fidelity;
         for (int64_t s = 0; s < n_sites; s++) {
             int64_t j0 = s * stride;
             int64_t j1 = (s + 1) * stride;
             if (j1 > K) j1 = K;
+            /* Bucket-level marginal: computed once per site (cheap) */
+            float e0 = col_energy[j0];
+            int phase0 = ((int)(e0 * 1e3f)) % D;
+            if (phase0 < 0) phase0 += D;
+            double marg = hpc_marginal(g, s, phase0);
+            /* Per-column accumulation: each column's own energy times the window's boost */
+            for (int64_t j = j0; j < j1; j++) {
+                float e = col_energy[j];
+                int phase = ((int)(e * 1e3f)) % D;
+                if (phase < 0) phase += D;
+                double boost = 1.0 + (marg * fidelity * D - 1.0) * 0.5;
+                if (boost < 0.5) boost = 0.5;
+                if (boost > 2.0) boost = 2.0;
+                importance[j] += e * (float)boost;
+            }
         }
         if (count) *count += M;
     }
     const float *v_w, int64_t v_dim,
     const float *gate_w, int64_t gate_rows,
     const float *o_w, int64_t o_cols,
+    int gate_trans,  /* New: explicit transpose flag */
     /* FFN weights */
     const float *ffn_norm_w,
     const float *ffn_gate_w, const float *ffn_up_w, const float *ffn_down_w,
         if (!qkv) { free(normed); free(attn_out); return; }
         /* Graph-based matmul: importance via HPCGraph marginals */
+        hpc_matmul_graph(normed, qkv_w, qkv, imp_qkv, cnt_qkv,
                          seq_len, n_embd, qkv_dim, 0);
         /* Split Q, K, V */
         int64_t q_total = n_head * head_dim;
         int64_t kv_total = n_head_kv * head_dim;
         HPCGraph *attn_graph = hpc_create(n_head);
         float *S = (float *)calloc(n_head * head_dim * head_dim, sizeof(float));
         float *z_acc = (float *)calloc(n_head * head_dim, sizeof(float));
                 /* Encode K·V energy into graph sites */
                 for (int64_t h = 0; h < n_head; h++) {
+                    int64_t kv_h = h % n_head_kv;
                     float *kh = kt_base + kv_h * head_dim;
                     float *vh = vt_base + kv_h * head_dim;
                     float energy = 0.0f;
                     for (int64_t d = 0; d < head_dim; d++)
                         energy += kh[d] * vh[d];
                     double re[D] = {0}, im[D] = {0};
                     float ae = fabsf(energy) + 1e-6f;
                     int ph = ((int)(ae * 100.0f)) % D;
                     hpc_set_local(attn_graph, h, re, im);
                 }
                 for (int64_t h = 0; h < n_head - 1; h++)
                     hpc_cz(attn_graph, h, h + 1);
                 #pragma omp parallel for schedule(static)
                 for (int64_t h = 0; h < n_head; h++) {
                     int64_t kv_h = h % n_head_kv;
                     float *Sh = S + h * head_dim * head_dim;
                     float *zh = z_acc + h * head_dim;
                     float ae = 0.0f;
                     for (int64_t d = 0; d < head_dim; d++)
                         ae += fabsf(kh[d] * vh[d]);
                     if (coherence < 0.1f) coherence = 0.1f;
                     if (coherence > 3.0f) coherence = 3.0f;
+                    /* Safe buffer allocation for any head_dim */
+                    float *qf = (float *)alloca(head_dim * sizeof(float));
+                    float *kf = (float *)alloca(head_dim * sizeof(float));
                     for (int64_t d = 0; d < head_dim; d++) {
                         qf[d] = (qh[d] > 0 ? qh[d] : 0) + 1e-6f;
                         kf[d] = (kh[d] > 0 ? kh[d] : 0) + 1e-6f;
                     }
                     for (int64_t d1 = 0; d1 < head_dim; d1++) {
                         float ks = kf[d1] * coherence;
                         for (int64_t d2 = 0; d2 < head_dim; d2++)
                     for (int64_t d = 0; d < head_dim; d++)
                         zh[d] += kf[d] * coherence;
                     float den = 1e-8f;
                     for (int64_t d = 0; d < head_dim; d++)
                         den += qf[d] * zh[d];
                     float inv_den = 1.0f / den;
                     float *ao = attn_inner + t * inner_dim;
                     for (int64_t d2 = 0; d2 < head_dim; d2++) {
                         float num = 0.0f;
                         for (int64_t d1 = 0; d1 < head_dim; d1++)
                             num += qf[d1] * Sh[d1 * head_dim + d2];
                         ao[h * head_dim + d2] = num * inv_den;
                     }
                 }
                 if (t > 0 && t % 64 == 0)
                     hpc_compact_edges(attn_graph);
             }
         }
         if (gate_w && gate_rows > 0) {
+            int64_t N_out = gate_trans ? n_embd : gate_rows;
             float *gated = (float *)malloc(seq_len * N_out * sizeof(float));
             if (gated) {
+                hpc_matmul_graph(attn_inner, gate_w, gated, imp_gate, cnt_gate,
+                                seq_len, inner_dim, N_out, gate_trans);
                 for (int64_t t = 0; t < seq_len; t++) {
                     int64_t copy_dim = N_out < n_embd ? N_out : n_embd;
                     memcpy(attn_out + t * n_embd, gated + t * N_out, copy_dim * sizeof(float));
             }
         }
         if (attn_inner) free(attn_inner);
         if (attn_graph) hpc_destroy(attn_graph);
         free(S); free(z_acc); free(qkv);
         hpc_matmul_graph(normed, k_w, K_buf, imp_k, cnt_k, seq_len, n_embd, k_dim, 0);
         hpc_matmul_graph(normed, v_w, V_buf, imp_v, cnt_v, seq_len, n_embd, v_dim, 0);
         int64_t hd_q = q_dim / n_head;
         int64_t hd_kv = k_dim / n_head_kv;
         int64_t inner_dim = n_head * hd_kv;
         if (attn_graph && S && z_acc && attn_inner) {
             for (int64_t t = 0; t < seq_len; t++) {
                 for (int64_t h = 0; h < n_head; h++) {
                     int64_t kv_h = h % n_head_kv;
                     float *kh = K_buf + t * k_dim + kv_h * hd_kv;
                     float *zh = z_acc + h * hd_kv;
                     int64_t feat = hd_q < hd_kv ? hd_q : hd_kv;
+                    float ae = 0.0f;
+                    for(int64_t d=0; d<hd_kv; d++) ae += fabsf(kh[d]*vh[d]);
+                    ae += 1e-6f;
                     int ph = ((int)(ae * 100.0f)) % D;
                     double coh_raw = hpc_marginal(attn_graph, h, ph);
                     float coh = (float)(coh_raw * D);
                     if (coh < 0.1f) coh = 0.1f;
                     if (coh > 3.0f) coh = 3.0f;
+                    float *qf = (float *)alloca(feat * sizeof(float));
+                    float *kf = (float *)alloca(feat * sizeof(float));
+                    for (int64_t d = 0; d < feat; d++) {
+                        qf[d] = (qh[d] > 0 ? qh[d] : 0) + 1e-6f;
+                        kf[d] = (kh[d] > 0 ? kh[d] : 0) + 1e-6f;
+                    }
                     for (int64_t d1 = 0; d1 < feat; d1++) {
+                        float ks = kf[d1] * coh;
                         for (int64_t d2 = 0; d2 < hd_kv; d2++)
                             Sh[d1*hd_kv+d2] += ks * vh[d2];
+                        zh[d1] += kf[d1] * coh;
                     }
                     float den = 1e-8f;
+                    for (int64_t d = 0; d < feat; d++)
+                        den += qf[d] * zh[d];
                     float inv_den = 1.0f / den;
                     float *ao = attn_inner + t * inner_dim;
                     for (int64_t d2 = 0; d2 < hd_kv; d2++) {
                         float num = 0.0f;
+                        for (int64_t d1 = 0; d1 < feat; d1++)
+                            num += qf[d1] * Sh[d1*hd_kv+d2];
                         ao[h*hd_kv+d2] = num * inv_den;
                     }
                 }
             }
         }
         if (o_w && o_cols > 0) {
             float *projected = (float *)calloc(seq_len * n_embd, sizeof(float));
             if (projected) {
+                hpc_matmul_graph(attn_inner, o_w, projected, imp_o, cnt_o,
+                                seq_len, inner_dim, n_embd, 0);
                 memcpy(attn_out, projected, seq_len * n_embd * sizeof(float));
                 free(projected);
             }
         } else {
             for (int64_t t = 0; t < seq_len; t++) {
                 int64_t copy_dim = inner_dim < n_embd ? inner_dim : n_embd;
             }
         }
         if (attn_inner) free(attn_inner);
         if (attn_graph) hpc_destroy(attn_graph);
         free(S); free(z_acc);
         free(Q); free(K_buf); free(V_buf);
     }
     int64_t total = seq_len * n_embd;
     #pragma omp parallel for schedule(static)
     for (int64_t i = 0; i < total; i++)
         hidden[i] += attn_out[i];
     if (ffn_norm_w && ffn_gate_w && ffn_up_w && ffn_down_w && ffn_dim > 0) {
         float *normed_ff = (float *)malloc(seq_len * n_embd * sizeof(float));
         float *gate_out = (float *)malloc(seq_len * ffn_dim * sizeof(float));
         if (normed_ff && gate_out && up_out) {
             hpc_rms_norm(hidden, ffn_norm_w, normed_ff, seq_len, n_embd, eps);
             hpc_matmul_graph(normed_ff, ffn_gate_w, gate_out,
                             imp_ffn_gate, cnt_ffn_gate, seq_len, n_embd, ffn_dim, 0);
             hpc_matmul_graph(normed_ff, ffn_up_w, up_out,
                             imp_ffn_up, cnt_ffn_up, seq_len, n_embd, ffn_dim, 0);
             hpc_silu(gate_out, seq_len * ffn_dim);
             #pragma omp parallel for schedule(static)
             for (int64_t i = 0; i < seq_len * ffn_dim; i++)
                 gate_out[i] *= up_out[i];
             float *ff_out_buf = (float *)malloc(seq_len * n_embd * sizeof(float));
             if (ff_out_buf) {
                 hpc_matmul_graph(gate_out, ffn_down_w, ff_out_buf,
                                 imp_ffn_down, cnt_ffn_down,
                                 seq_len, ffn_dim, n_embd, 0);
                 #pragma omp parallel for schedule(static)
                 for (int64_t i = 0; i < total; i++)
                     hidden[i] += ff_out_buf[i];
                 free(ff_out_buf);
             }
         }
         free(normed_ff); free(gate_out); free(up_out);
     }
     free(normed);
     free(attn_out);
 }