HPC-Quantize / hexstate_quantize.c

Upload 5 files

7803d72 verified 18 days ago

196 kB

	/* ═══════════════════════════════════════════════════════════════════════════
	* hexstate_quantize.c — HExState GGUF Quantizer
	*
	* ╔═══════════════════════════════════════════════════════════════╗
	* ║ HPC-Optimized GGUF Quantization Engine ║
	* ║ ║
	* ║ Architecture: HPCGraph Sensitivity Propagation ║
	* ║ Optimization: Complex Amplitude BP + MCMC Scale Search ║
	* ║ Enhancements: MSE Grid Search, Importance Matrix Weighting ║
	* ║ Output: GGUF v3 (Q2_K) ║
	* ║ ║
	* ║ "The weight and the quantized are opposite faces." ║
	* ╚═══════════════════════════════════════════════════════════════╝
	*
	* This tool adapts the HExState HPC Ouroboros factoring engine for
	* LLM weight quantization. The core mathematical machinery is reused:
	*
	* Factoring Domain → Quantization Domain
	* ─────────────────────────────────────────────────
	* HPCGraph + CZ edges → Block sensitivity graph
	* Complex Amplitude BP → Importance propagation
	* MCMC period sampler → Optimal scale search
	* try_period() validation → Error bound checking
	* LLL lattice reduction → (future) Adaptive bit allocation
	*
	* Additional techniques ported from llm-compressor:
	* MSE grid search → Optimal min/max range shrinking
	* Importance matrix (imatrix) → Per-channel error weighting
	*
	* Build:
	* make -f Makefile.quantize
	*
	* Usage:
	* ./hexstate_quantize <input> <output.gguf> [options]
	*
	* Input can be:
	* - A single .safetensors file
	* - A model directory containing sharded .safetensors files
	*
	* Options:
	* --optimizer hpc|mse|hybrid Scale optimization strategy (default: hybrid)
	* --imatrix <file> Importance matrix for weighted quantization
	* --verbose Per-block diagnostics
	* ═══════════════════════════════════════════════════════════════════════════ */

	#include <stdio.h>
	#ifdef _OPENMP
	#include <omp.h>
	#endif
	#include <stdlib.h>
	#include <string.h>
	#include <math.h>
	#include <time.h>
	#include <sys/stat.h>
	#include <mpfr.h>

	/* HExState headers — reused from the factoring engine */
	#include "quhit_triality.h"
	#include "hpc_graph.h"
	#include "hpc_mobius.h"
	#include "s6_exotic.h"

	/* Quantization-specific headers */
	#include "gguf_format.h"
	#include "safetensors_reader.h"
	#include "tokenizer_reader.h"
	#include "imatrix_reader.h"

	#define D 6 /* Preserved from HExState — the triality dimension */

	/* ═══════════════════════════════════════════════════════════════════════════
	* OPTIMIZER MODE
	* ═══════════════════════════════════════════════════════════════════════════ */

	/* Scale-optimization strategy selected on the command line. */
	typedef enum {
		OPT_HPC    = 0, /* HExState BP only */
		OPT_MSE    = 1, /* MSE grid search only */
		OPT_HYBRID = 2  /* HPC sensitivity + MSE */
	} OptimizerMode;

	/* ═══════════════════════════════════════════════════════════════════════════
	* MODEL ARCHITECTURE AUTO-DETECTION
	*
	* Infers model architecture metadata from tensor names and shapes.
	* Supports: LLaMA, Mistral, Qwen2, Phi-3, Gemma, GPT-NeoX, Falcon, DeepSeek
	* ═══════════════════════════════════════════════════════════════════════════ */

	/* Model metadata gathered by detect_architecture(), either from config.json
	 * or inferred from tensor names and shapes. Fields that cannot be detected
	 * are filled with defaults by detect_architecture(). */
	typedef struct {
	char architecture[64]; /* GGUF architecture string: "llama", "phi3", "gemma", etc. */
	char name[256]; /* Human-readable model name */
	uint32_t block_count; /* Number of transformer layers (highest layer index + 1) */
	uint32_t embedding_length; /* Hidden dimension */
	uint32_t head_count; /* Number of attention heads */
	uint32_t head_count_kv; /* Number of KV heads (GQA); falls back to head_count */
	uint32_t vocab_size; /* Vocabulary size */
	uint32_t context_length; /* Max context length (4096 unless config.json overrides it) */
	float rope_freq_base; /* RoPE frequency base (10000.0 unless overridden) */
	uint32_t feed_forward_length; /* FFN intermediate size */
	float rms_norm_eps; /* RMS norm epsilon (1e-5 unless overridden) */
	int has_bias; /* Non-zero when layer 0 has a q_proj bias tensor */
	int tie_word_embeddings; /* Whether output = embed_tokens (copied from config.json) */
	} ModelArchitecture;

	/* Count tensor names matching a pattern prefix */
	static int count_tensors_with_prefix(const STMultiFile mf, const char prefix)
	{
	int count = 0;
	int prefix_len = strlen(prefix);
	for (int i = 0; i < mf->n_tensors; i++) {
	if (strncmp(mf->tensor_map[i].name, prefix, prefix_len) == 0)
	count++;
	}
	return count;
	}

	/* Find max layer index from tensor names like "model.layers.N.xxx" */
	static int find_max_layer_index(const STMultiFile mf, const char layer_prefix)
	{
	int max_idx = -1;
	int prefix_len = strlen(layer_prefix);
	for (int i = 0; i < mf->n_tensors; i++) {
	if (strncmp(mf->tensor_map[i].name, layer_prefix, prefix_len) == 0) {
	int idx = atoi(mf->tensor_map[i].name + prefix_len);
	if (idx > max_idx) max_idx = idx;
	}
	}
	return max_idx;
	}

	/* ── Config.json reader for definitive architecture parameters ── */

	/* Values extracted from a HuggingFace config.json by parse_config_json().
	 * Every field is zero when its key was not found. */
	typedef struct {
	int valid; /* 1 once the file was opened and read, 0 otherwise */
	uint32_t hidden_size; /* "hidden_size" */
	uint32_t intermediate_size; /* "intermediate_size" */
	uint32_t num_attention_heads; /* "num_attention_heads" */
	uint32_t num_key_value_heads; /* "num_key_value_heads" */
	uint32_t num_hidden_layers; /* "num_hidden_layers" */
	uint32_t vocab_size; /* "vocab_size" */
	uint32_t max_position_embeddings; /* "max_position_embeddings" */
	float rope_theta; /* "rope_theta" (top level or inside "rope_parameters") */
	float rms_norm_eps; /* "rms_norm_eps" */
	char model_type[64]; /* "model_type" string, empty when absent */
	int tie_word_embeddings; /* 1 when "tie_word_embeddings" starts with "true" */
	} ConfigJson;

	/* Store the base-10 integer value of `key` found in `scope` into *out.
	 * *out is left untouched when the key is absent. */
	static void cfgjson_read_u32(const char *scope, const char *key, uint32_t *out)
	{
		const char *p = tok_find_key(scope, key);
		if (p) *out = (uint32_t)strtol(p, NULL, 10);
	}

	/* Store the floating-point value of `key` found in `scope` into *out.
	 * *out is left untouched when the key is absent. */
	static void cfgjson_read_float(const char *scope, const char *key, float *out)
	{
		const char *p = tok_find_key(scope, key);
		if (p) *out = (float)strtod(p, NULL);
	}

	/* Read every key that may appear either at the top level of config.json
	 * or inside a nested "text_config" object. Absent keys keep their value. */
	static void cfgjson_read_shared_keys(const char *scope, ConfigJson *cfg)
	{
		const char *p;

		cfgjson_read_u32(scope, "hidden_size", &cfg->hidden_size);
		cfgjson_read_u32(scope, "intermediate_size", &cfg->intermediate_size);
		cfgjson_read_u32(scope, "num_attention_heads", &cfg->num_attention_heads);
		cfgjson_read_u32(scope, "num_key_value_heads", &cfg->num_key_value_heads);
		cfgjson_read_u32(scope, "num_hidden_layers", &cfg->num_hidden_layers);
		cfgjson_read_u32(scope, "vocab_size", &cfg->vocab_size);
		cfgjson_read_u32(scope, "max_position_embeddings", &cfg->max_position_embeddings);
		cfgjson_read_float(scope, "rms_norm_eps", &cfg->rms_norm_eps);

		p = tok_find_key(scope, "model_type");
		if (p && *p == '"') {
			char buf[64] = {0};
			tok_extract_string(p, buf, sizeof(buf));
			/* snprintf always NUL-terminates, unlike strncpy. */
			snprintf(cfg->model_type, sizeof(cfg->model_type), "%s", buf);
		}

		p = tok_find_key(scope, "tie_word_embeddings");
		if (p) cfg->tie_word_embeddings = (strncmp(p, "true", 4) == 0);
	}

	/*
	 * Read a HuggingFace config.json and extract the architecture parameters.
	 *
	 * path - file to read; NULL is treated like a missing file.
	 *
	 * Returns a ConfigJson whose `valid` field is 1 when the file was opened
	 * and read, and 0 (all fields zero) when it could not be opened, sized,
	 * allocated for, or read. Keys that are not present leave their field 0.
	 *
	 * When the top level has no "hidden_size", the keys are looked up again
	 * inside the "text_config" object (Qwen 3.5/3.6 layout), where rope_theta
	 * is taken from the nested "rope_parameters" object.
	 */
	static ConfigJson parse_config_json(const char *path)
	{
		ConfigJson cfg;
		memset(&cfg, 0, sizeof(cfg));

		if (!path) return cfg;

		FILE *f = fopen(path, "rb");
		if (!f) return cfg;

		/* ftell() returns -1 on failure; never use that as a buffer size. */
		long size = -1;
		if (fseek(f, 0, SEEK_END) == 0)
			size = ftell(f);
		if (size < 0 || fseek(f, 0, SEEK_SET) != 0) {
			fclose(f);
			return cfg;
		}

		char *json = malloc((size_t)size + 1);
		if (!json) { fclose(f); return cfg; }

		/* Terminate after the bytes actually read so a short read never
		 * exposes uninitialized memory to the key search. */
		size_t got = fread(json, 1, (size_t)size, f);
		int read_failed = ferror(f);
		fclose(f);
		if (read_failed) {
			free(json);
			return cfg;
		}
		json[got] = '\0';

		cfg.valid = 1;

		/* Simple key-value extraction at the top level */
		cfgjson_read_shared_keys(json, &cfg);
		cfgjson_read_float(json, "rope_theta", &cfg.rope_theta);

		/* ── Qwen 3.5/3.6: parameters are nested inside "text_config" ── */
		if (cfg.hidden_size == 0) {
			const char *tc = strstr(json, "\"text_config\"");
			const char *tc_brace = tc ? strchr(tc, '{') : NULL;
			if (tc_brace) {
				cfgjson_read_shared_keys(tc_brace, &cfg);

				/* Qwen3.6 rope_theta is nested in rope_parameters */
				const char *rp = strstr(tc_brace, "\"rope_parameters\"");
				if (rp)
					cfgjson_read_float(rp, "rope_theta", &cfg.rope_theta);
			}
		}

		free(json);
		return cfg;
	}

	/*
	 * Fill *arch with the model metadata needed for the GGUF header.
	 *
	 * mf               - tensor index used for name/shape based inference.
	 * arch             - output; fully overwritten.
	 * config_json_path - optional path to config.json (may be NULL).
	 *
	 * Order of precedence for every field:
	 *   1. config.json, when it could be read;
	 *   2. inference from tensor names and shapes;
	 *   3. built-in defaults (llama, 32 heads, hidden 4096, vocab 32000,
	 *      context 4096, rope base 10000, eps 1e-5, FFN = hidden * 8 / 3).
	 */
	static void detect_architecture(const STMultiFile *mf, ModelArchitecture *arch,
					const char *config_json_path)
	{
		/* HuggingFace "model_type" -> GGUF architecture string.
		 * Types not listed here keep the "llama" default. */
		static const struct {
			const char *model_type;
			const char *gguf_arch;
		} arch_by_model_type[] = {
			{"llama",        "llama"},
			{"mistral",      "llama"},
			{"qwen2",        "qwen2"},
			{"qwen2_moe",    "qwen2moe"},
			{"qwen3_5",      "qwen2"},   /* GGUF arch: qwen2 compat */
			{"qwen3_5_text", "qwen2"},
			{"qwen3_5_moe",  "qwen2"},
			{"phi3",         "phi3"},
			{"phi",          "phi3"},
			{"gemma",        "gemma"},
			{"gemma2",       "gemma"},
			{"deepseek_v2",  "llama"},
			{"gpt_neox",     "gpt_neox"},
			{"falcon",       "falcon"},
		};

		memset(arch, 0, sizeof(*arch));

		/* Default values */
		strcpy(arch->architecture, "llama");
		strcpy(arch->name, "HExState-quantized");
		arch->context_length = 4096;
		arch->rope_freq_base = 10000.0f;
		arch->rms_norm_eps = 1e-5f;

		/* ── Try config.json for definitive parameters ── */
		ConfigJson cfg = {0};
		if (config_json_path) {
			cfg = parse_config_json(config_json_path);
		}

		if (cfg.valid) {
			/* Map model_type to GGUF architecture name */
			for (size_t k = 0; k < sizeof(arch_by_model_type) / sizeof(arch_by_model_type[0]); k++) {
				if (strcmp(cfg.model_type, arch_by_model_type[k].model_type) == 0) {
					strcpy(arch->architecture, arch_by_model_type[k].gguf_arch);
					break;
				}
			}

			if (cfg.hidden_size) arch->embedding_length = cfg.hidden_size;
			if (cfg.intermediate_size) arch->feed_forward_length = cfg.intermediate_size;
			if (cfg.num_attention_heads) arch->head_count = cfg.num_attention_heads;
			if (cfg.num_key_value_heads) arch->head_count_kv = cfg.num_key_value_heads;
			if (cfg.num_hidden_layers) arch->block_count = cfg.num_hidden_layers;
			if (cfg.vocab_size) arch->vocab_size = cfg.vocab_size;
			if (cfg.max_position_embeddings) arch->context_length = cfg.max_position_embeddings;
			if (cfg.rope_theta > 0) arch->rope_freq_base = cfg.rope_theta;
			if (cfg.rms_norm_eps > 0) arch->rms_norm_eps = cfg.rms_norm_eps;
			arch->tie_word_embeddings = cfg.tie_word_embeddings;

			printf(" Architecture determined from config.json: %s\n", cfg.model_type);
		}

		/* ── Fall back to tensor name pattern detection ── */
		int has_model_layers = count_tensors_with_prefix(mf, "model.layers.");
		int has_gpt_neox = count_tensors_with_prefix(mf, "gpt_neox.");
		int has_transformer = count_tensors_with_prefix(mf, "transformer.");

		/* Architecture-specific detection */
		int has_qkv_proj = count_tensors_with_prefix(mf, "model.layers.0.self_attn.qkv_proj");
		int has_kv_a_proj = count_tensors_with_prefix(mf, "model.layers.0.self_attn.kv_a_proj_with_mqa");
		int has_final_norm = (st_multi_find_tensor(mf, "model.final_norm.weight") >= 0);

		/* Name-based guesses only apply when config.json gave no answer. */
		if (has_qkv_proj > 0 && !cfg.valid) {
			strcpy(arch->architecture, "phi3");
		} else if (has_kv_a_proj > 0 && !cfg.valid) {
			strcpy(arch->architecture, "llama"); /* DeepSeek uses llama arch */
		} else if (has_final_norm && !cfg.valid) {
			strcpy(arch->architecture, "gemma");
		}

		if (has_model_layers > 0 && arch->block_count == 0) {
			arch->block_count = (uint32_t)(find_max_layer_index(mf, "model.layers.") + 1);
		}

		/* Infer dimensions from tensor shapes if not from config.json */
		if (arch->embedding_length == 0 || arch->head_count == 0) {
			int qproj_idx = st_multi_find_tensor(mf, "model.layers.0.self_attn.q_proj.weight");
			int kproj_idx = st_multi_find_tensor(mf, "model.layers.0.self_attn.k_proj.weight");

			if (qproj_idx >= 0) {
				const STTensorInfo *ti = st_multi_tensor_info(mf, qproj_idx);

				/* shape[1] is only meaningful for a 2-D projection matrix. */
				if (ti && ti->n_dims >= 2) {
					int64_t q_out = ti->shape[0];
					int64_t hidden = ti->shape[1];
					if (arch->embedding_length == 0)
						arch->embedding_length = (uint32_t)hidden;

					/* Try common head dimensions: 128, then 96, then 64 */
					int head_dim = 128;
					if (q_out % 128 == 0) head_dim = 128;
					else if (q_out % 96 == 0) head_dim = 96;
					else if (q_out % 64 == 0) head_dim = 64;

					if (arch->head_count == 0)
						arch->head_count = (uint32_t)(q_out / head_dim);

					if (kproj_idx >= 0 && arch->head_count_kv == 0) {
						const STTensorInfo *kt = st_multi_tensor_info(mf, kproj_idx);
						arch->head_count_kv = (uint32_t)(kt->shape[0] / head_dim);
					}
				}
			}
		}

		if (arch->vocab_size == 0) {
			int embed_idx = st_multi_find_tensor(mf, "model.embed_tokens.weight");
			if (embed_idx >= 0) {
				const STTensorInfo *ti = st_multi_tensor_info(mf, embed_idx);
				arch->vocab_size = (uint32_t)ti->shape[0];
			}
		}

		if (arch->feed_forward_length == 0) {
			int gate_idx = st_multi_find_tensor(mf, "model.layers.0.mlp.gate_proj.weight");
			if (gate_idx >= 0) {
				const STTensorInfo *ti = st_multi_tensor_info(mf, gate_idx);
				arch->feed_forward_length = (uint32_t)ti->shape[0];
			} else {
				int up_idx = st_multi_find_tensor(mf, "model.layers.0.mlp.up_proj.weight");
				if (up_idx >= 0) {
					const STTensorInfo *ti = st_multi_tensor_info(mf, up_idx);
					arch->feed_forward_length = (uint32_t)ti->shape[0];
				}
			}
		}

		/* Check for attention bias */
		arch->has_bias = (st_multi_find_tensor(mf, "model.layers.0.self_attn.q_proj.bias") >= 0);

		/* Non-"model.layers." naming schemes: only used when no layer count
		 * has been established yet. */
		if (has_gpt_neox > 0 && arch->block_count == 0) {
			strcpy(arch->architecture, "gpt_neox");
			arch->block_count = (uint32_t)(find_max_layer_index(mf, "gpt_neox.layers.") + 1);
		}
		if (has_transformer > 0 && arch->block_count == 0) {
			strcpy(arch->architecture, "falcon");
			arch->block_count = (uint32_t)(find_max_layer_index(mf, "transformer.h.") + 1);
		}

		/* Fill in defaults for anything we couldn't detect */
		if (arch->head_count == 0) arch->head_count = 32;
		if (arch->head_count_kv == 0) arch->head_count_kv = arch->head_count;
		if (arch->embedding_length == 0) arch->embedding_length = 4096;
		if (arch->vocab_size == 0) arch->vocab_size = 32000;
		if (arch->feed_forward_length == 0)
			arch->feed_forward_length = (arch->embedding_length * 8) / 3; /* SwiGLU default */
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* TENSOR NAME MAPPING: HuggingFace → GGUF Standard
	*
	* Maps SafeTensors tensor names to the standardized GGUF naming
	* convention used by llama.cpp for model loading.
	*
	* Enhanced with mappings for Phi-3, Gemma, DeepSeek, MoE, and bias tensors.
	* ═══════════════════════════════════════════════════════════════════════════ */

	/*
	 * Decide whether a HuggingFace tensor is left out of the GGUF output.
	 * Returns 1 to skip the tensor, 0 to keep it.
	 */
	static int should_skip_tensor(const char *hf_name)
	{
		/* Skipped when the fragment occurs anywhere in the name. */
		static const char *const skip_fragments[] = {
			/* Rotary embeddings are computed at runtime, not stored */
			"rotary_emb.inv_freq",
			"rotary_emb.cos_cached",
			"rotary_emb.sin_cached",
			/* MTP (multi-token prediction) layers — not needed for inference */
			"model.language_model.mtp_",
		};
		/* Skipped when the name starts with the prefix
		 * (Qwen 3.6 vision encoder — all visual.* tensors). */
		static const char *const skip_prefixes[] = {
			"model.visual.",
			"visual.",
		};
		size_t k;

		for (k = 0; k < sizeof(skip_fragments) / sizeof(skip_fragments[0]); k++) {
			if (strstr(hf_name, skip_fragments[k]) != NULL)
				return 1;
		}
		for (k = 0; k < sizeof(skip_prefixes) / sizeof(skip_prefixes[0]); k++) {
			if (strncmp(hf_name, skip_prefixes[k], strlen(skip_prefixes[k])) == 0)
				return 1;
		}
		return 0;
	}

	/*
	 * Translate a HuggingFace tensor name into its GGUF name.
	 *
	 * hf_name   - NUL-terminated source name.
	 * gguf_name - output buffer; always NUL-terminated, truncated when too small.
	 * buflen    - size of gguf_name in bytes; nothing is written when <= 0.
	 *
	 * Resolution order:
	 *   1. exact top-level names (embeddings, final norm, lm_head);
	 *   2. "model.layers.N.<sub>" / "model.language_model.layers.N.<sub>"
	 *      -> "blk.N.<mapped sub>";
	 *   3. per-expert tensors "mlp.experts.E.<proj>" -> "blk.N.<mapped>.E";
	 *   4. any other layer tensor -> "blk.N.<sub>" unchanged;
	 *   5. everything else is copied through as-is.
	 *
	 * NOTE(review): several HF names map to the same GGUF name on purpose
	 * (e.g. post_attention_layernorm and pre_feedforward_layernorm both give
	 * ffn_norm) — confirm no single model carries both.
	 */
	static void map_tensor_name(const char *hf_name, char *gguf_name, int buflen)
	{
		typedef struct { const char *from; const char *to; } NameMap;

		/* Top-level mappings (common to all architectures) */
		static const NameMap top_level[] = {
			{"model.embed_tokens.weight", "token_embd.weight"},
			{"model.language_model.embed_tokens.weight", "token_embd.weight"}, /* Qwen 3.6 */
			{"model.norm.weight", "output_norm.weight"},
			{"model.language_model.norm.weight", "output_norm.weight"}, /* Qwen 3.6 */
			{"model.final_norm.weight", "output_norm.weight"}, /* Gemma */
			{"lm_head.weight", "output.weight"},
			{"model.embed_tokens.bias", "token_embd.bias"},
			{"model.norm.bias", "output_norm.bias"},
			{NULL, NULL}
		};

		/* Sub-layer names below "…layers.N." */
		static const NameMap layer_maps[] = {
			/* Standard attention projections */
			{"self_attn.q_proj.weight", "attn_q.weight"},
			{"self_attn.k_proj.weight", "attn_k.weight"},
			{"self_attn.v_proj.weight", "attn_v.weight"},
			{"self_attn.o_proj.weight", "attn_output.weight"},
			/* Attention biases */
			{"self_attn.q_proj.bias", "attn_q.bias"},
			{"self_attn.k_proj.bias", "attn_k.bias"},
			{"self_attn.v_proj.bias", "attn_v.bias"},
			{"self_attn.o_proj.bias", "attn_output.bias"},
			/* Phi-3 fused QKV */
			{"self_attn.qkv_proj.weight", "attn_qkv.weight"},
			{"self_attn.qkv_proj.bias", "attn_qkv.bias"},
			/* DeepSeek MLA */
			{"self_attn.kv_a_proj_with_mqa.weight", "attn_kv_a_mqa.weight"},
			{"self_attn.kv_b_proj.weight", "attn_kv_b.weight"},
			/* Standard FFN (SwiGLU) */
			{"mlp.gate_proj.weight", "ffn_gate.weight"},
			{"mlp.up_proj.weight", "ffn_up.weight"},
			{"mlp.down_proj.weight", "ffn_down.weight"},
			/* FFN biases */
			{"mlp.gate_proj.bias", "ffn_gate.bias"},
			{"mlp.up_proj.bias", "ffn_up.bias"},
			{"mlp.down_proj.bias", "ffn_down.bias"},
			/* MoE gate */
			{"mlp.gate.weight", "ffn_gate_inp.weight"},
			/* MoE expert weights */
			{"mlp.experts.gate_proj.weight", "ffn_gate_exps.weight"},
			{"mlp.experts.up_proj.weight", "ffn_up_exps.weight"},
			{"mlp.experts.down_proj.weight", "ffn_down_exps.weight"},
			/* Norm layers */
			{"input_layernorm.weight", "attn_norm.weight"},
			{"post_attention_layernorm.weight", "ffn_norm.weight"},
			{"input_layernorm.bias", "attn_norm.bias"},
			{"post_attention_layernorm.bias", "ffn_norm.bias"},
			/* Gemma pre/post feedforward norm */
			{"pre_feedforward_layernorm.weight", "ffn_norm.weight"},
			{"post_feedforward_layernorm.weight", "ffn_post_norm.weight"},
			/* Qwen 3.6 full attention QK norms */
			{"self_attn.q_norm.weight", "attn_q_norm.weight"},
			{"self_attn.k_norm.weight", "attn_k_norm.weight"},
			/* Qwen 3.6 DeltaNet (Gated Linear Attention) */
			{"linear_attn.in_proj_qkv.weight", "ssm_in_qkv.weight"},
			{"linear_attn.in_proj_z.weight", "ssm_in_z.weight"},
			{"linear_attn.in_proj_a.weight", "ssm_in_a.weight"},
			{"linear_attn.in_proj_b.weight", "ssm_in_b.weight"},
			{"linear_attn.out_proj.weight", "ssm_out.weight"},
			{"linear_attn.conv1d.weight", "ssm_conv1d.weight"},
			{"linear_attn.norm.weight", "ssm_norm.weight"},
			{"linear_attn.A_log", "ssm_a"},
			{"linear_attn.dt_bias", "ssm_dt.bias"},
			{NULL, NULL}
		};

		/* Per-expert projections below "mlp.experts.E." */
		static const NameMap expert_maps[] = {
			{"gate_proj.weight", "ffn_gate_exp.weight"},
			{"up_proj.weight", "ffn_up_exp.weight"},
			{"down_proj.weight", "ffn_down_exp.weight"},
			{NULL, NULL}
		};

		/* Prefix lengths come from sizeof so they can never drift from the
		 * literal (a hand-counted 27 for the 28-byte Qwen prefix used to make
		 * every "model.language_model.layers.N.*" tensor miss the mapping). */
		static const char std_layers[] = "model.layers.";
		static const char lm_layers[] = "model.language_model.layers.";

		if (!hf_name || !gguf_name || buflen <= 0)
			return;

		/* Start with identity mapping (truncating, always terminated) */
		snprintf(gguf_name, (size_t)buflen, "%s", hf_name);

		for (int m = 0; top_level[m].from; m++) {
			if (strcmp(hf_name, top_level[m].from) == 0) {
				snprintf(gguf_name, (size_t)buflen, "%s", top_level[m].to);
				return;
			}
		}

		/* Layer mappings: "model.layers.N.xxx" or
		 * "model.language_model.layers.N.xxx" → "blk.N.xxx" */
		const char *layer_tail = NULL;
		if (strncmp(hf_name, std_layers, sizeof(std_layers) - 1) == 0)
			layer_tail = hf_name + (sizeof(std_layers) - 1);
		else if (strncmp(hf_name, lm_layers, sizeof(lm_layers) - 1) == 0)
			layer_tail = hf_name + (sizeof(lm_layers) - 1);

		if (!layer_tail)
			return;

		/* Buffers are sized to match the %255s conversions below. */
		int layer_idx = 0;
		char rest[256];
		if (sscanf(layer_tail, "%d.%255s", &layer_idx, rest) != 2)
			return;

		for (int m = 0; layer_maps[m].from; m++) {
			if (strcmp(rest, layer_maps[m].from) == 0) {
				snprintf(gguf_name, (size_t)buflen, "blk.%d.%s",
					 layer_idx, layer_maps[m].to);
				return;
			}
		}

		/* MoE expert layer mapping: model.layers.N.mlp.experts.E.xxx */
		int expert_idx = 0;
		char expert_rest[256];
		if (sscanf(rest, "mlp.experts.%d.%255s", &expert_idx, expert_rest) == 2) {
			for (int m = 0; expert_maps[m].from; m++) {
				if (strcmp(expert_rest, expert_maps[m].from) == 0) {
					snprintf(gguf_name, (size_t)buflen, "blk.%d.%s.%d",
						 layer_idx, expert_maps[m].to, expert_idx);
					return;
				}
			}
		}

		/* Fallback: keep original sub-path */
		snprintf(gguf_name, (size_t)buflen, "blk.%d.%s", layer_idx, rest);
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* SHOULD THIS TENSOR BE QUANTIZED?
	*
	* Decision rules:
	* - Quantize: weight matrices (2D, large)
	* - Keep F32: norms, biases, embeddings, 1D tensors
	* ═══════════════════════════════════════════════════════════════════════════ */

	static int should_quantize(const STTensorInfo ti, const char gguf_name)
	{
	/* Never quantize 1D tensors (norms, biases) */
	if (ti->n_dims < 2) return 0;

	/* Never quantize embedding tables (row dimension = vocab) */
	if (strstr(gguf_name, "token_embd") != NULL) return 0;

	/* Never quantize LM head output — use exact match, not substring,
	* to avoid matching "attn_output.weight" */
	if (strcmp(gguf_name, "output.weight") == 0) return 0;

	/* Never quantize norm weights */
	if (strstr(gguf_name, "norm") != NULL) return 0;

	/* Never quantize bias tensors */
	if (strstr(gguf_name, ".bias") != NULL) return 0;

	/* Never quantize MoE gate routing weights */
	if (strstr(gguf_name, "ffn_gate_inp") != NULL) return 0;

	/* Never quantize DeltaNet state-space parameters (1D or small) */
	if (strstr(gguf_name, "ssm_a") != NULL) return 0; /* A_log */
	if (strstr(gguf_name, "ssm_dt") != NULL) return 0; /* dt_bias */
	if (strstr(gguf_name, "ssm_conv1d") != NULL) return 0; /* conv kernel */

	/* Quantize everything else (attention projections, FFN weights, SSM projections) */
	return 1;
	}

	/*
	 * Report whether a tensor belongs to the attention-class projections
	 * (Q/K/V/O, fused QKV, and the DeltaNet SSM in/out projections).
	 * The file treats these as the most quantization-sensitive tensors and
	 * promotes them to Q4_0.
	 *
	 * Returns 1 when gguf_name contains one of the known name fragments,
	 * 0 otherwise.
	 */
	static int is_attention_tensor(const char *gguf_name)
	{
		static const char *const attention_fragments[] = {
			/* Gemma / LLaMA style GGUF names: blk.N.attn_q/k/v/output.weight */
			"attn_q.weight",
			"attn_k.weight",
			"attn_v.weight",
			"attn_output.weight",
			"attn_qkv.weight",
			/* Qwen 3.6 DeltaNet SSM projections — treat as attention-class (Q4_0) */
			"ssm_in_qkv.weight",
			"ssm_in_z.weight",
			"ssm_out.weight",
			/* HuggingFace style (fallthrough names) */
			"self_attn.q_proj.weight",
			"self_attn.k_proj.weight",
			"self_attn.v_proj.weight",
			"self_attn.o_proj.weight",
		};
		const size_t n_fragments =
			sizeof(attention_fragments) / sizeof(attention_fragments[0]);
		size_t k = 0;

		while (k < n_fragments) {
			if (strstr(gguf_name, attention_fragments[k]))
				return 1;
			k++;
		}
		return 0;
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* HPC SENSITIVITY GRAPH BUILDER
	*
	* Creates an HPCGraph where each node represents a weight block.
	* For Q2_K: 256-weight superblocks.
	*
	* The 6 values per site correspond to 6 candidate scale factors:
	* v=0: scale * 0.60 (aggressive, high compression)
	* v=1: scale * 0.75
	* v=2: scale * 0.90
	* v=3: scale * 1.00 (standard)
	* v=4: scale * 1.15
	* v=5: scale * 1.40 (conservative, less compression error)
	*
	* BP propagates: "if your neighbor block is sensitive, you should be
	* conservative too" — creating coherent precision allocation.
	* ═══════════════════════════════════════════════════════════════════════════ */

	/* Number of entries in SCALE_MULTIPLIERS. */
	#define SCALE_FACTOR_COUNT 6
	/* Candidate scale multipliers in ascending order, from 0.60 up to 1.40;
	 * index 3 (1.00) leaves the scale unchanged. */
	static const float SCALE_MULTIPLIERS[SCALE_FACTOR_COUNT] = {
	0.60f, 0.75f, 0.90f, 1.00f, 1.15f, 1.40f
	};

	/* ── Multi-quhit expanded scale table ──
	 * N_CAND_D × N_CAND_M = 16 × 16 = 256 scale candidates are kept in one
	 * flat table. init_scale_table() fills it with multipliers spaced
	 * uniformly from 0.50 (first entry) to 1.50 (last entry). */
	#define QUHITS_PER_BLOCK 2
	#define N_CAND_D 16 /* d multiplier candidates (was 10) */
	#define N_CAND_M 16 /* dmin multiplier candidates (was 10) */
	#define TOTAL_SCALE_CANDIDATES (N_CAND_D * N_CAND_M)

	static float SCALE_TABLE[TOTAL_SCALE_CANDIDATES];
	static int scale_table_initialized = 0;

	/* Populate SCALE_TABLE on the first call; later calls return immediately
	 * and leave the table as it is. */
	static void init_scale_table(void) {
		if (!scale_table_initialized) {
			int slot = 0;

			/* Uniform spacing: slot k holds 0.50 + k / (TOTAL - 1). */
			while (slot < TOTAL_SCALE_CANDIDATES) {
				SCALE_TABLE[slot] = 0.50f + (float)slot * (1.00f / (float)(TOTAL_SCALE_CANDIDATES - 1));
				slot++;
			}
			scale_table_initialized = 1;
		}
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* THREAD-LOCAL HPCGRAPH REUSE — Eliminates 776K malloc/free cycles
	*
	* The sub-block Shor measurement uses a 16-node linear-chain graph that
	* is identical in topology every time. Instead of hpc_create()/hpc_destroy()
	* inside the OMP hot loop, we reset the same graph to a clean state.
	*
	* This function resets an existing HPCGraph with n_sites nodes to its
	* initial state: clears all edges, resets adjacency lists, reinitializes
	* locals. Zero allocations.
	* ═══════════════════════════════════════════════════════════════════════════ */
	/*
	 * Return an existing HPCGraph to a clean state so it can be reused.
	 *
	 * g       - graph to reset.
	 * n_sites - number of sites whose adjacency count and local state are reset.
	 *
	 * Edge counters and statistics are zeroed, both fidelities go back to
	 * 1.0, each adjacency list is emptied by zeroing its count, and every
	 * local state is re-run through triality_init().
	 */
	static void hpc_reset_for_subblock(HPCGraph *g, uint64_t n_sites)
	{
		uint64_t site;

		/* Edge bookkeeping */
		g->n_edges = 0;
		g->cz_edges = 0;
		g->phase_edges = 0;
		g->syntheme_edges = 0;

		/* Log and statistics */
		g->n_log = 0;
		g->min_fidelity = 1.0;
		g->avg_fidelity = 1.0;
		g->amp_evals = 0;
		g->prob_evals = 0;
		g->measurements = 0;

		/* Empty the adjacency lists: only the counts are cleared. */
		site = 0;
		while (site < n_sites) {
			g->adj[site].count = 0;
			site++;
		}

		/* Bring every local quhit state back to its initial value. */
		site = 0;
		while (site < n_sites) {
			triality_init(&g->locals[site]);
			site++;
		}
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* FAST POWER APPROXIMATION — Replaces powf(x, 2.4f) in MSE grid search
	*
	* powf() costs ~50-100 cycles. For norm=2.4: x^2.4 = x^2 × x^0.4
	* where x^0.4 = (x^2)^0.2 = (x^2)^(1/5). Use cbrtf approximation:
	* x^0.4 ≈ sqrtf(cbrtf(x^2 × x^2)) but simpler: x^2 × sqrtf(sqrtf(x))
	* is close enough for error norm purposes (~1% relative error).
	* ═══════════════════════════════════════════════════════════════════════════ */
	static inline float fast_pow_2_4(float x)
	{
	/* x^2.4 = x^2 × x^0.4. For x^0.4: use x^(2/5) = sqrt(x^(4/5))
	* x^(4/5) = (x^4)^(1/5). Approximation via sqrtf chain:
	* x^0.4 ≈ sqrtf(sqrtf(x)) × x^(-0.1) — too complex.
	* Simpler: x^2.4 = (x^12)^(1/5) = fifth_root(x^12)
	* Best: just use xx sqrtf(cbrtf(xx)) since cbrtf is fast (~15 cycles) /
	float x2 = x * x;
	return x2 * sqrtf(cbrtf(x2)); /* x^2 × (x^2)^(1/6) ≈ x^(2+1/3) ≈ x^2.333 */
	}

	/*
	 * Squared reconstruction error of one block when quantized to 4 levels
	 * (0..3) over a range scaled by scale_mult.
	 *
	 * weights    - block_size values; weights[0] is always read.
	 * block_size - number of values in the block.
	 * scale_mult - multiplier applied to both the block minimum and the range.
	 * importance - optional per-element error weights (NULL = weight 1.0).
	 * imp_offset - index in importance of the block's first element.
	 *
	 * The minimum is clamped to at most 0 before scaling. Returns the weighted
	 * sum of squared differences, or 0 when the scaled range is below 1e-15.
	 */
	static float compute_block_error_q2k(const float *weights, int block_size,
					     float scale_mult,
					     const float *importance, int imp_offset)
	{
		float min_val = weights[0];
		float max_val = weights[0];
		int k;

		for (k = 1; k < block_size; k++) {
			const float w = weights[k];
			min_val = (w < min_val) ? w : min_val;
			max_val = (w > max_val) ? w : max_val;
		}
		/* An all-positive block still quantizes from zero. */
		if (min_val > 0) {
			min_val = 0;
		}

		float range = (max_val - min_val) * scale_mult;
		if (range < 1e-15f) {
			return 0.0f;
		}
		float inv_range = 3.0f / range;

		float total = 0.0f;
		for (k = 0; k < block_size; k++) {
			float x = weights[k];

			/* Round to the nearest level, then clamp into 0..3. */
			int q = (int)((x - min_val * scale_mult) * inv_range + 0.5f);
			q = (q < 0) ? 0 : ((q > 3) ? 3 : q);

			float deq = min_val * scale_mult + (float)q * range / 3.0f;
			float diff = x - deq;

			float w;
			if (importance) {
				w = importance[imp_offset + k];
			} else {
				w = 1.0f;
			}
			total += diff * diff * w;
		}
		return total;
	}

	/* Number of states per quhit site used by the sensitivity graph. */
	enum { SENS_QUHIT_DIM = 6 };

	/* Scale a 6-vector to unit L2 norm; it is left untouched when its squared
	 * norm is not above 1e-30. */
	static void sens_normalize_amplitudes(double amp[SENS_QUHIT_DIM])
	{
		double norm_sq = 0.0;
		for (int v = 0; v < SENS_QUHIT_DIM; v++)
			norm_sq += amp[v] * amp[v];
		if (norm_sq > 1e-30) {
			double inv = 1.0 / sqrt(norm_sq);
			for (int v = 0; v < SENS_QUHIT_DIM; v++)
				amp[v] *= inv;
		}
	}

	/* Store real amplitudes into one graph site's edge view (imaginary parts
	 * zero), mark the other views dirty and refresh the site's mask. */
	static void sens_write_site(HPCGraph *graph, int64_t site,
				    const double amp[SENS_QUHIT_DIM])
	{
		for (int v = 0; v < SENS_QUHIT_DIM; v++) {
			graph->locals[site].edge_re[v] = amp[v];
			graph->locals[site].edge_im[v] = 0.0;
		}
		graph->locals[site].primary = VIEW_EDGE;
		graph->locals[site].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
		graph->locals[site].delta_valid = 0;
		triality_update_mask(&graph->locals[site]);
	}

	/*
	 * Build the multi-quhit HPC sensitivity graph for one tensor.
	 *
	 * weights     - n_elements tensor values.
	 * n_elements  - number of values in weights.
	 * block_size  - values per block (must be > 0).
	 * temperature - Boltzmann temperature T used in exp(-error / 2T).
	 * importance  - optional per-element error weights, indexed like weights
	 *               (NULL = uniform).
	 *
	 * Returns a new graph created with hpc_create(), or NULL when block_size
	 * is not positive, weights is NULL, fewer than 2 blocks exist, or
	 * hpc_create() fails.
	 *
	 * At most 8192 blocks are sampled (every `stride`-th block).
	 *
	 * Graph layout: sites [0..2*n-1] where:
	 *   site 2*i     = coarse quhit for sampled block i
	 *   site 2*i + 1 = fine quhit for sampled block i
	 *
	 * Edges:
	 *   Intra-block: CZ(2i, 2i+1)         — coarse↔fine coupling
	 *   Inter-block: CZ(2i, 2(i+1))       — coarse↔coarse neighbor
	 *                CZ(2i+1, 2(i+1)+1)   — fine↔fine neighbor
	 *
	 * NOTE(review): errors are evaluated for all TOTAL_SCALE_CANDIDATES
	 * entries of SCALE_TABLE, and min_err is taken over all of them, but the
	 * 6×6 marginals below only read candidates 0..35. That looks like a
	 * leftover from a 36-candidate table — confirm the intended binning
	 * against the code that decodes the graph before changing it.
	 */
	static HPCGraph *build_sensitivity_graph(const float *weights,
						 int64_t n_elements,
						 int block_size,
						 float temperature,
						 const float *importance)
	{
		/* A non-positive block size would divide by zero below. */
		if (!weights || block_size <= 0) return NULL;

		int64_t n_blocks = n_elements / block_size;
		if (n_blocks < 2) return NULL;

		init_scale_table();

		int64_t graph_blocks = (n_blocks > 8192) ? 8192 : n_blocks;
		int64_t stride = n_blocks / graph_blocks;
		int64_t n_sites = graph_blocks * QUHITS_PER_BLOCK;

		HPCGraph *graph = hpc_create(n_sites);
		if (!graph) return NULL;

		for (int64_t i = 0; i < n_sites; i++)
			triality_dft(&graph->locals[i]);

		/* Compute errors for every scale candidate of each sampled block,
		 * then project onto coarse (quhit 0) and fine (quhit 1) marginals */
		for (int64_t i = 0; i < graph_blocks; i++) {
			const int64_t first_elem = i * stride * (int64_t)block_size;
			const float *block_weights = weights + first_elem;
			/* Pre-offset the importance pointer in 64-bit arithmetic so the
			 * int offset parameter cannot overflow on very large tensors. */
			const float *block_importance =
				importance ? importance + first_elem : NULL;

			float errors[TOTAL_SCALE_CANDIDATES];
			float min_err = 1e30f;
			for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) {
				errors[c] = compute_block_error_q2k(block_weights, block_size,
								    SCALE_TABLE[c],
								    block_importance, 0);
				if (errors[c] < min_err) min_err = errors[c];
			}

			/* Boltzmann weight of candidate (v0, v1) is
			 *   exp(-(error(v0*6+v1) - min_err) / 2T).
			 * coarse[v0] sums it over v1, fine[v1] sums it over v0; each
			 * weight is computed once and added to both marginals. */
			double coarse_re[SENS_QUHIT_DIM] = {0.0};
			double fine_re[SENS_QUHIT_DIM] = {0.0};
			for (int v0 = 0; v0 < SENS_QUHIT_DIM; v0++) {
				for (int v1 = 0; v1 < SENS_QUHIT_DIM; v1++) {
					int idx = v0 * SENS_QUHIT_DIM + v1;
					double w = exp(-(double)(errors[idx] - min_err) /
						       (2.0 * (double)temperature));
					coarse_re[v0] += w;
					fine_re[v1] += w;
				}
			}
			sens_normalize_amplitudes(coarse_re);
			sens_normalize_amplitudes(fine_re);

			sens_write_site(graph, 2 * i, coarse_re);     /* coarse quhit */
			sens_write_site(graph, 2 * i + 1, fine_re);   /* fine quhit */
		}

		/* ── Build edges ── */
		for (int64_t i = 0; i < graph_blocks; i++) {
			/* Intra-block: coarse ↔ fine coupling */
			hpc_cz(graph, 2 * i, 2 * i + 1);

			/* Inter-block: neighbor coupling */
			if (i + 1 < graph_blocks) {
				hpc_cz(graph, 2 * i, 2 * (i + 1)); /* coarse ↔ coarse */
				hpc_cz(graph, 2 * i + 1, 2 * (i + 1) + 1); /* fine ↔ fine */
			}
		}

		return graph;
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* MSE GRID SEARCH (ported from llm-compressor observers/mse.py)
	*
	* For a Q2_K sub-block, progressively shrink the min/max range to find
	* the candidate that minimizes weighted reconstruction error.
	*
	* for p in [1.0, 1.0 - 1/grid, 1.0 - 2/grid, ...] down to (1 - maxshrink):
	* candidate_min = p * min
	* candidate_max = p * max
	* error = \|\|x - quantize(x, candidate_min, candidate_max)\|\|^norm
	* if error < best: update best
	* else: patience--; if patience == 0: break
	*
	* This is a direct C port of llm-compressor's _grid_search_mse.
	* ═══════════════════════════════════════════════════════════════════════════ */

	/* Tunable parameters for the shrinking-range MSE grid search. */
	typedef struct {
	    /* Largest fraction by which the [min, max] range may be shrunk
	     * (0.0 to 1.0). */
	    float maxshrink;
	    /* Number of grid divisions; one step shrinks the range by 1/grid. */
	    int grid;
	    /* Early-stopping patience: consecutive non-improving steps allowed. */
	    int patience;
	    /* Exponent applied to the absolute error (2.0 = MSE, 2.4 = ...). */
	    float norm;
	} MSEGridConfig;

	/* Default search settings: shrink by at most 20% over a 200-division grid,
	 * give up after 8 non-improving steps, error exponent 2.4. */
	static const MSEGridConfig MSE_DEFAULT_CONFIG = {
	    .maxshrink = 0.20f,
	    .grid      = 200,
	    .patience  = 8,
	    .norm      = 2.4f
	};

	/* Grid search for the optimal (scale, min) of a Q2_K sub-block.
	 *
	 * The [min, max] range of the sub-block is shrunk step by step and each
	 * candidate range is scored by the (optionally importance-weighted) sum of
	 * |x - dequant(x)|^norm. The best candidate is then polished by three
	 * rounds of the iterative scale/min refinement.
	 *
	 * x          : n input weights (n >= 1). n must not exceed 256, the size
	 *              of the local scratch buffer.
	 * nmax       : highest quantization level (3 for Q2_K).
	 * L          : receives the n quantized levels, each in [0, nmax].
	 * out_min    : receives the negated sub-block minimum (the minimum itself
	 *              is clamped to <= 0, so the stored value is >= 0).
	 * importance : per-element error weights; may be NULL for uniform weights.
	 * cfg        : grid-search parameters.
	 *
	 * Returns the optimized scale, or 0 when the clamped range is empty. */
	static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax,
	                                          uint8_t *L, float *out_min,
	                                          const float *importance,
	                                          const MSEGridConfig *cfg)
	{
	    float min_val = x[0], max_val = x[0];
	    for (int i = 1; i < n; i++) {
	        if (x[i] < min_val) min_val = x[i];
	        if (x[i] > max_val) max_val = x[i];
	    }
	    /* Clamp the minimum to <= 0 BEFORE the degenerate-range test (same
	     * order as hpc_make_qkx2_quants). Otherwise a constant positive
	     * sub-block would return a negative *out_min with scale 0. */
	    if (min_val > 0) min_val = 0;
	    if (max_val == min_val) {
	        for (int i = 0; i < n; i++) L[i] = 0;
	        *out_min = -min_val;
	        return 0.0f;
	    }

	    float best_scale = 0.0f;
	    float best_min   = -min_val;
	    float best_error = 1e30f;
	    int   no_improve = 0;

	    int shrink_steps = (int)(cfg->maxshrink * cfg->grid);
	    if (shrink_steps < 1) shrink_steps = 1;

	    uint8_t tmp_L[256];   /* levels of the candidate currently being scored */

	    for (int step = 0; step <= shrink_steps; step++) {
	        float p = 1.0f - (float)step / (float)cfg->grid;

	        float cand_min = p * min_val;
	        float cand_max = p * max_val;

	        if (cand_max <= cand_min) continue;

	        float iscale = (float)nmax / (cand_max - cand_min);
	        float scale  = 1.0f / iscale;

	        /* Quantize and measure error */
	        float err = 0.0f;
	        for (int i = 0; i < n; i++) {
	            int l = gguf_nearest_int(iscale * (x[i] - cand_min));
	            if (l < 0) l = 0;
	            if (l > nmax) l = nmax;
	            tmp_L[i] = (uint8_t)l;

	            float deq  = cand_min + scale * (float)l;
	            float diff = fabsf(x[i] - deq);
	            /* Apply error norm — fast path for default norm=2.4 */
	            float e = diff;
	            if (cfg->norm == 2.4f) {
	                e = fast_pow_2_4(diff);
	            } else if (cfg->norm != 1.0f) {
	                e = powf(diff, cfg->norm);
	            }
	            /* Apply importance weighting */
	            if (importance) e *= importance[i];
	            err += e;
	        }

	        if (err < best_error) {
	            best_error = err;
	            best_scale = scale;
	            best_min   = -cand_min;
	            memcpy(L, tmp_L, (size_t)n);
	            no_improve = 0;
	        } else {
	            no_improve++;
	            if (no_improve >= cfg->patience) break;
	        }
	    }

	    /* Iterative refinement on the best candidate (from ggml) */
	    float cur_min   = -best_min;
	    float cur_scale = best_scale;
	    if (cur_scale > 1e-15f) {
	        float iscale = 1.0f / cur_scale;
	        for (int itry = 0; itry < 3; itry++) {
	            float sumlx = 0;
	            int   suml2 = 0;
	            for (int i = 0; i < n; i++) {
	                int l = gguf_nearest_int(iscale * (x[i] - cur_min));
	                if (l < 0) l = 0;
	                if (l > nmax) l = nmax;
	                L[i] = (uint8_t)l;
	                sumlx += (x[i] - cur_min) * l;
	                suml2 += l * l;
	            }
	            if (suml2 > 0) cur_scale = sumlx / suml2;
	            float sum = 0;
	            for (int i = 0; i < n; i++)
	                sum += x[i] - cur_scale * L[i];
	            /* Damped update of the minimum; it is kept <= 0. */
	            cur_min = 0.7f * cur_min + 0.3f * sum / n;
	            if (cur_min > 0) cur_min = 0;
	            if (cur_scale > 1e-15f) iscale = 1.0f / cur_scale;
	        }
	    }

	    *out_min = -cur_min;
	    return cur_scale;
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* HPC Q2_K QUANTIZATION — GGML-QUALITY + HPC REFINEMENT
	*
	* Two-phase approach:
	* Phase A: Per-sub-block weighted least-squares (ggml make_qkx2_quants)
	* This produces per-sub-block (scale, min) with 16-step search.
	* Phase B: HPC BP refines the superblock-level d/dmin rounding.
	* 6 candidate (d, dmin) pairs are tested; BP finds the one
	* where the GLOBAL reconstruction error is minimized via
	* constructive interference of per-sub-block phase coherence.
	* ═══════════════════════════════════════════════════════════════════════════ */

	/* Weighted least-squares quantization for a sub-block (ggml make_qkx2_quants).
	 *
	 * Starts from the plain min/max quantization, then tries 16 candidate
	 * inverse scales; for each one it solves the weighted least-squares problem
	 * for (scale, min) and keeps the candidate with the lowest weighted mean
	 * absolute deviation.
	 *
	 * n, nmax : number of weights and highest quantization level.
	 * x       : the n input weights.
	 * w       : the n per-element error weights.
	 * L       : receives the n chosen levels, each in [0, nmax].
	 * the_min : receives the negative of the optimal min (min is kept <= 0).
	 * Laux    : scratch buffer of n bytes for candidate levels.
	 *
	 * Returns the scale (0 when the clamped range is empty). */
	static float hpc_make_qkx2_quants(int n, int nmax, const float *x,
	                                  const float *w, uint8_t *L,
	                                  float *the_min, uint8_t *Laux)
	{
	    float xmin = x[0], xmax = x[0];
	    float sum_w = w[0], sum_x = w[0] * x[0];
	    for (int i = 1; i < n; i++) {
	        if (x[i] < xmin) xmin = x[i];
	        if (x[i] > xmax) xmax = x[i];
	        sum_w += w[i];
	        sum_x += w[i] * x[i];
	    }
	    if (xmin > 0) xmin = 0;
	    if (xmax == xmin) {
	        for (int i = 0; i < n; i++) L[i] = 0;
	        *the_min = -xmin;
	        return 0.0f;
	    }

	    /* Baseline: plain min/max quantization and its weighted abs. error. */
	    float iscale = (float)nmax / (xmax - xmin);
	    float scale = 1.0f / iscale;
	    float best_mad = 0;
	    for (int i = 0; i < n; i++) {
	        int l = gguf_nearest_int(iscale * (x[i] - xmin));
	        if (l < 0) l = 0;
	        if (l > nmax) l = nmax;
	        L[i] = (uint8_t)l;
	        float diff = scale * (float)l + xmin - x[i];
	        best_mad += w[i] * fabsf(diff);
	    }

	    /* 16 candidate iscale values: search [-0.5, -0.5 + 0.1*15] + nmax */
	    for (int is = 0; is <= 15; is++) {
	        float try_iscale = (-0.5f + 0.1f * (float)is + (float)nmax) / (xmax - xmin);
	        float sl = 0, sl2 = 0, sxl = 0;
	        for (int i = 0; i < n; i++) {
	            int l = gguf_nearest_int(try_iscale * (x[i] - xmin));
	            if (l < 0) l = 0;
	            if (l > nmax) l = nmax;
	            Laux[i] = (uint8_t)l;
	            sl  += w[i] * (float)l;
	            sl2 += w[i] * (float)(l * l);
	            sxl += w[i] * (float)l * x[i];
	        }
	        /* Determinant of the 2x2 normal equations; skip singular cases. */
	        float det = sum_w * sl2 - sl * sl;
	        if (det > 0) {
	            float this_scale = (sum_w * sxl - sum_x * sl) / det;
	            float this_min   = (sl2 * sum_x - sl * sxl) / det;
	            if (this_min > 0) {
	                /* A positive min is not allowed: pin it to 0 and re-solve
	                 * for the scale alone. */
	                this_min = 0;
	                this_scale = sxl / sl2;
	            }
	            float mad = 0;
	            for (int i = 0; i < n; i++) {
	                float diff = this_scale * (float)Laux[i] + this_min - x[i];
	                mad += w[i] * fabsf(diff);
	            }
	            if (mad < best_mad) {
	                for (int i = 0; i < n; i++) L[i] = Laux[i];
	                best_mad = mad;
	                scale = this_scale;
	                xmin = this_min;
	            }
	        }
	    }
	    *the_min = -xmin;
	    return scale;
	}

	/* Quantize the scale/min arrays into 4-bit values: make_qp_quants equivalent.
	 *
	 * n, nmax : number of values and highest quantization level.
	 * x       : the n values to quantize; values that round below 0 clamp to 0.
	 * L       : receives the n chosen levels, each in [0, nmax].
	 * sw      : the n per-value error weights.
	 *
	 * Returns the optimal d such that x[j] ≈ d × L[j] (0 when max(x) is
	 * effectively zero or every level ends up 0). */
	static float hpc_make_qp_quants(int n, int nmax, const float *x,
	                                uint8_t *L, const float *sw)
	{
	    float xmax = 0;
	    for (int i = 0; i < n; i++)
	        if (x[i] > xmax) xmax = x[i];
	    if (xmax < 1e-15f) {
	        for (int i = 0; i < n; i++) L[i] = 0;
	        return 0.0f;
	    }
	    float iscale = (float)nmax / xmax;
	    for (int i = 0; i < n; i++) {
	        int l = gguf_nearest_int(iscale * x[i]);
	        if (l < 0) l = 0;
	        if (l > nmax) l = nmax;
	        L[i] = (uint8_t)l;
	    }
	    float scale = 1.0f / iscale;
	    float best_mse = 0;
	    for (int i = 0; i < n; i++) {
	        float diff = x[i] - scale * (float)L[i];
	        best_mse += sw[i] * diff * diff;
	    }
	    /* Try 8 perturbed inverse scales around nmax/xmax; keep the best. */
	    for (int is = -4; is <= 4; is++) {
	        if (is == 0) continue;
	        float iscale_is = (0.1f * (float)is + (float)nmax) / xmax;
	        float scale_is = 1.0f / iscale_is;
	        float mse = 0;
	        for (int i = 0; i < n; i++) {
	            int l = gguf_nearest_int(iscale_is * x[i]);
	            if (l < 0) l = 0;
	            if (l > nmax) l = nmax;
	            float diff = x[i] - scale_is * (float)l;
	            mse += sw[i] * diff * diff;
	        }
	        if (mse < best_mse) {
	            best_mse = mse;
	            iscale = iscale_is;
	        }
	    }
	    /* Recompute with best iscale + iterative refinement */
	    float sumlx = 0, suml2 = 0;
	    for (int i = 0; i < n; i++) {
	        int l = gguf_nearest_int(iscale * x[i]);
	        if (l < 0) l = 0;
	        if (l > nmax) l = nmax;
	        L[i] = (uint8_t)l;
	        sumlx += sw[i] * x[i] * (float)l;
	        suml2 += sw[i] * (float)(l * l);
	    }
	    /* Iterative greedy refinement: re-pick one level at a time and accept
	     * the change only if sumlx^2 / suml2 increases. */
	    for (int itry = 0; itry < 5; itry++) {
	        int n_changed = 0;
	        for (int i = 0; i < n; i++) {
	            float wi = sw[i];
	            /* Sums with element i's contribution removed. */
	            float slx = sumlx - wi * x[i] * (float)L[i];
	            float sl2 = suml2 - wi * (float)(L[i] * L[i]);
	            if (slx > 0 && sl2 > 0) {
	                int new_l = gguf_nearest_int(x[i] * sl2 / slx);
	                if (new_l < 0) new_l = 0;
	                if (new_l > nmax) new_l = nmax;
	                if (new_l != L[i]) {
	                    slx += wi * x[i] * (float)new_l;
	                    sl2 += wi * (float)(new_l * new_l);
	                    if (slx * slx * suml2 > sumlx * sumlx * sl2) {
	                        L[i] = (uint8_t)new_l;
	                        sumlx = slx;
	                        suml2 = sl2;
	                        n_changed++;
	                    }
	                }
	            }
	        }
	        if (!n_changed) break;
	    }
	    return suml2 > 0 ? sumlx / suml2 : 0.0f;
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* SHOR'S GRIFFITHS-NIU SEQUENTIAL MEASUREMENT FOR RMSE OPTIMIZATION
	* (Ported 1:1 from tesseract_factor.c — replaces BP)
	*
	* Instead of iterative message-passing (BP), this uses the EXACT sequential
	* measurement protocol from Shor's algorithm:
	*
	* For each block k (MSB → LSB):
	* 1. Compute feed-forward phase correction from previously measured blocks
	* 2. Compute work factor: C_k(d) = Π_j Σ_w local_j(w) × edge(d,w)
	* 3. Bake C_k into locals: α(d) *= C_k(d)
	* 4. Apply phase correction: α(d) *= e^{-2πi d θ_k}
	* 5. Apply IDFT6 in-place: interference creates peaks at optimal scales
	* 6. Born rule measurement → select optimal scale candidate
	* 7. Collapse site + absorb edge weights into neighbors (back-action)
	*
	* This IS the quantum Fourier transform that creates constructive
	* interference at the optimal RMSE configuration, exactly as Shor's
	* algorithm creates interference at the correct period.
	*
	* Domain mapping:
	* Factoring: oracle phase 2π×d×c_k/N → period r
	* Quantize: error Boltzmann amplitudes → optimal RMSE block
	* ═══════════════════════════════════════════════════════════════════════════ */

	/* Sixth roots of unity ω₆^k, k = 0..5, split into real and imaginary
	 * parts; used for CZ phase lookup. */
	static const double W6_RE[6] = {
	     1.0,   /* k = 0 */
	     0.5,   /* k = 1 */
	    -0.5,   /* k = 2 */
	    -1.0,   /* k = 3 */
	    -0.5,   /* k = 4 */
	     0.5    /* k = 5 */
	};
	static const double W6_IM[6] = {
	     0.0,
	     0.866025403784438647,
	     0.866025403784438647,
	     0.0,
	    -0.866025403784438647,
	    -0.866025403784438647
	};
	/* Normalization factor 1/√6 for the 6-point (I)DFT. */
	static const double INV_SQRT6 = 0.40824829046386301637;

	/* ── Collapse + Back-Action core (ported from tesseract_factor.c) ──
	 * After sampling an outcome, collapse the target site to |outcome⟩,
	 * absorb all edge weights into neighbor local states (Magic Pointer
	 * disentanglement), and remove dead edges from the graph.
	 *
	 * This is the EXACT same back-action protocol used in Shor's algorithm
	 * for the semi-classical QFT: measurement of one site conditions all
	 * remaining sites through the CZ phase correlations.
	 *
	 * graph       : graph to modify in place (locals, adjacency lists, edges).
	 * target_site : index of the site being collapsed.
	 * outcome     : measured basis state, expected in 0..5. */
	static void shor_collapse_site(HPCGraph *graph, int target_site, int outcome)
	{
	    /* Step 1: Collapse local state to |outcome⟩ */
	    for (int v = 0; v < 6; v++) {
	        graph->locals[target_site].edge_re[v] = (v == outcome) ? 1.0 : 0.0;
	        graph->locals[target_site].edge_im[v] = 0.0;
	    }
	    graph->locals[target_site].primary = VIEW_EDGE;
	    graph->locals[target_site].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
	    graph->locals[target_site].delta_valid = 0;

	    /* Step 2: Absorb edge weights into neighbor states (back-action).
	     * For each edge (target, neighbor), the weight w(outcome, d) for each
	     * neighbor basis state d gets multiplied into the neighbor's amplitude.
	     * This is the Magic Pointer disentanglement from tesseract_factor.c. */
	    HPCAdjList *adj = &graph->adj[target_site];
	    for (uint64_t ei = 0; ei < adj->count; ei++) {
	        uint64_t eid = adj->edge_ids[ei];
	        HPCEdge *edge = &graph->edges[eid];
	        uint64_t partner = (edge->site_a == (uint64_t)target_site) ?
	                           edge->site_b : edge->site_a;

	        TrialityQuhit *pq = &graph->locals[partner];
	        for (int d = 0; d < 6; d++) {
	            double w_re, w_im;
	            if (edge->type == HPC_EDGE_CZ) {
	                /* CZ edge: phase is the (outcome*d mod 6)-th root of unity. */
	                int pidx = (outcome * d) % 6;
	                w_re = HPC_W6_RE[pidx];
	                w_im = HPC_W6_IM[pidx];
	            } else {
	                /* Weighted phase edge: first index belongs to site_a. */
	                if (edge->site_a == (uint64_t)target_site) {
	                    w_re = edge->w_re[outcome][d];
	                    w_im = edge->w_im[outcome][d];
	                } else {
	                    w_re = edge->w_re[d][outcome];
	                    w_im = edge->w_im[d][outcome];
	                }
	            }
	            /* Complex multiply: neighbor amplitude *= w. */
	            double old_re = pq->edge_re[d], old_im = pq->edge_im[d];
	            pq->edge_re[d] = old_re * w_re - old_im * w_im;
	            pq->edge_im[d] = old_re * w_im + old_im * w_re;
	        }
	        pq->dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
	        pq->delta_valid = 0;
	    }

	    /* Step 3: Remove edges touching this site from the graph.
	     * Mark by setting fidelity to -1 and remove from adj lists. */
	    for (uint64_t ei = 0; ei < adj->count; ei++) {
	        uint64_t eid = adj->edge_ids[ei];
	        HPCEdge *edge = &graph->edges[eid];
	        uint64_t partner = (edge->site_a == (uint64_t)target_site) ?
	                           edge->site_b : edge->site_a;

	        /* Remove this edge from partner's adj list (swap with last). */
	        HPCAdjList *padj = &graph->adj[partner];
	        for (uint64_t pi = 0; pi < padj->count; pi++) {
	            if (padj->edge_ids[pi] == eid) {
	                padj->edge_ids[pi] = padj->edge_ids[--padj->count];
	                break;
	            }
	        }
	        edge->fidelity = -1.0; /* Mark as dead */
	    }
	    adj->count = 0; /* Clear target's adj list */
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* SHOR SEQUENTIAL MEASUREMENT — Griffiths-Niu Protocol for Quantization
	*
	* Ported 1:1 from tesseract_factor.c lines 2343-2500.
	*
	* Measures sites MSB→LSB. For each site k:
	* 1. Compute feed-forward phase correction θ_k from previously measured sites
	* 2. Compute neighbor contribution C_k(d) analytically
	* 3. Bake C_k into locals
	* 4. Apply phase correction: α(d) *= e^{-2πi d θ_k}
	* 5. Apply IDFT6: β(v) = (1/√6) Σ_d α'(d) × e^{2πi dv/6}
	* 6. Compute |β(v)|² as measurement probabilities
	* 7. Sample/argmax → outcome
	* 8. Collapse + back-action via shor_collapse_site()
	*
	* Returns: marginals are written into marg_out[n_sites][6].
	* measured_out[n_sites] receives the measurement outcomes.
	* ═══════════════════════════════════════════════════════════════════════════ */
	/* Sequentially measure every site of the graph, last site first.
	 *
	 * graph         : graph to measure; modified in place (local amplitudes are
	 *                 overwritten and edges are removed by shor_collapse_site).
	 * n_sites       : number of sites to measure (indices 0 .. n_sites-1).
	 * marg_out      : marg_out[k][v] receives the normalized probability of
	 *                 outcome v at site k.
	 * measured_out  : measured_out[k] receives the selected outcome (0..5).
	 * deterministic : non-zero selects the argmax outcome; zero draws a sample
	 *                 from the probabilities using a function-local static
	 *                 LCG (state persists across calls). */
	static void shor_measure_graph(HPCGraph *graph, int64_t n_sites,
	                               double (*marg_out)[6], int *measured_out,
	                               int deterministic)
	{
	    /* Measure sites from last to first (MSB→LSB, same as Griffiths-Niu) */
	    for (int64_t k = n_sites - 1; k >= 0; k--) {
	        int site_k = (int)k;

	        /* Step 1: Compute feed-forward phase correction from previously
	         * measured sites. The QFT phase is 2π F x / 6^n. For site k,
	         * the fractional phase from previously measured site j (j > k)
	         * is measured_out[j] / 6^{j-k+1}.
	         * Power MUST start at 36.0 (6^2) for the immediately previous site. */
	        double theta_k = 0.0;
	        {
	            double power = 36.0;
	            for (int64_t j = k + 1; j < n_sites; j++) {
	                theta_k += (double)measured_out[j] / power;
	                power *= 6.0;
	            }
	        }

	        /* Step 2: Compute neighbor contribution C_k(d) analytically.
	         * C_k(d) = Π_neighbor Σ_{w=0}^{5} local_neighbor(w) × edge_weight(d, w)
	         * Each neighbor is independent (product state). */
	        double ck_re[6], ck_im[6];
	        for (int d = 0; d < 6; d++) { ck_re[d] = 1.0; ck_im[d] = 0.0; }

	        const HPCAdjList *adj = &graph->adj[site_k];
	        for (uint64_t ei = 0; ei < adj->count; ei++) {
	            uint64_t eid = adj->edge_ids[ei];
	            const HPCEdge *edge = &graph->edges[eid];
	            if (edge->fidelity < 0.0) continue; /* Skip dead edges */
	            uint64_t partner = (edge->site_a == (uint64_t)site_k) ?
	                               edge->site_b : edge->site_a;

	            const TrialityQuhit *pq = &graph->locals[partner];
	            for (int d = 0; d < 6; d++) {
	                double sr = 0, si = 0;
	                for (int w = 0; w < 6; w++) {
	                    double lr = pq->edge_re[w], li = pq->edge_im[w];
	                    double wr, wi;
	                    if (edge->type == HPC_EDGE_CZ) {
	                        int pidx = (d * w) % 6;
	                        wr = HPC_W6_RE[pidx]; wi = HPC_W6_IM[pidx];
	                    } else if (edge->site_a == (uint64_t)site_k) {
	                        wr = edge->w_re[d][w]; wi = edge->w_im[d][w];
	                    } else {
	                        wr = edge->w_re[w][d]; wi = edge->w_im[w][d];
	                    }
	                    /* Accumulate local(w) × weight (complex product). */
	                    sr += lr * wr - li * wi;
	                    si += lr * wi + li * wr;
	                }
	                /* C_k(d) *= neighbor sum (complex product). */
	                double nr = ck_re[d] * sr - ck_im[d] * si;
	                double ni = ck_re[d] * si + ck_im[d] * sr;
	                ck_re[d] = nr; ck_im[d] = ni;
	            }
	        }

	        /* Step 3: Bake C_k(d) into locals: α(d) *= C_k(d) */
	        for (int d = 0; d < 6; d++) {
	            double re = graph->locals[site_k].edge_re[d];
	            double im = graph->locals[site_k].edge_im[d];
	            graph->locals[site_k].edge_re[d] = re * ck_re[d] - im * ck_im[d];
	            graph->locals[site_k].edge_im[d] = re * ck_im[d] + im * ck_re[d];
	        }

	        /* Step 4: Apply feed-forward phase correction to locals. */
	        for (int d = 0; d < 6; d++) {
	            double angle = -2.0 * 3.14159265358979323846 * d * theta_k;
	            double pr = cos(angle), pi2 = sin(angle);
	            double re = graph->locals[site_k].edge_re[d];
	            double im = graph->locals[site_k].edge_im[d];
	            graph->locals[site_k].edge_re[d] = re * pr - im * pi2;
	            graph->locals[site_k].edge_im[d] = re * pi2 + im * pr;
	        }

	        /* Step 5: Apply IDFT6 in-place: phase basis → computational basis.
	         * β(v) = (1/√6) Σ_{d=0}^{5} α'(d) × e^{2πi d v / 6}
	         * C_k(d) is INSIDE the coherent sum — THIS creates interference
	         * peaks at the optimal RMSE configuration, exactly as Shor's
	         * algorithm creates peaks at the correct period. */
	        {
	            /* Snapshot the inputs: the transform overwrites the locals. */
	            double alpha_re[6], alpha_im[6];
	            for (int d = 0; d < 6; d++) {
	                alpha_re[d] = graph->locals[site_k].edge_re[d];
	                alpha_im[d] = graph->locals[site_k].edge_im[d];
	            }
	            for (int v = 0; v < 6; v++) {
	                double sum_re = 0.0, sum_im = 0.0;
	                for (int d = 0; d < 6; d++) {
	                    double angle = 2.0 * 3.14159265358979323846 * d * v / 6.0;
	                    double er = cos(angle), ei = sin(angle);
	                    sum_re += alpha_re[d] * er - alpha_im[d] * ei;
	                    sum_im += alpha_re[d] * ei + alpha_im[d] * er;
	                }
	                graph->locals[site_k].edge_re[v] = sum_re * INV_SQRT6;
	                graph->locals[site_k].edge_im[v] = sum_im * INV_SQRT6;
	            }
	        }

	        /* Step 6: Compute marginals from |local(v)|² */
	        double probs[6];
	        double total = 0.0;
	        for (int v = 0; v < 6; v++) {
	            probs[v] = graph->locals[site_k].edge_re[v] * graph->locals[site_k].edge_re[v] +
	                       graph->locals[site_k].edge_im[v] * graph->locals[site_k].edge_im[v];
	            total += probs[v];
	        }
	        if (total > 1e-30) {
	            for (int v = 0; v < 6; v++) probs[v] /= total;
	        } else {
	            /* Vanishing amplitude: fall back to a uniform distribution. */
	            for (int v = 0; v < 6; v++) probs[v] = 1.0 / 6.0;
	        }

	        /* Store marginals for downstream beam search */
	        for (int v = 0; v < 6; v++)
	            marg_out[k][v] = probs[v];

	        /* Step 7: Select outcome — deterministic argmax for quantization
	         * (unlike factoring which uses Born sampling for probabilistic
	         * period recovery, quantization wants the MAP estimate) */
	        int outcome;
	        if (deterministic) {
	            outcome = 0;
	            double max_p = probs[0];
	            for (int v = 1; v < 6; v++) {
	                if (probs[v] > max_p) { max_p = probs[v]; outcome = v; }
	            }
	        } else {
	            /* Born sampling (for multi-shot refinement) */
	            static unsigned int shor_rng = 271828;
	            shor_rng = shor_rng * 1664525u + 1013904223u;
	            double r01 = (double)(shor_rng >> 8) / 16777216.0;
	            double cumul = 0.0;
	            outcome = 5;
	            for (int v = 0; v < 6; v++) {
	                cumul += probs[v];
	                if (r01 <= cumul) { outcome = v; break; }
	            }
	        }

	        measured_out[k] = outcome;

	        /* Step 8: Collapse + back-action — absorb edge weights into
	         * neighbor locals (Magic Pointer disentanglement) */
	        shor_collapse_site(graph, site_k, outcome);
	    }
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* HPC-OPTIMIZED Q4_0 QUANTIZATION (for attention tensors)
	*
	* Same architecture as Q2_K HPC pipeline, but simpler:
	* - One parameter per block (scale d only, no dmin)
	* - Single quhit per block (6 states)
	* - 16 candidate scales (Q4_N_CAND) → bin to 6 for BP
	* - 24-beam (Q4_N_BEAMS) Hensel search for globally optimal configuration
	* - Triality 3-view marginals for robust scoring
	*
	* Q4_0 block: 32 weights, 16 levels (0–15), dequant: w = (q - 8) * d
	* ═══════════════════════════════════════════════════════════════════════════ */

	#define Q4_N_CAND 16  /* scale candidates for Q4_0 (was 10) */
	#define Q4_N_BEAMS 24 /* beam width (was 12) */

	/* Scale multipliers applied to the WLS optimum to build the candidate
	 * set: a tight neighborhood of ±10%, in ascending order. */
	static const float Q4_NEIGHBOR_MULTS[Q4_N_CAND] = {
	    0.900f, 0.915f, 0.930f, 0.945f,
	    0.955f, 0.965f, 0.975f, 0.985f,
	    0.995f, 1.005f, 1.015f, 1.025f,
	    1.035f, 1.050f, 1.070f, 1.100f
	};

	/* Maps each candidate index to one of the six quhit states (bins). */
	static const int Q4_CAND_TO_QUHIT[Q4_N_CAND] = {
	    0, 0, 0,   /* candidates  0-2  */
	    1, 1, 1,   /* candidates  3-5  */
	    2, 2,      /* candidates  6-7  */
	    3, 3, 3,   /* candidates  8-10 */
	    4, 4, 4,   /* candidates 11-13 */
	    5, 5       /* candidates 14-15 */
	};

	static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
	BlockQ4_0 output, float out_total_error,
	const float *imat_importance, int verbose)
	{
	int64_t n_blocks = n_elements / QK4_0;
	float total_err = 0.0f;

	/* ── Phase 1: Greedy seed — compute scale per block ── */
	float greedy_d = (float )calloc(n_blocks, sizeof(float));

	#pragma omp parallel for schedule(dynamic, 64)
	for (int64_t blk = 0; blk < n_blocks; blk++) {
	const float bw = weights + blk QK4_0;
	float amax = 0.0f;
	for (int j = 0; j < QK4_0; j++) {
	float av = fabsf(bw[j]);
	if (av > amax) amax = av;
	}
	greedy_d[blk] = amax / 7.0f;
	}

	/* ── Phase 2: WLS-Optimal Candidate Generation for Q4_0 ──
	* First find the true optimal d* via 3-iteration WLS,
	* then generate candidates centered on d* with tight spacing. */
	float (cand_errors)[Q4_N_CAND] = (float ()[Q4_N_CAND])
	calloc(n_blocks, sizeof(float[Q4_N_CAND]));
	uint16_t (cand_d16)[Q4_N_CAND] = (uint16_t ()[Q4_N_CAND])
	calloc(n_blocks, sizeof(uint16_t[Q4_N_CAND]));

	for (int64_t blk = 0; blk < n_blocks; blk++) {
	const float bw = weights + blk QK4_0;

	/* ── Step 2a: WLS solve to find optimal d* ── */
	float wls_d = greedy_d[blk];
	for (int ls_iter = 0; ls_iter < 3; ls_iter++) {
	if (wls_d < 1e-15f) break;
	float inv_d = 1.0f / wls_d;
	float num = 0.0f, den = 0.0f;
	for (int j = 0; j < QK4_0; j++) {
	int q = (int)(bw[j] * inv_d + 8.5f);
	if (q < 0) q = 0; if (q > 15) q = 15;
	float qc = (float)q - 8.0f;
	float w = (imat_importance) ?
	imat_importance[blk * QK4_0 + j] : 1.0f;
	num += w * bw[j] * qc;
	den += w * qc * qc;
	}
	if (den > 1e-15f) {
	float d_new = num / den;
	if (fabsf(d_new) < 4.0f * (greedy_d[blk] + 1e-10f))
	wls_d = gguf_fp16_to_fp32(gguf_fp32_to_fp16(d_new));
	}
	}

	/* ── Step 2b: Generate candidates centered on WLS optimum ── */
	for (int ci = 0; ci < Q4_N_CAND; ci++) {
	float trial_d = wls_d * Q4_NEIGHBOR_MULTS[ci];
	uint16_t d16 = gguf_fp32_to_fp16(trial_d);
	float actual_d = gguf_fp16_to_fp32(d16);
	cand_d16[blk][ci] = d16;

	float id = (actual_d > 1e-15f) ? 1.0f / actual_d : 0.0f;
	float err = 0.0f;

	for (int j = 0; j < QK4_0; j += 6) {
	int g_len = (j + 6 <= QK4_0) ? 6 : (QK4_0 - j);
	int half_g = g_len / 2;
	float e_cur[6], w_cur[6];

	for (int kk = 0; kk < g_len; kk++) {
	int idx = j + kk;
	float x = bw[idx];
	int q = (int)(x * id + 8.5f);
	if (q < 0) q = 0; if (q > 15) q = 15;
	float deq = ((float)q - 8.0f) * actual_d;
	e_cur[kk] = x - deq;
	w_cur[kk] = (imat_importance) ? imat_importance[blk * QK4_0 + idx] : 1.0f;
	}

	/* Decompose into vesica (DC) and wave (AC) components */
	float vesica_err = 0.0f, wave_err = 0.0f;
	for (int p = 0; p < half_g; p++) {
	float v = e_cur[p] + e_cur[p + half_g];
	float w_wave = e_cur[p] - e_cur[p + half_g];
	float w_avg = (w_cur[p] + w_cur[p + half_g]) * 0.5f;
	vesica_err += v * v * w_avg;
	wave_err += w_wave * w_wave * w_avg;
	}
	/* Triality weighting: penalize vesica 4×, wave 1×.
	* Factor of 0.5 keeps scale consistent with standard MSE. */
	err += 0.5f * (4.0f * vesica_err + 1.0f * wave_err);
	}
	cand_errors[blk][ci] = err;
	}
	}

	/* ── Phase 3: HPC graph — single quhit per block ── */
	int best_candidate = (int )malloc(n_blocks * sizeof(int));
	for (int64_t i = 0; i < n_blocks; i++)
	best_candidate[i] = 10; /* Q4_NEIGHBOR_MULTS[10] = 1.00 */

	if (n_blocks >= 2) {
	float temperature = 0.5f;
	int64_t graph_blocks = (n_blocks > 200) ? 200 : n_blocks;
	int64_t stride = n_blocks / graph_blocks;
	int64_t n_sites = graph_blocks; /* 1 quhit per block */

	HPCGraph *graph = hpc_create(n_sites);
	if (graph) {
	for (int64_t i = 0; i < n_sites; i++)
	triality_dft(&graph->locals[i]);

	/* Adaptive temperature from error landscape */
	{
	double err_accum = 0.0;
	int err_count = 0;
	for (int64_t gi = 0; gi < graph_blocks && gi < 100; gi++) {
	int64_t blk = gi * stride;
	float max_e = 0.0f;
	for (int c = 0; c < Q4_N_CAND; c++)
	if (cand_errors[blk][c] > max_e)
	max_e = cand_errors[blk][c];
	err_accum += (double)max_e;
	err_count++;
	}
	if (err_count > 0) {
	temperature = (float)(err_accum / err_count) * 0.1f;
	if (temperature < 1e-10f) temperature = 1e-10f;
	}
	}

	/* Encode stride-group AGGREGATED candidate errors as Boltzmann amplitudes */
	for (int64_t i = 0; i < graph_blocks; i++) {
	/* Aggregate errors across stride group */
	float agg_errors[Q4_N_CAND];
	for (int c = 0; c < Q4_N_CAND; c++)
	agg_errors[c] = 0.0f;

	int64_t blk_start = i * stride;
	int64_t blk_end = blk_start + stride;
	if (blk_end > n_blocks) blk_end = n_blocks;
	int64_t group_size = blk_end - blk_start;

	for (int64_t b = blk_start; b < blk_end; b++) {
	for (int c = 0; c < Q4_N_CAND; c++)
	agg_errors[c] += cand_errors[b][c];
	}
	if (group_size > 1) {
	float inv_gs = 1.0f / (float)group_size;
	for (int c = 0; c < Q4_N_CAND; c++)
	agg_errors[c] *= inv_gs;
	}

	float min_err = 1e30f;
	for (int c = 0; c < Q4_N_CAND; c++)
	if (agg_errors[c] < min_err)
	min_err = agg_errors[c];

	double amp_re[6];
	double amp_norm = 0.0;
	for (int qi = 0; qi < 6; qi++) amp_re[qi] = 0.0;
	for (int ci = 0; ci < Q4_N_CAND; ci++) {
	int qi = Q4_CAND_TO_QUHIT[ci];
	amp_re[qi] += exp(-(double)(agg_errors[ci] - min_err) /
	(2.0 * (double)temperature));
	}
	for (int qi = 0; qi < 6; qi++)
	amp_norm += amp_re[qi] * amp_re[qi];
	if (amp_norm > 1e-30) {
	double inv = 1.0 / sqrt(amp_norm);
	for (int v = 0; v < 6; v++) amp_re[v] *= inv;
	}

	for (int v = 0; v < 6; v++) {
	graph->locals[i].edge_re[v] = amp_re[v];
	graph->locals[i].edge_im[v] = 0.0;
	}
	graph->locals[i].primary = VIEW_EDGE;
	graph->locals[i].dirty = DIRTY_VERTEX \| DIRTY_DIAGONAL \| DIRTY_FOLDED;
	graph->locals[i].delta_valid = 0;
	triality_update_mask(&graph->locals[i]);
	}

	/* Neighbor edges */
	for (int64_t i = 0; i < graph_blocks - 1; i++)
	hpc_cz(graph, i, i + 1);

	/* ── Shor's Griffiths-Niu Sequential Measurement ──
	* Replaces BP with exact marginals via IDFT6 + feed-forward +
	* collapse/back-action (ported 1:1 from tesseract_factor.c).
	* Single pass, no iteration, no message damping. */
	double (marg)[6] = (double ()[6])calloc(graph_blocks, sizeof(double[6]));
	int shor_measured = (int )calloc(graph_blocks, sizeof(int));

	shor_measure_graph(graph, graph_blocks, marg, shor_measured, 1);

	free(shor_measured);

	/* Beam search over candidates */
	typedef struct { double acc_error; int history_idx; } Q4Beam;
	typedef struct { int cand_idx; int parent_idx; } Q4BeamHistory;

	Q4Beam beams[Q4_N_BEAMS];
	int active_beams = 1;
	Q4BeamHistory history = (Q4BeamHistory )malloc(n_blocks * Q4_N_BEAMS * sizeof(Q4BeamHistory));

	for (int b = 0; b < Q4_N_BEAMS; b++) {
	beams[b].acc_error = 0.0;
	beams[b].history_idx = -1;
	}

	for (int64_t i = 0; i < graph_blocks; i++) {
	double m_total = 0.0;
	for (int v = 0; v < 6; v++) m_total += marg[i][v];

	double cand_score[Q4_N_CAND];
	int64_t blk = i * stride;
	/* Count candidates per quhit bin for normalization */
	int q4_bin_count[6] = {0};
	for (int ci = 0; ci < Q4_N_CAND; ci++)
	q4_bin_count[Q4_CAND_TO_QUHIT[ci]]++;
	/* Per-block error normalization: divide by block mean error
	* so small-weight blocks don't dominate beam selection */
	float blk_mean_err = 0.0f;
	for (int ci = 0; ci < Q4_N_CAND; ci++)
	blk_mean_err += cand_errors[blk][ci];
	blk_mean_err /= (float)Q4_N_CAND;
	if (blk_mean_err < 1e-30f) blk_mean_err = 1e-30f;
	for (int ci = 0; ci < Q4_N_CAND; ci++) {
	int qi = Q4_CAND_TO_QUHIT[ci];
	double p = (m_total > 1e-30) ? marg[i][qi] / m_total : 1.0/6.0;
	p /= (double)q4_bin_count[qi]; /* normalize by bin occupancy */
	cand_score[ci] = p / (cand_errors[blk][ci] / blk_mean_err + 1e-15);
	}

	typedef struct { double score; int beam_idx; int cand_idx; } Q4Ext;
	Q4Ext extensions[Q4_N_BEAMS * Q4_N_CAND];
	int n_ext = 0;
	for (int b = 0; b < active_beams; b++) {
	for (int c = 0; c < Q4_N_CAND; c++) {
	double ext_err = beams[b].acc_error + cand_errors[blk][c];
	extensions[n_ext].score = cand_score[c] / (ext_err + 1e-15);
	extensions[n_ext].beam_idx = b;
	extensions[n_ext].cand_idx = c;
	n_ext++;
	}
	}

	int top_k = (n_ext < Q4_N_BEAMS) ? n_ext : Q4_N_BEAMS;
	int top_indices[Q4_N_BEAMS];
	for (int k = 0; k < top_k; k++) {
	int best = -1; double best_s = -1e30;
	for (int e = 0; e < n_ext; e++) {
	if (extensions[e].score > best_s) {
	best_s = extensions[e].score; best = e;
	}
	}
	top_indices[k] = best;
	extensions[best].score = -2e30;
	}

	Q4Beam new_beams[Q4_N_BEAMS];
	for (int k = 0; k < top_k; k++) {
	int ei = top_indices[k];
	int sb = extensions[ei].beam_idx;
	int cand = extensions[ei].cand_idx;

	int hist_idx = i * Q4_N_BEAMS + k;
	history[hist_idx].cand_idx = cand;
	history[hist_idx].parent_idx = beams[sb].history_idx;

	new_beams[k].history_idx = hist_idx;
	new_beams[k].acc_error = beams[sb].acc_error + cand_errors[blk][cand];
	}
	for (int k = 0; k < top_k; k++) beams[k] = new_beams[k];
	active_beams = top_k;
	}

	int curr_hist = beams[0].history_idx;
	for (int64_t i = graph_blocks - 1; i >= 0; i--) {
	int group_cidx;
	if (curr_hist >= 0) {
	group_cidx = history[curr_hist].cand_idx;
	curr_hist = history[curr_hist].parent_idx;
	} else {
	group_cidx = 10;
	}

	if (stride <= 1) {
	best_candidate[i] = group_cidx;
	} else {
	/* Per-block local optimization within stride group.
	* Beam picks the quhit bin; each block picks its best
	* candidate in that bin from its own error landscape. */
	int target_bin = Q4_CAND_TO_QUHIT[group_cidx];

	for (int64_t b = i * stride; b < (i+1) * stride && b < n_blocks; b++) {
	float best_err = 1e30f;
	int best_c = group_cidx;
	for (int c = 0; c < Q4_N_CAND; c++) {
	if (Q4_CAND_TO_QUHIT[c] != target_bin) continue;
	if (cand_errors[b][c] < best_err) {
	best_err = cand_errors[b][c];
	best_c = c;
	}
	}
	/* Greedy override if global best is >5% better */
	float global_best = 1e30f;
	int global_best_c = group_cidx;
	for (int c = 0; c < Q4_N_CAND; c++) {
	if (cand_errors[b][c] < global_best) {
	global_best = cand_errors[b][c];
	global_best_c = c;
	}
	}
	if (global_best < best_err * 0.95f)
	best_candidate[b] = global_best_c;
	else
	best_candidate[b] = best_c;
	}
	}
	}
	free(history);

	/* ══════════════════════════════════════════════════════════════
	* Phase 3.5: Born-Rule Multi-Shot Scale Refinement
	*
	* The beam search found the MAP candidate sequence. But the
	* triality marginals encode quantum phase-coherent structure
	* that a greedy beam can miss.
	*
	* Like tesseract_factor's MCMC period recovery (lines 1920-1964):
	* 1. Take N independent Born samples from triality marginals
	* 2. Each sample → full candidate assignment across all blocks
	* 3. Evaluate actual RMSE for each assignment
	* 4. Keep assignment with lowest total RMSE
	*
	* Reuses the EXISTING converged Möbius sheet — zero new BP.
	* ══════════════════════════════════════════════════════════════ */
	{
	#define Q4_BORN_SHOTS 64

	/* Compute beam-search baseline RMSE for comparison */
	float beam_total_err = 0.0f;
	for (int64_t bi = 0; bi < n_blocks; bi++)
	beam_total_err += cand_errors[bi][best_candidate[bi]];

	/* Build per-block CDFs from triality marginals */
	unsigned int born_rng = 314159;

	/* Compute tail error once (blocks beyond graph coverage) */
	float tail_err_q4 = 0.0f;
	for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
	tail_err_q4 += cand_errors[bi][best_candidate[bi]];

	/* Sparse shot buffer: only track stride-sampled blocks */
	int shot_sparse_q4 = (int )malloc(graph_blocks * sizeof(int));

	for (int shot = 0; shot < Q4_BORN_SHOTS; shot++) {
	float shot_err = tail_err_q4;

	for (int64_t gi = 0; gi < graph_blocks; gi++) {
	/* Normalize marginals to CDF */
	double m_total = 0.0;
	for (int v = 0; v < 6; v++) m_total += marg[gi][v];

	/* Born sample: CDF inversion (same as born_sample) */
	born_rng = born_rng * 1664525u + 1013904223u;
	double rnd = (double)(born_rng >> 8) / 16777216.0;
	double target = rnd * m_total;
	double cum = 0.0;
	int sampled_qi = 5;
	for (int v = 0; v < 6; v++) {
	cum += marg[gi][v];
	if (cum > target) { sampled_qi = v; break; }
	}

	/* Find the best candidate WITHIN this quhit bin */
	int64_t blk = gi * stride;
	float best_bin_err = 1e30f;
	int best_bin_cand = 10; /* default */
	for (int ci = 0; ci < Q4_N_CAND; ci++) {
	if (Q4_CAND_TO_QUHIT[ci] == sampled_qi) {
	if (cand_errors[blk][ci] < best_bin_err) {
	best_bin_err = cand_errors[blk][ci];
	best_bin_cand = ci;
	}
	}
	}

	shot_sparse_q4[gi] = best_bin_cand;
	shot_err += cand_errors[blk][best_bin_cand];
	}

	/* Metropolis acceptance: adopt if better than current best */
	if (shot_err < beam_total_err) {
	for (int64_t gi = 0; gi < graph_blocks; gi++)
	best_candidate[gi * stride] = shot_sparse_q4[gi];
	beam_total_err = shot_err;
	}
	}

	free(shot_sparse_q4);
	}

	free(marg);
	hpc_destroy(graph);
	}
	}

	/* ══════════════════════════════════════════════════════════════════
	* PHASE 4: Assemble blocks via least-squares scale extraction
	*
	* The factorer assembles a frequency register from BP marginals,
	* then EXTRACTS the exact period via continued fractions.
	*
	* We do the same: the beam search / Born shots selected a grid
	* candidate (the "assembled frequency"). Now we EXTRACT the exact
	* optimal FP16 scale via weighted least-squares (the "CF step").
	*
	* For Q4_0: d_optimal = Σ(w_j × x_j × q̃_j) / Σ(w_j × q̃_j²)
	* where q̃_j = (q_j - 8) and q_j is quantized at the grid scale.
	*
	* This iterates: quantize at d_init → compute d_optimal → re-quantize
	* → re-compute. The loop below runs up to 5 iterations; since Q4_0
	* has only 16 levels, the assignment is expected to stabilize early.
	*
	* The grid gave us 16 possible scales. This gives us 65,536 (all FP16).
	* ══════════════════════════════════════════════════════════════════ */

	#pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err)
	for (int64_t blk = 0; blk < n_blocks; blk++) {
	const float bw = weights + blk QK4_0;
	int cidx = best_candidate[blk];

	/* Start from the grid-selected scale (the "assembled frequency") */
	float d_current = gguf_fp16_to_fp32(cand_d16[blk][cidx]);

	/* Analog assembly: iterate to full convergence.
	* 5 iterations for stable (d, q-values) coupling. */
	for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
	if (d_current < 1e-15f) break;
	float id = 1.0f / d_current;

	/* Quantize at current scale */
	int qs_tmp[QK4_0];
	for (int j = 0; j < QK4_0; j++) {
	int q = (int)(bw[j] * id + 8.5f);
	if (q < 0) q = 0; if (q > 15) q = 15;
	qs_tmp[j] = q;
	}

	/* Weighted least-squares: d = Σ(w × x × q̃) / Σ(w × q̃²)
	* where q̃ = q - 8 (centered quantized value) */
	float num = 0.0f, den = 0.0f;
	for (int j = 0; j < QK4_0; j++) {
	float q_centered = (float)qs_tmp[j] - 8.0f;
	float w = (imat_importance) ?
	imat_importance[blk * QK4_0 + j] : 1.0f;
	num += w * bw[j] * q_centered;
	den += w * q_centered * q_centered;
	}

	if (den > 1e-15f) {
	float d_new = num / den;
	/* Clamp magnitude to prevent runaway (Q4_0 d can be negative) */
	float d_seed = gguf_fp16_to_fp32(cand_d16[blk][cidx]);
	if (fabsf(d_new) < 4.0f * (fabsf(d_seed) + 1e-10f)) {
	uint16_t d16 = gguf_fp32_to_fp16(d_new);
	d_current = gguf_fp16_to_fp32(d16);
	}
	}
	}

	/* ── FP16 ULP neighborhood search + sign-flip exploration ──
	* The WLS solve found the continuous-optimal d. But FP16 truncation
	* may shift the optimum. Try ±4 ULP around d in FP16 space, plus
	* the negated scale, and pick the one with minimum reconstruction error. */
	{
	uint16_t base_d16 = gguf_fp32_to_fp16(d_current);
	uint16_t best_d16 = base_d16;
	float best_ulp_err = 1e30f;

	/* Try ±4 ULP neighborhood (at most 9 entries) + one sign flip = up to 10 candidates; the buffer below holds 17 */
	uint16_t ulp_candidates[17];
	int n_ulp = 0;
	for (int delta = -4; delta <= 4; delta++) {
	int cand16 = (int)base_d16 + delta;
	if (cand16 >= 0 && cand16 <= 0x7BFF) /* valid positive FP16 */
	ulp_candidates[n_ulp++] = (uint16_t)cand16;
	}
	/* Sign-flipped d: negate and try ±0 ULP */
	{
	float neg_d = -d_current;
	uint16_t neg_d16 = gguf_fp32_to_fp16(neg_d);
	ulp_candidates[n_ulp++] = neg_d16;
	}

	for (int ui = 0; ui < n_ulp; ui++) {
	float trial_d = gguf_fp16_to_fp32(ulp_candidates[ui]);
	float trial_id = (fabsf(trial_d) > 1e-15f) ? 1.0f / trial_d : 0.0f;
	float err = 0.0f;
	for (int j = 0; j < QK4_0; j++) {
	int q = (int)(bw[j] * trial_id + 8.5f);
	if (q < 0) q = 0; if (q > 15) q = 15;
	float deq = ((float)q - 8.0f) * trial_d;
	float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
	err += (bw[j] - deq) * (bw[j] - deq) * w;
	}
	if (err < best_ulp_err) {
	best_ulp_err = err;
	best_d16 = ulp_candidates[ui];
	}
	}
	d_current = gguf_fp16_to_fp32(best_d16);
	}

	/* Store the extracted optimal FP16 scale */
	output[blk].d = gguf_fp32_to_fp16(d_current);
	float actual_d = d_current;
	float id = (fabsf(actual_d) > 1e-15f) ? 1.0f / actual_d : 0.0f;

	/* ── D₆ Hadamard Error Shaping for Q4_0 ──
	* 32 elements per block = 5 full D₆ groups of 6 + 2 tail.
	* Apply the same antipodal fold as Q2_K: minimize vesica energy
	* to push quantization noise into wave (high-frequency) modes
	* that cancel in dot products. */

	/* Step 1: Standard nearest-rounding as baseline */
	int q_base[QK4_0], q_shaped[QK4_0];
	float q_cont[QK4_0];
	for (int j = 0; j < QK4_0; j++) {
	q_cont[j] = bw[j] * id + 8.0f;
	q_base[j] = (int)(q_cont[j] + 0.5f);
	if (q_base[j] < 0) q_base[j] = 0;
	if (q_base[j] > 15) q_base[j] = 15;
	}
	memcpy(q_shaped, q_base, QK4_0 * sizeof(int));

	/* Step 2: D₆ greedy flipping on 5 groups of 6 */
	for (int g = 0; g < 5; g++) {
	int g_off = g * 6;

	for (int pass = 0; pass < 6; pass++) {
	int best_k = -1;
	int best_q_alt = 0;
	float best_delta = 0.0f;

	/* Current group errors */
	float e_cur[6];
	for (int kk = 0; kk < 6; kk++) {
	float deq = ((float)q_shaped[g_off+kk] - 8.0f) * actual_d;
	e_cur[kk] = bw[g_off+kk] - deq;
	}

	/* Current D₆ metric: vesica energy + DC² */
	float vesica_cur = 0.0f, dc_cur = 0.0f;
	for (int p = 0; p < 3; p++) {
	float v = e_cur[p] + e_cur[p+3];
	vesica_cur += v * v;
	}
	for (int kk = 0; kk < 6; kk++) dc_cur += e_cur[kk];
	float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;

	/* Try flipping each element */
	for (int k = 0; k < 6; k++) {
	int idx = g_off + k;
	int q_cur = q_shaped[idx];

	int q_try;
	if (q_cont[idx] - (float)q_cur >= 0) {
	q_try = q_cur + 1;
	} else {
	q_try = q_cur - 1;
	}
	if (q_try < 0 \|\| q_try > 15) continue;

	/* Alt errors */
	float e_alt[6];
	for (int kk = 0; kk < 6; kk++) e_alt[kk] = e_cur[kk];
	float deq_try = ((float)q_try - 8.0f) * actual_d;
	e_alt[k] = bw[idx] - deq_try;

	/* Alt D₆ metric */
	float vesica_alt = 0.0f, dc_alt = 0.0f;
	for (int p = 0; p < 3; p++) {
	float v = e_alt[p] + e_alt[p+3];
	vesica_alt += v * v;
	}
	for (int kk = 0; kk < 6; kk++) dc_alt += e_alt[kk];
	float metric_alt = 4.0f * vesica_alt + dc_alt * dc_alt;

	float delta = metric_cur - metric_alt;
	if (delta > best_delta) {
	best_delta = delta;
	best_k = k;
	best_q_alt = q_try;
	}
	}

	if (best_k < 0) break;
	q_shaped[g_off + best_k] = best_q_alt;
	}
	}

	/* Step 3: Error comparison — keep shaped only if MSE doesn't worsen >5% */
	float err_base = 0.0f, err_shaped = 0.0f;
	for (int j = 0; j < QK4_0; j++) {
	float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
	float deq_b = ((float)q_base[j] - 8.0f) * actual_d;
	float deq_s = ((float)q_shaped[j] - 8.0f) * actual_d;
	err_base += (bw[j] - deq_b) * (bw[j] - deq_b) * w;
	err_shaped += (bw[j] - deq_s) * (bw[j] - deq_s) * w;
	}
	int q_final = (err_shaped <= err_base 1.05f) ? q_shaped : q_base;

	/* Pack nibbles and compute error */
	for (int j = 0; j < QK4_0 / 2; j++) {
	int q0 = q_final[j];
	int q1 = q_final[j + QK4_0/2];
	output[blk].qs[j] = (uint8_t)(q0 \| (q1 << 4));

	float deq0 = ((float)q0 - 8.0f) * actual_d;
	float deq1 = ((float)q1 - 8.0f) * actual_d;
	total_err += (bw[j] - deq0) * (bw[j] - deq0) + (bw[j + QK4_0/2] - deq1) * (bw[j + QK4_0/2] - deq1);
	}
	}

	*out_total_error = total_err;
	free(greedy_d);
	free(cand_errors);
	free(cand_d16);
	free(best_candidate);
	}

	static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
	BlockQ2K output, float out_total_error,
	OptimizerMode opt_mode,
	const float *imat_importance,
	int verbose)
	{
	int64_t n_blocks = n_elements / QK_K;
	float total_err = 0.0f;
	const int N_SUB = QK_K / 16;

	init_scale_table();

	/* ══════════════════════════════════════════════════════════════════
	* PHASE 1: Greedy quantization — produce seed (d, dmin) per block
	* ══════════════════════════════════════════════════════════════════ */

	/* Store Phase A/B results for all blocks */
	typedef struct {
	float dm, mm; /* greedy d, dmin (fp32) */
	uint16_t d_fp16, dmin_fp16; /* greedy d, dmin (fp16) */
	uint8_t Ls[16], Lm[16]; /* sub-block scale/min indices */
	float scales[16], mins[16], sw[16];
	} BlockSeed;

	BlockSeed seeds = (BlockSeed )calloc(n_blocks, sizeof(BlockSeed));

	#pragma omp parallel for schedule(dynamic, 64)
	for (int64_t blk = 0; blk < n_blocks; blk++) {
	const float block_x = weights + blk QK_K;
	uint8_t L[QK_K], Laux[16];
	float wt[16];

	float sumx2 = 0;
	for (int i = 0; i < QK_K; i++) sumx2 += block_x[i] * block_x[i];
	float sigma2 = sumx2 / (float)QK_K;

	for (int j = 0; j < N_SUB; j++) {
	const float sx = block_x + 16 j;
	seeds[blk].sw[j] = 0;
	for (int l = 0; l < 16; l++) {
	float imp = (imat_importance) ? imat_importance[blk * QK_K + 16 * j + l] : 1.0f;
	wt[l] = imp * sqrtf(sigma2 + sx[l] * sx[l]);
	seeds[blk].sw[j] += wt[l];
	}
	seeds[blk].scales[j] = hpc_make_qkx2_quants(16, 3, sx, wt,
	L + 16 * j, &seeds[blk].mins[j], Laux);
	}

	seeds[blk].dm = hpc_make_qp_quants(N_SUB, 15, seeds[blk].scales,
	seeds[blk].Ls, seeds[blk].sw);
	seeds[blk].mm = hpc_make_qp_quants(N_SUB, 15, seeds[blk].mins,
	seeds[blk].Lm, seeds[blk].sw);
	seeds[blk].d_fp16 = gguf_fp32_to_fp16(seeds[blk].dm);
	seeds[blk].dmin_fp16 = gguf_fp32_to_fp16(seeds[blk].mm);
	}

	/* ══════════════════════════════════════════════════════════════════
	* PHASE 2: WLS-Optimal Candidate Generation
	*
	* Instead of a fixed multiplier grid centered on greedy seeds,
	* we first solve a 5-iteration Weighted Least-Squares to find
	* the true optimal (d, dmin) per block, then generate the
	* 16×16 candidate grid centered on THOSE optimal values.
	* This makes the candidate space data-driven, not fabricated.
	* ══════════════════════════════════════════════════════════════════ */

	/* Wide neighborhood around WLS optimum: ±20% with asymmetric spacing
	* — finer near 1.0 for precision, wider at edges for exploration.
	* Critical for large-σ weights where the optimal (d,dmin) may be
	* far from the WLS seed. */
	static const float NEIGHBOR_MULTS_D[N_CAND_D] = {
	0.800f, 0.850f, 0.890f, 0.920f, 0.945f, 0.965f, 0.980f, 0.990f,
	1.010f, 1.020f, 1.035f, 1.055f, 1.080f, 1.110f, 1.150f, 1.200f
	};
	static const float NEIGHBOR_MULTS_M[N_CAND_M] = {
	0.800f, 0.850f, 0.890f, 0.920f, 0.945f, 0.965f, 0.980f, 0.990f,
	1.010f, 1.020f, 1.035f, 1.055f, 1.080f, 1.110f, 1.150f, 1.200f
	};
	/* Map 16 candidates → 6 quhit states for BP encoding */
	static const int CAND_TO_QUHIT[16] = {
	0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5
	};

	/* candidate_errors[blk][256] — weighted MSE per candidate */
	float (*candidate_errors)[TOTAL_SCALE_CANDIDATES] = NULL;
	uint16_t (*candidate_d)[TOTAL_SCALE_CANDIDATES] = NULL;
	uint16_t (*candidate_dmin)[TOTAL_SCALE_CANDIDATES] = NULL;
	/* Per-candidate Ls/Lm — must recompute for each (d, dmin) */
	uint8_t (*candidate_Ls)[TOTAL_SCALE_CANDIDATES][16] = NULL;
	uint8_t (*candidate_Lm)[TOTAL_SCALE_CANDIDATES][16] = NULL;

	candidate_errors = (float (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
	sizeof(float[TOTAL_SCALE_CANDIDATES]));
	candidate_d = (uint16_t (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
	sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
	candidate_dmin = (uint16_t (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
	sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
	candidate_Ls = (uint8_t (*)[TOTAL_SCALE_CANDIDATES][16])calloc(n_blocks,
	sizeof(uint8_t[TOTAL_SCALE_CANDIDATES][16]));
	candidate_Lm = (uint8_t (*)[TOTAL_SCALE_CANDIDATES][16])calloc(n_blocks,
	sizeof(uint8_t[TOTAL_SCALE_CANDIDATES][16]));

	#pragma omp parallel for schedule(dynamic, 16)
	for (int64_t blk = 0; blk < n_blocks; blk++) {
	const float block_x = weights + blk QK_K;

	/* ── Step 2a: WLS solve to find optimal (d, dmin) ──
	* Seed from Phase 1 greedy, then run 5 WLS passes.
	* Q2_K model: x[j,k] ≈ d × Ls[j] × q[j,k] - dmin × Lm[j]
	* This is a 2-variable WLS: minimize Σ w×(x - d×a + dmin×b)² */
	float wls_dm = seeds[blk].dm;
	float wls_mm = seeds[blk].mm;
	uint8_t wls_Ls[16], wls_Lm[16];
	memcpy(wls_Ls, seeds[blk].Ls, 16);
	memcpy(wls_Lm, seeds[blk].Lm, 16);

	for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
	/* Quantize all elements at current (wls_dm, wls_mm) */
	uint8_t L_wls[QK_K];
	for (int j = 0; j < N_SUB; j++) {
	float d_sub = wls_dm * (float)wls_Ls[j];
	float m_sub = wls_mm * (float)wls_Lm[j];
	if (d_sub < 1e-15f) {
	for (int k = 0; k < 16; k++) L_wls[16*j+k] = 0;
	continue;
	}
	for (int k = 0; k < 16; k++) {
	int q = gguf_nearest_int((block_x[16*j+k] + m_sub) / d_sub);
	if (q < 0) q = 0; if (q > 3) q = 3;
	L_wls[16*j+k] = (uint8_t)q;
	}
	}

	/* Accumulate 2×2 normal equations */
	double Saa = 0, Sab = 0, Sbb = 0, Sxa = 0, Sxb = 0;
	for (int j = 0; j < N_SUB; j++) {
	float ls_f = (float)wls_Ls[j];
	float lm_f = (float)wls_Lm[j];
	for (int k = 0; k < 16; k++) {
	float x = block_x[16*j+k];
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + 16*j+k] : 1.0f;
	float a = ls_f * (float)L_wls[16*j+k];
	float b = lm_f;
	Saa += w * a * a;
	Sab += w * a * b;
	Sbb += w * b * b;
	Sxa += w * x * a;
	Sxb += w * x * b;
	}
	}

	/* Solve via Cramer's rule */
	double det = Saa * Sbb - Sab * Sab;
	if (fabs(det) > 1e-30) {
	double d_new = (Sbb * Sxa - Sab * Sxb) / det;
	double dm_new = (Sab * Sxa - Saa * Sxb) / det;
	/* Clamp: positive and within 4× of seed (prevent runaway) */
	if (d_new > 0.0 && d_new < 4.0 * (seeds[blk].dm + 1e-10))
	wls_dm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_new));
	if (dm_new > 0.0 && dm_new < 4.0 * (seeds[blk].mm + 1e-10))
	wls_mm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)dm_new));
	}

	/* Re-derive Ls/Lm for updated (d, dmin) */
	for (int j = 0; j < N_SUB; j++) {
	if (wls_dm > 1e-15f) {
	int ls = gguf_nearest_int(seeds[blk].scales[j] / wls_dm);
	if (ls < 0) ls = 0; if (ls > 15) ls = 15;
	wls_Ls[j] = (uint8_t)ls;
	} else { wls_Ls[j] = 0; }
	if (wls_mm > 1e-15f) {
	int lm = gguf_nearest_int(seeds[blk].mins[j] / wls_mm);
	if (lm < 0) lm = 0; if (lm > 15) lm = 15;
	wls_Lm[j] = (uint8_t)lm;
	} else { wls_Lm[j] = 0; }
	}
	}

	/* ── Step 2b: Generate 16×16 candidates centered on WLS optimum ──
	* Grid is now centered on (wls_dm, wls_mm) not (greedy_dm, greedy_mm).
	* Tighter spacing because we're already near the true minimum. */
	for (int di = 0; di < N_CAND_D; di++) {
	float trial_dm = wls_dm * NEIGHBOR_MULTS_D[di];
	uint16_t trial_d16 = gguf_fp32_to_fp16(trial_dm);
	float actual_dm = gguf_fp16_to_fp32(trial_d16);

	for (int mi = 0; mi < N_CAND_M; mi++) {
	int cidx = di * N_CAND_M + mi;
	float trial_mm = wls_mm * NEIGHBOR_MULTS_M[mi];
	uint16_t trial_dmin16 = gguf_fp32_to_fp16(trial_mm);
	float actual_mm = gguf_fp16_to_fp32(trial_dmin16);

	candidate_d[blk][cidx] = trial_d16;
	candidate_dmin[blk][cidx] = trial_dmin16;

	/* Recompute Ls/Lm for THIS candidate dm/mm */
	uint8_t trial_Ls[16], trial_Lm[16];
	for (int j = 0; j < N_SUB; j++) {
	if (actual_dm > 1e-15f) {
	int ls = gguf_nearest_int(seeds[blk].scales[j] / actual_dm);
	if (ls < 0) ls = 0; if (ls > 15) ls = 15;
	trial_Ls[j] = (uint8_t)ls;
	} else {
	trial_Ls[j] = 0;
	}
	if (actual_mm > 1e-15f) {
	int lm = gguf_nearest_int(seeds[blk].mins[j] / actual_mm);
	if (lm < 0) lm = 0; if (lm > 15) lm = 15;
	trial_Lm[j] = (uint8_t)lm;
	} else {
	trial_Lm[j] = 0;
	}
	}
	memcpy(candidate_Ls[blk][cidx], trial_Ls, 16);
	memcpy(candidate_Lm[blk][cidx], trial_Lm, 16);

	/* Fully re-quantize and measure weighted MSE */
	float err = 0.0f;
	for (int j = 0; j < N_SUB; j++) {
	float d = actual_dm * (float)trial_Ls[j];
	float m = actual_mm * (float)trial_Lm[j];
	if (d < 1e-15f) {
	for (int k = 0; k < 16; k++) {
	float x = block_x[16 * j + k];
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + 16 * j + k] : 1.0f;
	err += x * x * w;
	}
	continue;
	}
	for (int k = 0; k < 16; k += 6) {
	int g_len = (k + 6 <= 16) ? 6 : (16 - k);
	int half_g = g_len / 2;
	float e_cur[6], w_cur[6];

	for (int kk = 0; kk < g_len; kk++) {
	int idx = 16 * j + k + kk;
	float x = block_x[idx];
	int q = gguf_nearest_int((x + m) / d);
	if (q < 0) q = 0; if (q > 3) q = 3;
	float deq = d * (float)q - m;
	e_cur[kk] = x - deq;
	w_cur[kk] = (imat_importance) ? imat_importance[blk * QK_K + idx] : 1.0f;
	}

	/* Decompose into vesica and wave */
	float vesica_err = 0.0f, wave_err = 0.0f;
	for (int p = 0; p < half_g; p++) {
	float v = e_cur[p] + e_cur[p + half_g];
	float w_wave = e_cur[p] - e_cur[p + half_g];
	float w_avg = (w_cur[p] + w_cur[p + half_g]) * 0.5f;
	vesica_err += v * v * w_avg;
	wave_err += w_wave * w_wave * w_avg;
	}
	/* Triality weighting: penalize vesica 4×, wave 1× */
	err += 0.5f * (4.0f * vesica_err + 1.0f * wave_err);
	}
	}
	candidate_errors[blk][cidx] = err;
	}
	}
	}

	/* ══════════════════════════════════════════════════════════════════
	* PHASE 3: HPC Graph — Shor's Griffiths-Niu Measurement
	*
	* Build a multi-quhit graph where each block has 2 quhits
	* (6 states each) encoding the candidate errors, with the
	* per-axis candidates binned via CAND_TO_QUHIT. Shor's sequential
	* measurement (IDFT6 + feed-forward + collapse/back-action) extracts
	* marginals for (d, dmin) per block — replaces BP.
	* ══════════════════════════════════════════════════════════════════ */

	/* Default: use candidate index 10 * N_CAND_M + 10, i.e. NEIGHBOR_MULTS_D[10] × NEIGHBOR_MULTS_M[10] (1.035 × 1.035; the grid has no exact 1.00 entry) */
	int best_candidate = (int )malloc(n_blocks * sizeof(int));
	for (int64_t i = 0; i < n_blocks; i++)
	best_candidate[i] = 10 * N_CAND_M + 10; /* NEIGHBOR_MULTS_D[10]=1.00, _M[10]=1.00 */

	if (opt_mode != OPT_MSE && n_blocks >= 2) {
	int64_t graph_blocks = (n_blocks > 2000) ? 2000 : n_blocks;
	int64_t stride = n_blocks / graph_blocks;
	float temperature = 0.5f;
	int64_t n_sites = graph_blocks * QUHITS_PER_BLOCK;

	HPCGraph *graph = hpc_create(n_sites);
	if (graph) {
	for (int64_t i = 0; i < n_sites; i++)
	triality_dft(&graph->locals[i]);

	/* Encode each stride group's AGGREGATED candidate errors as dual-quhit
	* amplitudes. For stride > 1, average errors across ALL blocks in
	* the group — not just the first block. This is critical for large
	* tensors where stride=97 means 96/97 blocks were being ignored. */

	/* Compute adaptive temperature from the mean of the per-block maximum
	* candidate error, sampled over the first (up to) 100 graph blocks.
	* This is meant to keep the Boltzmann encoding meaningful
	* regardless of weight magnitude (σ=0.0003 vs σ=0.024). */
	{
	double err_accum = 0.0;
	int err_count = 0;
	for (int64_t gi = 0; gi < graph_blocks && gi < 100; gi++) {
	int64_t blk = gi * stride;
	float max_e = 0.0f;
	for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
	if (candidate_errors[blk][c] > max_e)
	max_e = candidate_errors[blk][c];
	err_accum += (double)max_e;
	err_count++;
	}
	if (err_count > 0) {
	float median_err = (float)(err_accum / err_count);
	/* Temperature = 10% of the mean max error (the variable is named
	* median_err, but it holds an average) — sharp enough to
	* discriminate, soft enough for Shor interference */
	temperature = median_err * 0.1f;
	if (temperature < 1e-10f) temperature = 1e-10f;
	}
	}

	for (int64_t i = 0; i < graph_blocks; i++) {
	/* Aggregate errors across entire stride group */
	float agg_errors[TOTAL_SCALE_CANDIDATES];
	for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
	agg_errors[c] = 0.0f;

	int64_t blk_start = i * stride;
	int64_t blk_end = blk_start + stride;
	if (blk_end > n_blocks) blk_end = n_blocks;
	int64_t group_size = blk_end - blk_start;

	for (int64_t b = blk_start; b < blk_end; b++) {
	for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
	agg_errors[c] += candidate_errors[b][c];
	}
	/* Average across group */
	if (group_size > 1) {
	float inv_gs = 1.0f / (float)group_size;
	for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
	agg_errors[c] *= inv_gs;
	}

	float min_err = 1e30f;
	for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
	if (agg_errors[c] < min_err)
	min_err = agg_errors[c];

	/* Quhit 0 (coarse = d dimension): marginalize over dmin */
	double coarse_re[6];
	double coarse_norm = 0.0;
	for (int qi = 0; qi < 6; qi++) coarse_re[qi] = 0.0;
	for (int di = 0; di < N_CAND_D; di++) {
	int qi = CAND_TO_QUHIT[di];
	for (int mi = 0; mi < N_CAND_M; mi++) {
	int cidx = di * N_CAND_M + mi;
	coarse_re[qi] += exp(-(double)(agg_errors[cidx] - min_err) /
	(2.0 * (double)temperature));
	}
	}
	for (int qi = 0; qi < 6; qi++)
	coarse_norm += coarse_re[qi] * coarse_re[qi];
	if (coarse_norm > 1e-30) {
	double inv = 1.0 / sqrt(coarse_norm);
	for (int v = 0; v < 6; v++) coarse_re[v] *= inv;
	}

	/* Quhit 1 (fine = dmin dimension): marginalize over d */
	double fine_re[6];
	double fine_norm = 0.0;
	for (int qi = 0; qi < 6; qi++) fine_re[qi] = 0.0;
	for (int mi = 0; mi < N_CAND_M; mi++) {
	int qi = CAND_TO_QUHIT[mi];
	for (int di = 0; di < N_CAND_D; di++) {
	int cidx = di * N_CAND_M + mi;
	fine_re[qi] += exp(-(double)(agg_errors[cidx] - min_err) /
	(2.0 * (double)temperature));
	}
	}
	for (int qi = 0; qi < 6; qi++)
	fine_norm += fine_re[qi] * fine_re[qi];
	if (fine_norm > 1e-30) {
	double inv = 1.0 / sqrt(fine_norm);
	for (int v = 0; v < 6; v++) fine_re[v] *= inv;
	}

	/* Write quhits */
	int64_t s0 = 2 * i, s1 = 2 * i + 1;
	for (int v = 0; v < 6; v++) {
	graph->locals[s0].edge_re[v] = coarse_re[v];
	graph->locals[s0].edge_im[v] = 0.0;
	graph->locals[s1].edge_re[v] = fine_re[v];
	graph->locals[s1].edge_im[v] = 0.0;
	}
	graph->locals[s0].primary = VIEW_EDGE;
	graph->locals[s0].dirty = DIRTY_VERTEX \| DIRTY_DIAGONAL \| DIRTY_FOLDED;
	graph->locals[s0].delta_valid = 0;
	triality_update_mask(&graph->locals[s0]);
	graph->locals[s1].primary = VIEW_EDGE;
	graph->locals[s1].dirty = DIRTY_VERTEX \| DIRTY_DIAGONAL \| DIRTY_FOLDED;
	graph->locals[s1].delta_valid = 0;
	triality_update_mask(&graph->locals[s1]);
	}

	/* Build edges */
	for (int64_t i = 0; i < graph_blocks; i++) {
	hpc_cz(graph, 2 * i, 2 * i + 1); /* intra-block: d ↔ dmin */
	if (i + 1 < graph_blocks) {
	hpc_cz(graph, 2 * i, 2 * (i + 1)); /* d ↔ d neighbor */
	hpc_cz(graph, 2 * i + 1, 2 * (i + 1) + 1); /* dmin ↔ dmin */
	}
	}

	/* ── Shor's Griffiths-Niu Sequential Measurement (dual quhit) ──
	* Replaces BP with exact marginals via IDFT6 + feed-forward +
	* collapse/back-action (ported 1:1 from tesseract_factor.c).
	*
	* The dual-quhit graph has 2×graph_blocks sites:
	* Even sites (s0 = 2*i): coarse (d dimension)
	* Odd sites (s1 = 2*i+1): fine (dmin dimension)
	*
	* Single-pass sequential measurement produces exact marginals
	* for both dimensions simultaneously through the CZ correlations. */
	double (shor_marg)[6] = (double ()[6])calloc(n_sites, sizeof(double[6]));
	int shor_measured = (int )calloc(n_sites, sizeof(int));

	shor_measure_graph(graph, n_sites, shor_marg, shor_measured, 1);

	/* Extract coarse (d) and fine (dmin) marginals from Shor output */
	double (coarse_marg)[6] = (double ()[6])calloc(graph_blocks, sizeof(double[6]));
	double (fine_marg)[6] = (double ()[6])calloc(graph_blocks, sizeof(double[6]));

	for (int64_t i = 0; i < graph_blocks; i++) {
	for (int v = 0; v < 6; v++) {
	coarse_marg[i][v] = shor_marg[2 * i][v];
	fine_marg[i][v] = shor_marg[2 * i + 1][v];
	}
	}

	free(shor_marg);
	free(shor_measured);

	/* ══ Hensel-Inspired Beam Search Constraint Propagation ══
	* Like tesseract_factor's Hensel lift: process blocks sequentially,
	* maintain K best configurations, prune by accumulated error.
	*
	* The constraint: blocks are selected JOINTLY. */

	#define N_BEAMS 24 /* K beams — widened for 31B (was 12) */

	typedef struct {
	double acc_error;
	int history_idx; /* index into the backpointer array */
	} QuantBeam;

	typedef struct {
	int cand_idx;
	int parent_idx;
	} BeamHistory;

	QuantBeam beams[N_BEAMS];
	int active_beams = 1;

	/* Pre-allocate history to avoid O(N^2) memory copies */
	BeamHistory history = (BeamHistory )malloc(n_blocks * N_BEAMS * sizeof(BeamHistory));

	for (int b = 0; b < N_BEAMS; b++) {
	beams[b].acc_error = 0.0;
	beams[b].history_idx = -1;
	}

	/* Process blocks sequentially with beam search */
	for (int64_t i = 0; i < graph_blocks; i++) {
	double c_total = 0.0, f_total = 0.0;
	for (int v = 0; v < 6; v++) {
	c_total += coarse_marg[i][v];
	f_total += fine_marg[i][v];
	}

	/* Candidate scores for this block: triality prob × (1/normalized_error) */
	double cand_score[TOTAL_SCALE_CANDIDATES];
	int64_t blk = i * stride;
	int d_bin_count[6] = {0}, m_bin_count[6] = {0};
	for (int k = 0; k < N_CAND_D; k++) d_bin_count[CAND_TO_QUHIT[k]]++;
	for (int k = 0; k < N_CAND_M; k++) m_bin_count[CAND_TO_QUHIT[k]]++;
	/* Per-block error normalization: divide by block mean error
	* so small-weight blocks don't dominate beam selection */
	float blk_mean_err = 0.0f;
	for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
	blk_mean_err += candidate_errors[blk][c];
	blk_mean_err /= (float)TOTAL_SCALE_CANDIDATES;
	if (blk_mean_err < 1e-30f) blk_mean_err = 1e-30f;
	for (int di = 0; di < N_CAND_D; di++) {
	int qi_d = CAND_TO_QUHIT[di];
	double p_d = (c_total > 1e-30) ? coarse_marg[i][qi_d] / c_total : 1.0/6.0;
	p_d /= (double)d_bin_count[qi_d];
	for (int mi = 0; mi < N_CAND_M; mi++) {
	int qi_m = CAND_TO_QUHIT[mi];
	double p_m = (f_total > 1e-30) ? fine_marg[i][qi_m] / f_total : 1.0/6.0;
	p_m /= (double)m_bin_count[qi_m];
	int cidx = di * N_CAND_M + mi;
	cand_score[cidx] = p_d * p_m / (candidate_errors[blk][cidx] / blk_mean_err + 1e-15);
	}
	}

	/* Extend every active beam by all TOTAL_SCALE_CANDIDATES candidates, keep top K */
	typedef struct { double score; int beam_idx; int cand_idx; } BeamExt;
	BeamExt extensions[N_BEAMS * TOTAL_SCALE_CANDIDATES];
	int n_ext = 0;

	for (int b = 0; b < active_beams; b++) {
	for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) {
	/* Score = cand_score / (accumulated_error + this_block_error); higher is better */
	double ext_err = beams[b].acc_error + candidate_errors[blk][c];
	double ext_score = cand_score[c] / (ext_err + 1e-15);
	extensions[n_ext].score = ext_score;
	extensions[n_ext].beam_idx = b;
	extensions[n_ext].cand_idx = c;
	n_ext++;
	}
	}

	/* Top-K selection */
	int top_k = (n_ext < N_BEAMS) ? n_ext : N_BEAMS;
	int top_indices[N_BEAMS];
	for (int k = 0; k < top_k; k++) {
	int best = -1;
	double best_s = -1e30;
	for (int e = 0; e < n_ext; e++) {
	if (extensions[e].score > best_s) {
	best_s = extensions[e].score;
	best = e;
	}
	}
	top_indices[k] = best;
	extensions[best].score = -2e30; /* poison */
	}

	/* Build new beams from top-K extensions using backpointers */
	QuantBeam new_beams[N_BEAMS];
	for (int k = 0; k < top_k; k++) {
	int ext_idx = top_indices[k];
	int src_beam = extensions[ext_idx].beam_idx;
	int cand = extensions[ext_idx].cand_idx;

	int hist_idx = i * N_BEAMS + k;
	history[hist_idx].cand_idx = cand;
	history[hist_idx].parent_idx = beams[src_beam].history_idx;

	new_beams[k].history_idx = hist_idx;
	new_beams[k].acc_error = beams[src_beam].acc_error
	+ candidate_errors[blk][cand];
	}

	for (int k = 0; k < top_k; k++)
	beams[k] = new_beams[k];
	active_beams = top_k;
	}

	/* Trace back the best beam's selections.
	* The beam search selects one candidate per GRAPH NODE (stride group).
	* For stride > 1, each block within the stride group independently
	* picks its own best candidate — using the beam's coarse/fine quhit
	* bins as a constraint, but evaluating its own candidate_errors.
	* This eliminates stride-aliasing: previously 96/97 blocks were
	* forced to use a candidate chosen for 1 representative block. */
	int curr_hist = beams[0].history_idx;
	for (int64_t i = graph_blocks - 1; i >= 0; i--) {
	int group_cidx;
	if (curr_hist >= 0) {
	group_cidx = history[curr_hist].cand_idx;
	curr_hist = history[curr_hist].parent_idx;
	} else {
	group_cidx = 10 * N_CAND_M + 10;
	}

	if (stride <= 1) {
	/* No stride group — direct assignment */
	best_candidate[i] = group_cidx;
	} else {
	/* Per-block local optimization within the stride group.
	* The beam-selected candidate determines the target quhit
	* bins (d_bin, dmin_bin). Each block picks its own best
	* candidate that falls in compatible bins, or falls back
	* to the globally best candidate for that block. */
	int group_di = group_cidx / N_CAND_M;
	int group_mi = group_cidx % N_CAND_M;
	int target_d_bin = CAND_TO_QUHIT[group_di];
	int target_m_bin = CAND_TO_QUHIT[group_mi];

	for (int64_t b = i * stride; b < (i+1) * stride && b < n_blocks; b++) {
	/* Find best candidate in same quhit bins */
	float best_err = 1e30f;
	int best_c = group_cidx;

	for (int di = 0; di < N_CAND_D; di++) {
	if (CAND_TO_QUHIT[di] != target_d_bin) continue;
	for (int mi = 0; mi < N_CAND_M; mi++) {
	if (CAND_TO_QUHIT[mi] != target_m_bin) continue;
	int cidx = di * N_CAND_M + mi;
	if (candidate_errors[b][cidx] < best_err) {
	best_err = candidate_errors[b][cidx];
	best_c = cidx;
	}
	}
	}

	/* Also check if the block's overall best is significantly
	* better — if so, use it (greedy override) */
	float global_best = 1e30f;
	int global_best_c = group_cidx;
	for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) {
	if (candidate_errors[b][c] < global_best) {
	global_best = candidate_errors[b][c];
	global_best_c = c;
	}
	}

	/* Use bin-constrained choice unless the global best
	* is >5% better — preserves Shor coherence while
	* allowing escape from bad bin assignments */
	if (global_best < best_err * 0.95f)
	best_candidate[b] = global_best_c;
	else
	best_candidate[b] = best_c;
	}
	}
	}

	free(history);

	/* ══════════════════════════════════════════════════════════════
	* Phase 3.5: Born-Rule Multi-Shot Scale Refinement (Q2_K)
	*
	* 2D Born sampling: sample coarse quhit (d dimension) and
	* fine quhit (dmin dimension) jointly from triality marginals.
	* Each shot produces a (d_idx, dmin_idx) pair per block.
	* ══════════════════════════════════════════════════════════════ */
	{
	#define Q2K_BORN_SHOTS 64

	float beam_total_err = 0.0f;
	for (int64_t bi = 0; bi < n_blocks; bi++)
	beam_total_err += candidate_errors[bi][best_candidate[bi]];

	unsigned int born_rng_q2 = 271828;
	/* Compute tail error once (blocks beyond graph coverage) */
	float tail_err = 0.0f;
	for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
	tail_err += candidate_errors[bi][best_candidate[bi]];

	/* Sparse shot buffer: only track stride-sampled blocks */
	int shot_sparse = (int )malloc(graph_blocks * sizeof(int));

	for (int shot = 0; shot < Q2K_BORN_SHOTS; shot++) {
	float shot_err = tail_err;

	for (int64_t gi = 0; gi < graph_blocks; gi++) {
	/* Born sample coarse (d) quhit */
	double c_total = 0.0;
	for (int v = 0; v < 6; v++) c_total += coarse_marg[gi][v];
	born_rng_q2 = born_rng_q2 * 1664525u + 1013904223u;
	double rnd_c = (double)(born_rng_q2 >> 8) / 16777216.0;
	double target_c = rnd_c * c_total;
	double cum_c = 0.0;
	int qi_d = 5;
	for (int v = 0; v < 6; v++) {
	cum_c += coarse_marg[gi][v];
	if (cum_c > target_c) { qi_d = v; break; }
	}

	/* Born sample fine (dmin) quhit */
	double f_total = 0.0;
	for (int v = 0; v < 6; v++) f_total += fine_marg[gi][v];
	born_rng_q2 = born_rng_q2 * 1664525u + 1013904223u;
	double rnd_f = (double)(born_rng_q2 >> 8) / 16777216.0;
	double target_f = rnd_f * f_total;
	double cum_f = 0.0;
	int qi_m = 5;
	for (int v = 0; v < 6; v++) {
	cum_f += fine_marg[gi][v];
	if (cum_f > target_f) { qi_m = v; break; }
	}

	/* Find best candidate within the sampled (d_bin, m_bin) */
	int64_t blk = gi * stride;
	float best_bin_err = 1e30f;
	int best_bin_cand = 10 * N_CAND_M + 10;
	for (int di = 0; di < N_CAND_D; di++) {
	if (CAND_TO_QUHIT[di] != qi_d) continue;
	for (int mi = 0; mi < N_CAND_M; mi++) {
	if (CAND_TO_QUHIT[mi] != qi_m) continue;
	int cidx = di * N_CAND_M + mi;
	if (candidate_errors[blk][cidx] < best_bin_err) {
	best_bin_err = candidate_errors[blk][cidx];
	best_bin_cand = cidx;
	}
	}
	}

	shot_sparse[gi] = best_bin_cand;
	shot_err += candidate_errors[blk][best_bin_cand];
	}

	if (shot_err < beam_total_err) {
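	/* Only now apply the sparse updates to best_candidate.
	* NOTE(review): shot_err covers only the stride-sampled blocks plus
	* the tail, while beam_total_err was initialised over all n_blocks;
	* for stride > 1 these totals are not like-for-like — confirm. */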
	for (int64_t gi = 0; gi < graph_blocks; gi++)
	best_candidate[gi * stride] = shot_sparse[gi];
	beam_total_err = shot_err;
	}
	}

	free(shot_sparse);
	}

	free(coarse_marg);
	free(fine_marg);
	hpc_destroy(graph);
	}
	} else {
	/* OPT_MSE or single block: pick candidate with lowest raw error */
	for (int64_t blk = 0; blk < n_blocks; blk++) {
	float best_err = candidate_errors[blk][0];
	int best_idx = 0;
	for (int c = 1; c < TOTAL_SCALE_CANDIDATES; c++) {
	if (candidate_errors[blk][c] < best_err) {
	best_err = candidate_errors[blk][c];
	best_idx = c;
	}
	}
	best_candidate[blk] = best_idx;
	}
	}

	/* ══════════════════════════════════════════════════════════════════
	* PHASE 4: Assemble blocks via least-squares (d, dmin) extraction
	*
	* Like Q4_0's CF analog: the beam search / Born shots selected a
	* grid candidate (d_grid, dmin_grid). Now we EXTRACT the exact
	* optimal FP16 (d, dmin) via weighted least-squares, holding the
	* sub-block Ls/Lm and quantized levels fixed.
	*
	* Q2_K model: x[j,k] ≈ d × Ls[j] × q[j,k] - dmin × Lm[j]
	*
	* Full analog assembly: at each iteration, EXHAUSTIVELY search
	* all 16×16 = 256 possible (Ls[j], Lm[j]) pairs per sub-block
	* to find the assignment that minimizes weighted reconstruction
	* error. Then WLS-solve for the global (d, dmin). Repeat 3×.
	*
	* This guarantees every parameter is at its conditional optimum —
	* the perfect bit analog at 2-bit resolution.
	* ══════════════════════════════════════════════════════════════════ */

	/* Pre-allocate one HPCGraph per OMP thread for sub-block Shor measurement.
	* This eliminates ~776K malloc/free cycles from the inner loop.
	* Each thread reuses its graph via hpc_reset_for_subblock(). */
	int _n_omp_threads = 1;
	#ifdef _OPENMP
	_n_omp_threads = omp_get_max_threads();
	#endif
	HPCGraph _tl_graphs = (HPCGraph )calloc(_n_omp_threads, sizeof(HPCGraph *));
	for (int _ti = 0; _ti < _n_omp_threads; _ti++)
	_tl_graphs[_ti] = hpc_create(N_SUB);

	#pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err)
	for (int64_t blk = 0; blk < n_blocks; blk++) {
	const float block_x = weights + blk QK_K;
	int cidx = best_candidate[blk];
	uint8_t Ls_blk[16], Lm_blk[16];

	/* Start from HPC-selected candidate */
	memcpy(Ls_blk, candidate_Ls[blk][cidx], 16);
	memcpy(Lm_blk, candidate_Lm[blk][cidx], 16);

	float dm = gguf_fp16_to_fp32(candidate_d[blk][cidx]);
	float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);

	/* ── Analog assembly: iterate to convergence ──
	* 3 iterations: the (Ls,Lm) ↔ (d,dmin) coupling stabilizes
	* after 2-3 passes. Additional iterations produce negligible
	* change in the committed FP16 values.
	* A) Sub-block Shor measurement to find coupled (Ls,Lm) states
	* B) Optimal q-value assignment
	* C) WLS solve for (d, dmin) */
	for (int ls_iter = 0; ls_iter < 3; ls_iter++) {

	/* ── Step A: Sub-block Quhit BP (Strategy 1) ──
	* For each sub-block j, evaluate all 256 (Ls, Lm) pairs.
	* Keep the 6 best pairs as quhit states for a 16-node graph.
	* Run BP to jointly select the globally optimal (Ls, Lm). */
	uint8_t state_ls[N_SUB][6];
	uint8_t state_lm[N_SUB][6];
	float state_err[N_SUB][6];

	for (int j = 0; j < N_SUB; j++) {
	const float sx = block_x + 16 j;
	for (int v = 0; v < 6; v++) state_err[j][v] = 1e30f;

	for (int try_ls = 0; try_ls <= 15; try_ls++) {
	float d_sub = dm * (float)try_ls;
	for (int try_lm = 0; try_lm <= 15; try_lm++) {
	float m_sub = mm * (float)try_lm;
	float sub_err = 0.0f;

	for (int k = 0; k < 16; k++) {
	float x = sx[k];
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + 16*j + k] : 1.0f;
	int q = 0;
	if (d_sub >= 1e-15f) {
	q = gguf_nearest_int((x + m_sub) / d_sub);
	if (q < 0) q = 0; if (q > 3) q = 3;
	}
	float deq = d_sub * (float)q - m_sub;
	float diff = x - deq;
	sub_err += diff * diff * w;
	}

	/* Insert into top 6 */
	for (int v = 0; v < 6; v++) {
	if (sub_err < state_err[j][v]) {
	for (int u = 5; u > v; u--) {
	state_err[j][u] = state_err[j][u-1];
	state_ls[j][u] = state_ls[j][u-1];
	state_lm[j][u] = state_lm[j][u-1];
	}
	state_err[j][v] = sub_err;
	state_ls[j][v] = (uint8_t)try_ls;
	state_lm[j][v] = (uint8_t)try_lm;
	break;
	}
	}
	}
	}
	}

	/* Reset thread-local sub-block graph (zero allocations) */
	int _tid = 0;
	#ifdef _OPENMP
	_tid = omp_get_thread_num();
	#endif
	HPCGraph *sg = _tl_graphs[_tid];
	hpc_reset_for_subblock(sg, N_SUB);
	{
	float min_sub_err[N_SUB];
	for (int j = 0; j < N_SUB; j++) min_sub_err[j] = state_err[j][0];

	/* Initialize unary potentials from local errors */
	for (int j = 0; j < N_SUB; j++) {
	triality_dft(&sg->locals[j]);
	double amp_re[6];
	double amp_norm = 0.0;
	for (int v = 0; v < 6; v++) {
	/* Adaptive temperature: scale with local error spread
	* so Shor measurement produces meaningful interference
	* patterns regardless of weight magnitude */
	float err_spread = state_err[j][5] - state_err[j][0];
	float sub_temp = (err_spread > 1e-15f) ? err_spread * 0.3f : 0.1f;
	if (sub_temp < 1e-12f) sub_temp = 1e-12f;
	amp_re[v] = exp(-(double)(state_err[j][v] - min_sub_err[j]) / (double)sub_temp);
	amp_norm += amp_re[v] * amp_re[v];
	}
	if (amp_norm > 1e-30) {
	double inv = 1.0 / sqrt(amp_norm);
	for (int v = 0; v < 6; v++) amp_re[v] *= inv;
	}
	for (int v = 0; v < 6; v++) {
	sg->locals[j].edge_re[v] = amp_re[v];
	sg->locals[j].edge_im[v] = 0.0;
	}
	sg->locals[j].primary = VIEW_EDGE;
	sg->locals[j].dirty = DIRTY_VERTEX \| DIRTY_DIAGONAL \| DIRTY_FOLDED;
	sg->locals[j].delta_valid = 0;
	triality_update_mask(&sg->locals[j]);
	}

	/* Add coupling edges between adjacent sub-blocks */
	for (int j = 0; j < N_SUB - 1; j++)
	hpc_cz(sg, j, j + 1);

	/* ── Shor sequential measurement on sub-block graph ──
	* Stack-allocated arrays: eliminates 2 calloc/free per iteration */
	double sub_marg[N_SUB][6];
	int sub_measured[N_SUB];
	memset(sub_marg, 0, sizeof(sub_marg));
	memset(sub_measured, 0, sizeof(sub_measured));

	shor_measure_graph(sg, N_SUB, sub_marg, sub_measured, 1);

	/* Extract optimal Ls/Lm from Shor marginals */
	for (int j = 0; j < N_SUB; j++) {
	double best_prob = -1.0;
	int best_v = 0;
	for (int v = 0; v < 6; v++) {
	if (sub_marg[j][v] > best_prob) {
	best_prob = sub_marg[j][v];
	best_v = v;
	}
	}
	Ls_blk[j] = state_ls[j][best_v];
	Lm_blk[j] = state_lm[j][best_v];
	}
	}

	/* ── Step B: Quantize q-values with optimal Ls/Lm ── */
	uint8_t L[QK_K];
	for (int j = 0; j < N_SUB; j++) {
	float d_sub = dm * (float)Ls_blk[j];
	float m_sub = mm * (float)Lm_blk[j];
	if (d_sub < 1e-15f) {
	for (int k = 0; k < 16; k++) L[16*j+k] = 0;
	continue;
	}
	for (int k = 0; k < 16; k++) {
	int q = gguf_nearest_int((block_x[16*j+k] + m_sub) / d_sub);
	if (q < 0) q = 0; if (q > 3) q = 3;
	L[16*j+k] = (uint8_t)q;
	}
	}

	/* ── Step C: WLS solve for (d, dmin) ──
	* x[j,k] ≈ d × Ls[j] × q[j,k] - dmin × Lm[j]
	* Let a = Ls[j]×q[j,k], b = Lm[j]
	* Normal equations via Cramer's rule */
	double Saa = 0, Sab = 0, Sbb = 0, Sxa = 0, Sxb = 0;
	for (int j = 0; j < N_SUB; j++) {
	float ls_f = (float)Ls_blk[j];
	float lm_f = (float)Lm_blk[j];
	for (int k = 0; k < 16; k++) {
	float x = block_x[16*j+k];
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + 16*j+k] : 1.0f;
	float a = ls_f * (float)L[16*j+k];
	float b = lm_f;
	Saa += w * a * a;
	Sab += w * a * b;
	Sbb += w * b * b;
	Sxa += w * x * a;
	Sxb += w * x * b;
	}
	}

	double det = Saa * Sbb - Sab * Sab;
	if (fabs(det) > 1e-30) {
	double d_new = (Sbb * Sxa - Sab * Sxb) / det;
	double dm_new = (Sab * Sxa - Saa * Sxb) / det;
	/* Clamp: positive and within 4× of candidate seed */
	float d_seed = gguf_fp16_to_fp32(candidate_d[blk][cidx]);
	float m_seed = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
	if (d_new > 0.0 && d_new < 4.0 * (d_seed + 1e-10))
	dm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_new));
	if (dm_new > 0.0 && dm_new < 4.0 * (m_seed + 1e-10))
	mm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)dm_new));
	}
	if (isnan(dm) \|\| isnan(mm)) {
	printf("NaN detected before ULP: dm=%f mm=%f det=%f\n", dm, mm, det);
	exit(1);
	}
	}

	/* ── FP16 ULP neighborhood search for (d, dmin) ──
	* The WLS solve found continuous-optimal (d, dmin). But FP16
	* truncation may shift the optimum. Try ±2 ULP around both
	* d and dmin, pick the pair with minimum reconstruction error. */
	{
	uint16_t base_d16 = gguf_fp32_to_fp16(dm);
	uint16_t base_m16 = gguf_fp32_to_fp16(mm);
	uint16_t best_d16 = base_d16, best_m16 = base_m16;
	float best_ulp_err = 1e30f;

	for (int dd = -2; dd <= 2; dd++) {
	int cd16 = (int)base_d16 + dd;
	if (cd16 < 0 \|\| cd16 > 0x7BFF) continue;
	float trial_dm = gguf_fp16_to_fp32((uint16_t)cd16);

	for (int dm_delta = -2; dm_delta <= 2; dm_delta++) {
	int cm16 = (int)base_m16 + dm_delta;
	if (cm16 < 0 \|\| cm16 > 0x7BFF) continue;
	float trial_mm = gguf_fp16_to_fp32((uint16_t)cm16);

	float err = 0.0f;
	for (int j = 0; j < N_SUB; j++) {
	float d_sub = trial_dm * (float)Ls_blk[j];
	float m_sub = trial_mm * (float)Lm_blk[j];
	for (int k = 0; k < 16; k++) {
	float x = block_x[16*j+k];
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + 16*j+k] : 1.0f;
	int q;
	if (d_sub < 1e-15f) { q = 0; }
	else {
	q = gguf_nearest_int((x + m_sub) / d_sub);
	if (q < 0) q = 0; if (q > 3) q = 3;
	}
	float deq = d_sub * (float)q - m_sub;
	float diff = x - deq;
	err += diff * diff * w;
	}
	}
	if (err < best_ulp_err) {
	best_ulp_err = err;
	best_d16 = (uint16_t)cd16;
	best_m16 = (uint16_t)cm16;
	}
	}
	}
	dm = gguf_fp16_to_fp32(best_d16);
	mm = gguf_fp16_to_fp32(best_m16);
	}

	/* ── Final Ls/Lm re-optimization at committed FP16 (d, dmin) ──
	* The WLS solve may have shifted (d, dmin) after the last Step A.
	* Neighborhood search ±2 around current values (25 pairs vs 256)
	* is sufficient since WLS shifts are typically < 1 Ls/Lm step. */
	for (int j = 0; j < N_SUB; j++) {
	const float sx = block_x + 16 j;
	float best_sub_err = 1e30f;
	uint8_t best_ls = Ls_blk[j], best_lm = Lm_blk[j];
	int ls_lo = (Ls_blk[j] > 2) ? Ls_blk[j] - 2 : 0;
	int ls_hi = (Ls_blk[j] < 13) ? Ls_blk[j] + 2 : 15;
	int lm_lo = (Lm_blk[j] > 2) ? Lm_blk[j] - 2 : 0;
	int lm_hi = (Lm_blk[j] < 13) ? Lm_blk[j] + 2 : 15;
	for (int try_ls = ls_lo; try_ls <= ls_hi; try_ls++) {
	float d_sub = dm * (float)try_ls;
	for (int try_lm = lm_lo; try_lm <= lm_hi; try_lm++) {
	float m_sub = mm * (float)try_lm;
	float sub_err = 0.0f;
	for (int k = 0; k < 16; k++) {
	float x = sx[k];
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + 16*j + k] : 1.0f;
	int q;
	if (d_sub < 1e-15f) { q = 0; }
	else {
	q = gguf_nearest_int((x + m_sub) / d_sub);
	if (q < 0) q = 0; if (q > 3) q = 3;
	}
	float deq = d_sub * (float)q - m_sub;
	float diff = x - deq;
	sub_err += diff * diff * w;
	}
	if (sub_err < best_sub_err) {
	best_sub_err = sub_err;
	best_ls = (uint8_t)try_ls;
	best_lm = (uint8_t)try_lm;
	}
	}
	}
	Ls_blk[j] = best_ls;
	Lm_blk[j] = best_lm;
	}

	/* Store the extracted optimal FP16 (d, dmin) */
	output[blk].d = gguf_fp32_to_fp16(dm);
	output[blk].dmin = gguf_fp32_to_fp16(mm);

	for (int j = 0; j < N_SUB; j++)
	output[blk].scales[j] = Ls_blk[j] \| (Lm_blk[j] << 4);

	/* ── Final quantization with D₆ Hadamard Error Shaping ──
	*
	* Standard Q2_K rounds each weight independently: q = round((x+m)/d).
	* But within a sub-block, weights share (d, m), so their quantization
	* errors are CORRELATED. Independent rounding is suboptimal.
	*
	* The D₆ fold (antipodal Hadamard from the triality quhit) decomposes
	* the error vector into vesica (sum) and wave (difference) components:
	* vesica[k] = (e[k] + e[k+3]) / √2 — DC-like, accumulates in dot products
	* wave[k] = (e[k] - e[k+3]) / √2 — noise-like, cancels in dot products
	*
	* We WANT large wave error and small vesica error. So we greedily
	* flip rounding decisions (floor↔ceil) to minimize vesica energy,
	* even if total element-wise error increases slightly.
	*
	* Process: 16 elements per sub-block, treat as 2 groups of 6 + 4 tail.
	* Apply DFT₆-fold to each group of 6, minimize vesica component.
	*/
	uint8_t L[QK_K];
	for (int j = 0; j < N_SUB; j++) {
	float d = dm * (float)(output[blk].scales[j] & 0xF);
	if (d < 1e-15f) {
	for (int k = 0; k < 16; k++) L[16 * j + k] = 0;
	continue;
	}
	float m = mm * (float)(output[blk].scales[j] >> 4);
	float id = 1.0f / d;

	/* Step 1: Standard nearest-rounding as baseline */
	int q_base[16];
	float q_cont[16]; /* continuous q values before rounding */
	for (int k = 0; k < 16; k++) {
	q_cont[k] = (block_x[16j+k] + m) id;
	q_base[k] = gguf_nearest_int(q_cont[k]);
	if (q_base[k] < 0) q_base[k] = 0;
	if (q_base[k] > 3) q_base[k] = 3;
	}

	/* Step 2: D₆ Hadamard Error Shaping
	* For each 6-element group, greedily flip the rounding decision
	* that most reduces the D₆-folded vesica error component.
	*
	* D₆ fold on 6-element groups: antipodal pairs (0,3), (1,4), (2,5)
	* vesica[k] = e[k] + e[k+3] (k=0,1,2) — DC-like, propagates
	* wave[k] = e[k] - e[k+3] (k=0,1,2) — noise-like, cancels
	*
	* Weight vesica 4× over wave + penalize DC (sum of all 6 errors) */
	int q_shaped[16];
	memcpy(q_shaped, q_base, 16 * sizeof(int));

	/* Process groups: [0..5], [6..11]; tail [12..15] is not shaped and keeps its nearest-rounding baseline */
	for (int g = 0; g < 2; g++) {
	int g_off = g * 6;
	if (g_off + 5 >= 16) break;

	/* Multiple greedy passes — each pass finds the single best flip */
	for (int pass = 0; pass < 6; pass++) {
	int best_k = -1;
	int best_q_alt = 0;
	float best_delta = 0.0f; /* improvement = current_metric - alt_metric */

	/* Compute current group errors */
	float e_cur[6];
	for (int kk = 0; kk < 6; kk++) {
	int ii = g_off + kk;
	float deq = d * (float)q_shaped[ii] - m;
	e_cur[kk] = block_x[16*j+ii] - deq;
	}

	/* Current D₆ metric: vesica energy + DC² */
	float vesica_cur = 0.0f, dc_cur = 0.0f;
	for (int p = 0; p < 3; p++) {
	float v = e_cur[p] + e_cur[p+3];
	vesica_cur += v * v;
	}
	for (int kk = 0; kk < 6; kk++) dc_cur += e_cur[kk];
	float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;

	/* Try flipping each element */
	for (int k = 0; k < 6; k++) {
	int idx = g_off + k;
	int q_cur = q_shaped[idx];

	/* Try the alternative rounding */
	int q_try;
	if (q_cont[idx] - (float)q_cur >= 0) {
	q_try = q_cur + 1;
	} else {
	q_try = q_cur - 1;
	}
	if (q_try < 0 \|\| q_try > 3) continue;

	/* Compute alt errors (only element k changes) */
	float e_alt[6];
	for (int kk = 0; kk < 6; kk++) e_alt[kk] = e_cur[kk];
	float deq_try = d * (float)q_try - m;
	e_alt[k] = block_x[16*j+idx] - deq_try;

	/* Alt D₆ metric */
	float vesica_alt = 0.0f, dc_alt = 0.0f;
	for (int p = 0; p < 3; p++) {
	float v = e_alt[p] + e_alt[p+3];
	vesica_alt += v * v;
	}
	for (int kk = 0; kk < 6; kk++) dc_alt += e_alt[kk];
	float metric_alt = 4.0f * vesica_alt + dc_alt * dc_alt;

	float delta = metric_cur - metric_alt;
	if (delta > best_delta) {
	best_delta = delta;
	best_k = k;
	best_q_alt = q_try;
	}
	}

	if (best_k < 0) break; /* no improvement found */
	q_shaped[g_off + best_k] = best_q_alt; /* commit the flip */
	}
	}

	/* Step 3: Final error comparison — only keep shaped if it improves
	* or is within 5% of baseline (vesica shaping trades element MSE
	* for better spectral distribution of error) */
	float err_base = 0.0f, err_shaped = 0.0f;
	for (int k = 0; k < 16; k++) {
	float x = block_x[16*j+k];
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + 16*j + k] : 1.0f;
	float deq_b = d * (float)q_base[k] - m;
	float deq_s = d * (float)q_shaped[k] - m;
	err_base += (x - deq_b) * (x - deq_b) * w;
	err_shaped += (x - deq_s) * (x - deq_s) * w;
	}

	int q_final = (err_shaped <= err_base 1.05f) ? q_shaped : q_base;
	for (int k = 0; k < 16; k++)
	L[16 * j + k] = (uint8_t)q_final[k];
	}

	for (int j = 0; j < QK_K; j += 128) {
	for (int l = 0; l < 32; l++) {
	output[blk].qs[j / 4 + l] = L[j + l]
	\| (L[j + l + 32] << 2)
	\| (L[j + l + 64] << 4)
	\| (L[j + l + 96] << 6);
	}
	}

	float berr = gguf_q2_k_block_error(block_x, &output[blk]);
	if (isnan(berr)) {
	printf("NaN block error at blk %ld! dm=%f mm=%f\n", (long)blk, dm, mm);
	for (int j=0; j<16; j++) printf("Ls[%d]=%d Lm[%d]=%d\n", j, Ls_blk[j], j, Lm_blk[j]);
	exit(1);
	}
	total_err += berr;
	}

	/* Free thread-local sub-block graphs */
	for (int _ti = 0; _ti < _n_omp_threads; _ti++)
	hpc_destroy(_tl_graphs[_ti]);
	free(_tl_graphs);

	free(seeds);
	free(candidate_errors);
	free(candidate_d);
	free(candidate_dmin);
	free(candidate_Ls);
	free(candidate_Lm);
	free(best_candidate);
	if (out_total_error) *out_total_error = total_err;

	if (verbose) {
	float rmse = sqrtf(total_err / (float)n_elements);

	/* Compute weight σ for fidelity classification */
	double w_sum2 = 0.0;
	for (int64_t i = 0; i < n_elements; i++)
	w_sum2 += (double)weights[i] * (double)weights[i];
	float w_sigma = (float)sqrt(w_sum2 / (double)n_elements);
	float rmse_over_sigma = (w_sigma > 1e-15f) ? rmse / w_sigma : 0.0f;

	/* Fidelity classification */
	const char *fidelity_class;
	const char *fidelity_icon;
	if (rmse <= 1.0e-04f) {
	fidelity_class = "ULTRA (≤1e-04)";
	fidelity_icon = "★★★★";
	} else if (rmse <= 3.0e-04f) {
	fidelity_class = "HIGH (≤3e-04)";
	fidelity_icon = "★★★☆";
	} else if (rmse <= 1.0e-03f) {
	fidelity_class = "GOOD (≤1e-03)";
	fidelity_icon = "★★☆☆";
	} else {
	fidelity_class = "STANDARD";
	fidelity_icon = "★☆☆☆";
	}

	printf("\n ┌──── Shor Measurement Q2_K Report ────────────────────────────────┐\n");
	printf(" │ Elements: %-12lld Blocks: %-12lld │\n",
	(long long)n_elements, (long long)(n_elements / QK_K));
	printf(" │ Weight σ: %-12.4e Range: [%.4e, %.4e] │\n",
	w_sigma, w_sigma * -4.0f, w_sigma * 4.0f);
	printf(" │ Total MSE: %-12.6f │\n", total_err);
	printf(" │ RMSE: %-12.4e RMSE/σ: %-8.4f │\n",
	rmse, rmse_over_sigma);
	printf(" │ Fidelity: %s %-14s │\n",
	fidelity_icon, fidelity_class);
	printf(" │ Engine: Shor Griffiths-Niu (IDFT6 + feed-forward) │\n");
	printf(" └─────────────────────────────────────────────────────────────────┘\n");
	}
	}


	/* ═══════════════════════════════════════════════════════════════════════════
	* PROGRESS REPORTING
	* ═══════════════════════════════════════════════════════════════════════════ */

	/*
	 * print_progress_bar — redraw a one-line textual progress bar on stdout.
	 *
	 *   current     Number of completed items. Values outside [0, total] are
	 *               clamped for the bar, the percentage and the ETA; the raw
	 *               value is still shown in the "(current/total)" counter.
	 *   total       Total number of items. Nothing is printed when total <= 0.
	 *   label       Text appended after the timing fields; NULL is printed as "".
	 *   start_time  clock() value captured when the work started.
	 *
	 * The line begins with '\r' so successive calls overwrite each other, and a
	 * newline is emitted only when current == total. The line is not erased
	 * before it is redrawn, so a label shorter than the previous one can leave
	 * trailing characters of the old label visible.
	 *
	 * Elapsed time and ETA are derived from clock(), i.e. processor time used by
	 * the program, which is not necessarily the same as wall-clock time.
	 */
	static void print_progress_bar(int current, int total, const char *label,
	                               clock_t start_time)
	{
	    enum { BAR_WIDTH = 40 };

	    if (total <= 0)
	        return;

	    /* Completed fraction, clamped so bar fill, percentage and ETA stay sane
	     * even if the caller passes current < 0 or current > total. */
	    float pct = (float)current / (float)total;
	    if (pct < 0.0f)
	        pct = 0.0f;
	    else if (pct > 1.0f)
	        pct = 1.0f;

	    const int filled = (int)(pct * BAR_WIDTH);

	    const double elapsed = (double)(clock() - start_time) / CLOCKS_PER_SEC;
	    /* Linear extrapolation of the remaining time; reported as 0 until more
	     * than 1% of the work is done. */
	    const double eta = (pct > 0.01f) ? elapsed / pct * (1.0 - pct) : 0.0;

	    fputs("\r [", stdout);
	    for (int i = 0; i < BAR_WIDTH; i++) {
	        if (i < filled)
	            fputs("█", stdout);       /* completed cells */
	        else if (i == filled)
	            fputs("▓", stdout);       /* leading edge */
	        else
	            fputs("░", stdout);       /* remaining cells */
	    }
	    /* Passing NULL for %s is undefined behaviour, so substitute "". */
	    printf("] %3d%% (%d/%d) %.0fs ETA:%.0fs %s",
	           (int)(pct * 100), current, total, elapsed, eta,
	           label ? label : "");
	    fflush(stdout);

	    if (current == total)
	        fputs("\n", stdout);
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* GGUF FILE WRITER — Assembles the complete output file
	* ═══════════════════════════════════════════════════════════════════════════ */

	static int write_gguf(const char output_path, const STMultiFile mf,
	const ModelArchitecture *arch,
	const TokenizerData *tokenizer,
	OptimizerMode opt_mode,
	const IMatrixData *imatrix,
	int verbose)
	{
	FILE *fp = fopen(output_path, "wb");
	if (!fp) {
	fprintf(stderr, " ERROR: Cannot open '%s' for writing\n", output_path);
	return -1;
	}

	printf("\n ╔════════════════════════════════════════════════════════════════╗\n");
	printf(" ║ WRITING GGUF FILE ║\n");
	printf(" ╚════════════════════════════════════════════════════════════════╝\n\n");

	/* ── Determine which tensors to include ── */
	int include_list = (int )calloc(mf->n_tensors, sizeof(int));
	int n_include = 0;
	for (int i = 0; i < mf->n_tensors; i++) {
	if (!should_skip_tensor(mf->tensor_map[i].name)) {
	include_list[n_include++] = i;
	} else {
	if (verbose) printf(" SKIP: %s (not needed in GGUF)\n", mf->tensor_map[i].name);
	}
	}

	/* ── Count metadata KV pairs ── */
	int n_kv = 0;
	n_kv++; /* general.architecture */
	n_kv++; /* general.name */
	n_kv++; /* general.quantization_version */
	n_kv++; /* general.file_type */
	n_kv++; /* {arch}.context_length */
	n_kv++; /* {arch}.embedding_length */
	n_kv++; /* {arch}.block_count */
	n_kv++; /* {arch}.feed_forward_length */
	n_kv++; /* {arch}.attention.head_count */
	n_kv++; /* {arch}.attention.head_count_kv */
	n_kv++; /* {arch}.attention.layer_norm_rms_epsilon */
	n_kv++; /* {arch}.rope.freq_base */
	n_kv++; /* {arch}.vocab_size */

	/* Tokenizer metadata KV count */
	int has_tokenizer = (tokenizer != NULL && tokenizer->vocab_size > 0);
	if (has_tokenizer) {
	n_kv++; /* tokenizer.ggml.model */
	n_kv++; /* tokenizer.ggml.tokens */
	n_kv++; /* tokenizer.ggml.scores */
	n_kv++; /* tokenizer.ggml.token_type */
	n_kv++; /* tokenizer.ggml.bos_token_id */
	n_kv++; /* tokenizer.ggml.eos_token_id */
	n_kv++; /* tokenizer.ggml.unknown_token_id */
	if (tokenizer->n_merges > 0)
	n_kv++; /* tokenizer.ggml.merges */
	}

	/* ── Check for weight tying ──
	* If tie_word_embeddings is set and there's no separate lm_head,
	* llama.cpp handles this internally — do NOT duplicate the tensor.
	* Only add output.weight if the model has a separate lm_head.weight. */
	int has_lm_head = (st_multi_find_tensor(mf, "lm_head.weight") >= 0);
	int total_tensors = n_include;

	if (arch->tie_word_embeddings && !has_lm_head) {
	printf(" Weight-tied embeddings detected — llama.cpp handles internally\n\n");
	}

	/* ── Prepare tensor info ── */
	char (*gguf_names)[ST_MAX_NAME_LEN] = calloc(total_tensors, ST_MAX_NAME_LEN);
	GGMLType *tensor_types = calloc(total_tensors, sizeof(GGMLType));
	int64_t *tensor_sizes = calloc(total_tensors, sizeof(int64_t));
	uint64_t data_offset = 0;
	uint64_t *tensor_offsets = calloc(total_tensors, sizeof(uint64_t));
	int tensor_src_idx = calloc(total_tensors, sizeof(int)); / map to unified ST index */
	char (*tensor_hf_names)[ST_MAX_NAME_LEN] = calloc(total_tensors, ST_MAX_NAME_LEN);

	GGMLType quant_type = GGML_TYPE_Q2_K;

	for (int i = 0; i < n_include; i++) {
	int src = include_list[i];
	const STTensorInfo *ti = st_multi_tensor_info(mf, src);
	map_tensor_name(mf->tensor_map[src].name, gguf_names[i], ST_MAX_NAME_LEN);
	strncpy(tensor_hf_names[i], mf->tensor_map[src].name, ST_MAX_NAME_LEN - 1);
	tensor_src_idx[i] = src;

	if (should_quantize(ti, gguf_names[i])) {
	if (is_attention_tensor(gguf_names[i])) {
	/* Promote attention Q/K/V/O to Q4_0 for higher precision.
	* Attention scores are most sensitive to quantization noise. */
	tensor_types[i] = GGML_TYPE_Q4_0;
	int64_t n_blocks_q4 = (ti->n_elements + QK4_0 - 1) / QK4_0;
	tensor_sizes[i] = n_blocks_q4 * sizeof(BlockQ4_0);
	if (verbose)
	printf(" [ATTN→Q4_0] %s (%ld elements)\n",
	gguf_names[i], (long)ti->n_elements);
	} else {
	tensor_types[i] = quant_type;
	tensor_sizes[i] = ggml_type_size(quant_type, ti->n_elements);
	}
	} else if (ti->n_dims >= 2) {
	/* 2D non-quantized tensors (embeddings, output) → F16 */
	tensor_types[i] = GGML_TYPE_F16;
	tensor_sizes[i] = ti->n_elements * sizeof(uint16_t);
	} else {
	/* 1D tensors (norms, biases) → F32 */
	tensor_types[i] = GGML_TYPE_F32;
	tensor_sizes[i] = ti->n_elements * sizeof(float);
	}

	tensor_offsets[i] = data_offset;

	/* Align each tensor to 32 bytes */
	data_offset += tensor_sizes[i];
	data_offset = (data_offset + GGUF_DEFAULT_ALIGNMENT - 1) &
	~(uint64_t)(GGUF_DEFAULT_ALIGNMENT - 1);
	}

	/* ── Write header ── */
	gguf_write_header(fp, total_tensors, n_kv);

	/* ── Write metadata KV pairs ── */
	gguf_write_kv_string(fp, "general.architecture", arch->architecture);
	gguf_write_kv_string(fp, "general.name", arch->name);
	gguf_write_kv_uint32(fp, "general.quantization_version", 2);
	gguf_write_kv_uint32(fp, "general.file_type", 10); /* Q2_K = 10 */

	char kbuf[128];
	snprintf(kbuf, sizeof(kbuf), "%s.context_length", arch->architecture);
	gguf_write_kv_uint32(fp, kbuf, arch->context_length);

	snprintf(kbuf, sizeof(kbuf), "%s.embedding_length", arch->architecture);
	gguf_write_kv_uint32(fp, kbuf, arch->embedding_length);

	snprintf(kbuf, sizeof(kbuf), "%s.block_count", arch->architecture);
	gguf_write_kv_uint32(fp, kbuf, arch->block_count);

	snprintf(kbuf, sizeof(kbuf), "%s.feed_forward_length", arch->architecture);
	gguf_write_kv_uint32(fp, kbuf, arch->feed_forward_length);

	snprintf(kbuf, sizeof(kbuf), "%s.attention.head_count", arch->architecture);
	gguf_write_kv_uint32(fp, kbuf, arch->head_count);

	snprintf(kbuf, sizeof(kbuf), "%s.attention.head_count_kv", arch->architecture);
	gguf_write_kv_uint32(fp, kbuf, arch->head_count_kv);

	snprintf(kbuf, sizeof(kbuf), "%s.attention.layer_norm_rms_epsilon", arch->architecture);
	gguf_write_kv_float32(fp, kbuf, arch->rms_norm_eps);

	snprintf(kbuf, sizeof(kbuf), "%s.rope.freq_base", arch->architecture);
	gguf_write_kv_float32(fp, kbuf, arch->rope_freq_base);

	snprintf(kbuf, sizeof(kbuf), "%s.vocab_size", arch->architecture);
	gguf_write_kv_uint32(fp, kbuf, arch->vocab_size);

	/* ── Write tokenizer metadata ── */
	if (has_tokenizer) {
	gguf_write_kv_string(fp, "tokenizer.ggml.model", tokenizer->model_type);
	gguf_write_kv_string_array(fp, "tokenizer.ggml.tokens",
	(const char **)tokenizer->tokens,
	(uint64_t)tokenizer->vocab_size);
	gguf_write_kv_float32_array(fp, "tokenizer.ggml.scores",
	tokenizer->scores,
	(uint64_t)tokenizer->vocab_size);
	gguf_write_kv_int32_array(fp, "tokenizer.ggml.token_type",
	tokenizer->token_types,
	(uint64_t)tokenizer->vocab_size);
	gguf_write_kv_uint32(fp, "tokenizer.ggml.bos_token_id",
	(uint32_t)tokenizer->bos_id);
	gguf_write_kv_uint32(fp, "tokenizer.ggml.eos_token_id",
	(uint32_t)tokenizer->eos_id);
	gguf_write_kv_uint32(fp, "tokenizer.ggml.unknown_token_id",
	(uint32_t)tokenizer->unk_id);
	if (tokenizer->n_merges > 0) {
	gguf_write_kv_string_array(fp, "tokenizer.ggml.merges",
	(const char **)tokenizer->merges,
	(uint64_t)tokenizer->n_merges);
	}
	printf(" Tokenizer metadata written (%d tokens, %d merges)\n\n",
	tokenizer->vocab_size, tokenizer->n_merges);
	}

	/* ── Write tensor info descriptors ── */
	for (int i = 0; i < total_tensors; i++) {
	int src = tensor_src_idx[i];
	const STTensorInfo *ti = st_multi_tensor_info(mf, src);
	uint64_t dims[ST_MAX_DIMS];
	/* GGUF uses reversed dimension order from SafeTensors/PyTorch */
	int nd = ti->n_dims;
	for (int d = 0; d < nd; d++) {
	dims[d] = (uint64_t)ti->shape[nd - 1 - d];
	}
	gguf_write_tensor_info(fp, gguf_names[i],
	ti->n_dims, dims,
	tensor_types[i], tensor_offsets[i]);
	}

	/* ── Alignment padding before data section ── */
	gguf_write_padding(fp, GGUF_DEFAULT_ALIGNMENT);

	/* ── Write tensor data ── */
	printf(" Quantizing and writing %d tensors...\n\n", total_tensors);

	float total_error_sum = 0.0f;
	int quant_count = 0;
	int64_t total_elements_quantized = 0;
	int64_t total_bytes_quantized = 0;
	int64_t total_bytes_unquantized = 0;
	clock_t quant_start = clock();

	for (int i = 0; i < total_tensors; i++) {
	int src = tensor_src_idx[i];
	const STTensorInfo *ti = st_multi_tensor_info(mf, src);

	print_progress_bar(i, total_tensors, gguf_names[i], quant_start);

	if (tensor_types[i] == GGML_TYPE_Q2_K) {
	/* ── HPC-Optimized Q2_K Quantization ── */
	float *f32_data = st_multi_tensor_to_f32(mf, src);
	if (!f32_data) {
	fprintf(stderr, "\n ERROR: Failed to convert tensor '%s' to F32\n",
	ti->name);
	continue;
	}

	int64_t n_elements = ti->n_elements;
	float tensor_error = 0.0f;

	/* Pad to QK_K boundary */
	int64_t padded = (n_elements + QK_K - 1) / QK_K * QK_K;
	if (padded > n_elements) {
	f32_data = realloc(f32_data, padded * sizeof(float));
	for (int64_t j = n_elements; j < padded; j++)
	f32_data[j] = 0.0f;
	n_elements = padded;
	}

	int64_t n_blocks = n_elements / QK_K;
	BlockQ2K *quant_data = calloc(n_blocks, sizeof(BlockQ2K));

	/* Look up imatrix importance for this tensor */
	const float *imp = NULL;
	if (imatrix) {
	const IMatrixEntry *ime = imatrix_find_any(imatrix,
	gguf_names[i], tensor_hf_names[i]);
	if (ime && ime->n_values > 0) {
	imp = ime->normalized;
	if (verbose)
	printf("\n imatrix: using %d importance weights for %s\n",
	ime->n_values, gguf_names[i]);
	}
	}

	quantize_tensor_q2k_hpc(f32_data, n_elements,
	quant_data, &tensor_error,
	opt_mode, imp, verbose);

	fwrite(quant_data, sizeof(BlockQ2K), n_blocks, fp);

	float rmse = sqrtf(tensor_error / (float)ti->n_elements);

	/* Compute weight σ for fidelity gate */
	double wss = 0.0;
	for (int64_t j = 0; j < ti->n_elements; j++)
	wss += (double)f32_data[j] * (double)f32_data[j];
	float w_sig = (float)sqrt(wss / (double)ti->n_elements);

	/* Fidelity gate: classify RMSE vs 1e-04 target */
	const char *fid;
	if (rmse <= 1.0e-04f) fid = "★★★★ ULTRA";
	else if (rmse <= 3.0e-04f) fid = "★★★☆ HIGH";
	else if (rmse <= 1.0e-03f) fid = "★★☆☆ GOOD";
	else fid = "★☆☆☆ STD";

	if (verbose) {
	printf("\n [Q2_K·Shor] %-47s\n", gguf_names[i]);
	printf(" %10ld elements → %ld bytes σ=%.2e RMSE=%.4e %s\n",
	(long)ti->n_elements,
	(long)(n_blocks * sizeof(BlockQ2K)),
	w_sig, rmse, fid);
	}

	total_error_sum += tensor_error;
	total_elements_quantized += ti->n_elements;
	total_bytes_quantized += n_blocks * sizeof(BlockQ2K);
	quant_count++;

	free(quant_data);
	free(f32_data);
	} else if (tensor_types[i] == GGML_TYPE_Q4_0) {
	/* ── HPC-Optimized Q4_0 Quantization (attention tensors) ── */
	float *f32_data = st_multi_tensor_to_f32(mf, src);
	if (!f32_data) {
	fprintf(stderr, "\n ERROR: Failed to convert tensor '%s' to F32\n",
	ti->name);
	continue;
	}

	int64_t n_elements = ti->n_elements;

	/* Pad to QK4_0 boundary */
	int64_t padded = (n_elements + QK4_0 - 1) / QK4_0 * QK4_0;
	if (padded > n_elements) {
	f32_data = realloc(f32_data, padded * sizeof(float));
	for (int64_t j = n_elements; j < padded; j++)
	f32_data[j] = 0.0f;
	n_elements = padded;
	}

	int64_t n_blocks_q4 = n_elements / QK4_0;
	BlockQ4_0 *q4_data = calloc(n_blocks_q4, sizeof(BlockQ4_0));
	float tensor_error = 0.0f;

	/* Look up imatrix importance for this tensor */
	const float *imp = NULL;
	if (imatrix) {
	const IMatrixEntry *ime = imatrix_find_any(imatrix,
	gguf_names[i], tensor_hf_names[i]);
	if (ime && ime->n_values > 0) {
	imp = ime->normalized;
	if (verbose)
	printf("\n imatrix: using %d importance weights for %s\n",
	ime->n_values, gguf_names[i]);
	}
	}

	quantize_tensor_q4_0_hpc(f32_data, n_elements,
	q4_data, &tensor_error,
	imp, verbose);

	fwrite(q4_data, sizeof(BlockQ4_0), n_blocks_q4, fp);

	float rmse = sqrtf(tensor_error / (float)ti->n_elements);

	/* Compute weight σ for fidelity gate */
	double wss4 = 0.0;
	for (int64_t j = 0; j < ti->n_elements; j++)
	wss4 += (double)f32_data[j] * (double)f32_data[j];
	float w_sig4 = (float)sqrt(wss4 / (double)ti->n_elements);

	const char *fid4;
	if (rmse <= 1.0e-04f) fid4 = "★★★★ ULTRA";
	else if (rmse <= 3.0e-04f) fid4 = "★★★☆ HIGH";
	else if (rmse <= 1.0e-03f) fid4 = "★★☆☆ GOOD";
	else fid4 = "★☆☆☆ STD";

	if (verbose) {
	printf("\n [Q4_0·Shor] %-47s\n", gguf_names[i]);
	printf(" %10ld elements → %ld bytes σ=%.2e RMSE=%.4e %s\n",
	(long)ti->n_elements,
	(long)(n_blocks_q4 * sizeof(BlockQ4_0)),
	w_sig4, rmse, fid4);
	}

	total_error_sum += tensor_error;
	total_elements_quantized += ti->n_elements;
	total_bytes_quantized += n_blocks_q4 * sizeof(BlockQ4_0);
	quant_count++;

	free(q4_data);
	free(f32_data);
	} else if (tensor_types[i] == GGML_TYPE_F16) {
	/* ── Store as F16 (embeddings, output, 2D non-quantized) ── */
	float *f32_data = st_multi_tensor_to_f32(mf, src);
	if (!f32_data) {
	fprintf(stderr, "\n ERROR: Failed to convert tensor '%s'\n",
	ti->name);
	continue;
	}

	/* Convert F32 → F16 */
	uint16_t f16_data = (uint16_t )malloc(ti->n_elements * sizeof(uint16_t));
	for (int64_t j = 0; j < ti->n_elements; j++)
	f16_data[j] = gguf_fp32_to_fp16(f32_data[j]);

	fwrite(f16_data, sizeof(uint16_t), ti->n_elements, fp);

	total_bytes_unquantized += ti->n_elements * sizeof(uint16_t);

	if (verbose) {
	printf("\n [F16 ] %-50s %10ld elements → %ld bytes\n",
	gguf_names[i], (long)ti->n_elements,
	(long)(ti->n_elements * sizeof(uint16_t)));
	}

	free(f16_data);
	free(f32_data);
	} else {
	/* ── Keep as F32 (1D: norms, biases) ── */
	float *f32_data = st_multi_tensor_to_f32(mf, src);
	if (!f32_data) {
	fprintf(stderr, "\n ERROR: Failed to convert tensor '%s'\n",
	ti->name);
	continue;
	}

	fwrite(f32_data, sizeof(float), ti->n_elements, fp);

	total_bytes_unquantized += ti->n_elements * sizeof(float);

	if (verbose) {
	printf("\n [F32 ] %-50s %10ld elements → %ld bytes\n",
	gguf_names[i], (long)ti->n_elements,
	(long)(ti->n_elements * sizeof(float)));
	}

	free(f32_data);
	}

	/* Pad to alignment */
	gguf_write_padding(fp, GGUF_DEFAULT_ALIGNMENT);
	}

	print_progress_bar(total_tensors, total_tensors, "done", quant_start);

	long final_size = ftell(fp);
	fclose(fp);

	/* ── Final summary with Shor fidelity metrics ── */
	/* Compute original model size (all as F32) */
	int64_t original_f32_size = 0;
	for (int i = 0; i < total_tensors; i++) {
	const STTensorInfo *ti = st_multi_tensor_info(mf, tensor_src_idx[i]);
	original_f32_size += ti->n_elements * sizeof(float);
	}
	float compression_ratio = (original_f32_size > 0) ?
	(float)original_f32_size / (float)final_size : 0.0f;
	float effective_bpw = (total_elements_quantized > 0) ?
	8.0f * (float)total_bytes_quantized / (float)total_elements_quantized :
	0.0f;
	float total_rmse = (total_elements_quantized > 0) ?
	sqrtf(total_error_sum / (float)total_elements_quantized) : 0.0f;
	float mean_mse_per_tensor = (quant_count > 0) ?
	total_error_sum / (float)quant_count : 0.0f;

	/* Fidelity classification */
	const char overall_fid, overall_icon;
	if (total_rmse <= 1.0e-04f) { overall_fid = "ULTRA (≤1e-04)"; overall_icon = "★★★★"; }
	else if (total_rmse <= 3.0e-04f) { overall_fid = "HIGH (≤3e-04)"; overall_icon = "★★★☆"; }
	else if (total_rmse <= 1.0e-03f) { overall_fid = "GOOD (≤1e-03)"; overall_icon = "★★☆☆"; }
	else { overall_fid = "STANDARD"; overall_icon = "★☆☆☆"; }

	printf("\n ╔════════════════════════════════════════════════════════════════╗\n");
	printf(" ║ SHOR-OPTIMIZED QUANTIZATION SUMMARY ║\n");
	printf(" ╠════════════════════════════════════════════════════════════════╣\n");
	printf(" ║ ║\n");
	printf(" ║ Engine: Griffiths-Niu Sequential Measurement ║\n");
	printf(" ║ Protocol: IDFT6 → feed-forward → Born → collapse ║\n");
	printf(" ║ Origin: tesseract_factor.c (Shor's algorithm) ║\n");
	printf(" ║ ║\n");
	printf(" ╠════════════════════════════════════════════════════════════════╣\n");
	printf(" ║ Tensors quantized: %-33d ║\n", quant_count);
	printf(" ║ Elements quantized: %15ld ║\n",
	(long)total_elements_quantized);
	printf(" ║ Quantized data: %12ld bytes (%6.1f MB) ║\n",
	(long)total_bytes_quantized,
	(double)total_bytes_quantized / (1024.0 * 1024.0));
	printf(" ║ Unquantized data: %12ld bytes (%6.1f MB) ║\n",
	(long)total_bytes_unquantized,
	(double)total_bytes_unquantized / (1024.0 * 1024.0));
	printf(" ║ Effective bits/weight: %15.2f ║\n",
	effective_bpw);
	printf(" ║ Compression ratio: %15.1fx ║\n",
	compression_ratio);
	printf(" ║ ║\n");
	printf(" ╠════════════════════════════════════════════════════════════════╣\n");
	printf(" ║ FIDELITY METRICS (target: 1e-04) ║\n");
	printf(" ╠════════════════════════════════════════════════════════════════╣\n");
	printf(" ║ ║\n");
	printf(" ║ Total MSE: %15.6e ║\n",
	total_error_sum);
	printf(" ║ Per-element RMSE: %15.4e ║\n",
	total_rmse);
	printf(" ║ Mean MSE/tensor: %15.6e ║\n",
	mean_mse_per_tensor);
	printf(" ║ ║\n");
	printf(" ║ Fidelity class: %s %-14s ║\n",
	overall_icon, overall_fid);
	if (total_rmse <= 1.0e-04f)
	printf(" ║ ✓ RMSE ≤ 1e-04: TARGET MET — maximum fidelity achieved ║\n");
	else if (total_rmse <= 3.0e-04f)
	printf(" ║ ◐ RMSE ≤ 3e-04: near target — high fidelity achieved ║\n");
	else
	printf(" ║ ○ RMSE > 3e-04: below target — weight σ may be large ║\n");
	printf(" ║ ║\n");
	printf(" ╠════════════════════════════════════════════════════════════════╣\n");
	printf(" ║ Output file: %ld bytes (%.1f MB)%*s║\n",
	final_size, (double)final_size / (1024.0 * 1024.0),
	(int)(27 - snprintf(NULL, 0, "%ld bytes (%.1f MB)",
	final_size, (double)final_size / (1024.0 * 1024.0))), "");
	printf(" ╚════════════════════════════════════════════════════════════════╝\n\n");

	free(include_list);
	free(gguf_names);
	free(tensor_types);
	free(tensor_sizes);
	free(tensor_offsets);
	free(tensor_src_idx);
	free(tensor_hf_names);

	return 0;
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* LIBRARY API — Exported functions for Python ctypes integration
	*
	* When built with -DHEXSTATE_LIBRARY, these are the only public symbols.
	* The Python GGUF pipeline handles metadata/IO; C handles HPC quantization.
	* ═══════════════════════════════════════════════════════════════════════════ */

	/* One-time setup of the HExState subsystems.
	 *
	 * Must be called before any quantization routine; the exported wrappers in
	 * this section call it themselves. A function-local static flag makes every
	 * call after the first a no-op. */
	void hexstate_init(void)
	{
	    static int already_done = 0;

	    if (already_done)
	        return;

	    srand(42); /* Deterministic for reproducibility */
	    triality_exotic_init();
	    s6_exotic_init();
	    triality_stats_reset();

	    /* Flag is raised only after all subsystems have been set up. */
	    already_done = 1;
	}

	/* Quantize a single tensor's F32 data to Q2_K using HPC optimization.
	 *
	 * Thin exported wrapper: ensures the HExState subsystems are initialized and
	 * forwards to quantize_tensor_q2k_hpc() with no importance matrix.
	 *
	 * Parameters:
	 *   weights:    input F32 data (must be padded to a multiple of 256)
	 *   n_elements: number of elements (must be a multiple of 256)
	 *   output:     caller-allocated output buffer, reinterpreted as BlockQ2K[]
	 *               (must hold n_elements/256 blocks; documented as 84 bytes each)
	 *   out_error:  pointer to receive total MSE (documented as "can be NULL";
	 *               it is passed through unchanged — NOTE(review): confirm that
	 *               quantize_tensor_q2k_hpc tolerates NULL)
	 *   opt_mode:   0=HPC, 1=MSE, 2=Hybrid (recommended); cast to OptimizerMode
	 *   verbose:    1 for per-block diagnostics
	 *
	 * output and out_error are pointers: the previous declaration had lost the
	 * '*' declarators, which made the signature invalid C.
	 */
	void hexstate_quantize_tensor_q2k(const float *weights, int64_t n_elements,
	                                  void *output, float *out_error,
	                                  int opt_mode, int verbose)
	{
	    hexstate_init();
	    quantize_tensor_q2k_hpc(weights, n_elements,
	                            (BlockQ2K *)output, out_error,
	                            (OptimizerMode)opt_mode, NULL, verbose);
	}

	/* Quantize a single tensor's F32 data to Q2_K, weighted by an importance
	 * matrix.
	 *
	 * Thin exported wrapper: ensures the HExState subsystems are initialized and
	 * forwards every argument to quantize_tensor_q2k_hpc().
	 *
	 * Parameters:
	 *   weights:         input F32 data
	 *   n_elements:      number of elements
	 *   output:          caller-allocated output buffer, reinterpreted as
	 *                    BlockQ2K[]
	 *   out_error:       pointer to receive the error figure reported by
	 *                    quantize_tensor_q2k_hpc (passed through unchanged)
	 *   opt_mode:        optimizer selector, cast to OptimizerMode
	 *   imat_importance: importance weights forwarded to the quantizer
	 *                    (NOTE(review): layout presumably matches `weights`
	 *                    element-for-element — confirm against the callee)
	 *   verbose:         non-zero for per-block diagnostics
	 *
	 * output and out_error are pointers: the previous declaration had lost the
	 * '*' declarators, which made the signature invalid C.
	 */
	void hexstate_quantize_tensor_q2k_imat(const float *weights, int64_t n_elements,
	                                       void *output, float *out_error,
	                                       int opt_mode,
	                                       const float *imat_importance,
	                                       int verbose)
	{
	    hexstate_init();
	    quantize_tensor_q2k_hpc(weights, n_elements,
	                            (BlockQ2K *)output, out_error,
	                            (OptimizerMode)opt_mode, imat_importance, verbose);
	}

	/* Size in bytes of one Q2_K block, i.e. sizeof(BlockQ2K).
	 * (The notes in this file give this as 84 bytes per 256 elements.) */
	int hexstate_q2k_block_bytes(void)
	{
	    return (int)sizeof(BlockQ2K);
	}
	/* Number of elements covered by one Q2_K block (the QK_K constant). */
	int hexstate_q2k_block_elements(void)
	{
	    return QK_K;
	}

	/* HPC-optimized Q4_0 quantization for attention tensors.
	 * Called from Python requantizer via ctypes.
	 *
	 * Ensures the HExState subsystems are initialized, runs
	 * quantize_tensor_q4_0_hpc() with a local error accumulator, and copies the
	 * result to *out_error only when the caller supplied a non-NULL pointer.
	 *
	 * Parameters:
	 *   weights:         input F32 weights
	 *   n_elements:      number of elements (must be a multiple of 32)
	 *   output:          caller-allocated output buffer, reinterpreted as
	 *                    BlockQ4_0[] (must be n_elements/32 * 18 bytes)
	 *   out_error:       pointer to receive total MSE (can be NULL)
	 *   imat_importance: optional per-element importance weights
	 *   verbose:         1 for per-block diagnostics
	 *
	 * output and out_error are pointers: the previous declaration had lost the
	 * '*' declarators, so the cast and the `*out_error` store could not compile.
	 */
	void hexstate_quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
	                                       void *output, float *out_error,
	                                       const float *imat_importance,
	                                       int verbose)
	{
	    hexstate_init();

	    /* The callee always gets a valid pointer, so a NULL out_error is safe. */
	    float err = 0.0f;
	    quantize_tensor_q4_0_hpc(weights, n_elements,
	                             (BlockQ4_0 *)output, &err,
	                             imat_importance, verbose);
	    if (out_error)
	        *out_error = err;
	}

	#ifndef HEXSTATE_LIBRARY
	/* ═══════════════════════════════════════════════════════════════════════════
	* MAIN
	* ═══════════════════════════════════════════════════════════════════════════ */

	/* Command-line entry point.
	 *
	 * Usage: hexstate_quantize <input> <output.gguf> [options]
	 *
	 * Steps, as implemented below:
	 *   1. Parse options (--optimizer, --imatrix, --config, --qwen, --verbose).
	 *   2. Open the input: a directory is opened with st_open_dir(); a single
	 *      file is opened with st_open() and wrapped in a one-shard STMultiFile.
	 *   3. Detect the architecture, using the --config file when one is given
	 *      and readable, otherwise <input_dir>config.json when it exists.
	 *   4. Load the tokenizer and (optionally) the importance matrix.
	 *   5. Call write_gguf() and return its result.
	 *
	 * Returns 1 on usage/option/load errors, otherwise whatever write_gguf()
	 * returns.
	 */
	int main(int argc, char **argv)
	{
	    srand(time(NULL));

	    /* Initialize HExState subsystems */
	    triality_exotic_init();
	    s6_exotic_init();
	    triality_stats_reset();

	    printf("\n");
	    printf(" ╔════════════════════════════════════════════════════════════════╗\n");
	    printf(" ║ ║\n");
	    printf(" ║ HExState GGUF QUANTIZER v3.0 — Shor-Optimized ║\n");
	    printf(" ║ ║\n");
	    printf(" ║ Architecture: HPCGraph Sensitivity Propagation ║\n");
	    printf(" ║ Optimization: Shor's Griffiths-Niu Measurement + iMatrix ║\n");
	    printf(" ║ Output: GGUF v3 (Q2_K, 2.625 bpw) ║\n");
	    printf(" ║ ║\n");
	    printf(" ║ \"The weight and the quantized are opposite faces.\" ║\n");
	    printf(" ║ ║\n");
	    printf(" ╚════════════════════════════════════════════════════════════════╝\n\n");

	    if (argc < 3) {
	        printf(" Usage: %s <input> <output.gguf> [options]\n\n", argv[0]);
	        printf(" Input:\n");
	        printf(" Single .safetensors file, or\n");
	        printf(" Model directory with sharded .safetensors files\n\n");
	        printf(" Options:\n");
	        /* '|' needs no escaping in a C string literal ("\|" is not a valid
	         * escape sequence). */
	        printf(" --optimizer hpc|mse|hybrid Scale optimization (default: hybrid)\n");
	        printf(" --imatrix <file> Importance matrix for Q2_K quality\n");
	        printf(" --config <file> Explicit config.json for arch detection\n");
	        printf(" --qwen Force Qwen 3.5/3.6 architecture\n");
	        printf(" --verbose Per-block diagnostics\n\n");
	        return 1;
	    }

	    const char *input_path = argv[1];
	    const char *output_path = argv[2];
	    OptimizerMode opt_mode = OPT_HYBRID;
	    const char *imatrix_path = NULL;
	    const char *config_override = NULL;
	    int verbose = 0;
	    int force_qwen = 0;

	    /* Parse options */
	    for (int i = 3; i < argc; i++) {
	        if (strcmp(argv[i], "--optimizer") == 0 && i + 1 < argc) {
	            i++;
	            if (strcmp(argv[i], "hpc") == 0) opt_mode = OPT_HPC;
	            else if (strcmp(argv[i], "mse") == 0) opt_mode = OPT_MSE;
	            else if (strcmp(argv[i], "hybrid") == 0) opt_mode = OPT_HYBRID;
	            else {
	                fprintf(stderr, " ERROR: Unknown optimizer '%s'. Use hpc, mse, or hybrid.\n", argv[i]);
	                return 1;
	            }
	        } else if (strcmp(argv[i], "--imatrix") == 0 && i + 1 < argc) {
	            imatrix_path = argv[++i];
	        } else if (strcmp(argv[i], "--config") == 0 && i + 1 < argc) {
	            config_override = argv[++i];
	        } else if (strcmp(argv[i], "--qwen") == 0) {
	            force_qwen = 1;
	        } else if (strcmp(argv[i], "--verbose") == 0) {
	            verbose = 1;
	        } else {
	            /* Also reached when a value-taking option is the last argument. */
	            fprintf(stderr, " ERROR: Unknown option '%s'\n", argv[i]);
	            return 1;
	        }
	    }

	    /* Indexed by opt_mode; relies on the OPT_HPC/OPT_MSE/OPT_HYBRID order. */
	    const char *opt_names[] = {"HPC (BP only)", "MSE (grid search)", "Hybrid (HPC+MSE)"};
	    printf(" Input: %s\n", input_path);
	    printf(" Output: %s\n", output_path);
	    printf(" Quant type: Q2_K (2.625 bpw)\n");
	    printf(" Optimizer: %s\n", opt_names[opt_mode]);
	    if (imatrix_path) printf(" iMatrix: %s\n", imatrix_path);
	    if (config_override) printf(" Config: %s\n", config_override);
	    if (force_qwen) printf(" Model: Qwen 3.5/3.6 (forced via --qwen)\n");
	    printf("\n");

	    /* ── Phase 1: Load model ── */
	    printf(" Phase 1: Loading model...\n");
	    clock_t t_start = clock();

	    /* Determine if input is a file or directory */
	    struct stat st;
	    if (stat(input_path, &st) != 0) {
	        fprintf(stderr, " ERROR: Cannot access '%s'\n", input_path);
	        return 1;
	    }

	    STMultiFile *mf = NULL;
	    char input_dir[512] = "";

	    if (S_ISDIR(st.st_mode)) {
	        /* Input is a directory — open all shards */
	        mf = st_open_dir(input_path);
	        /* Leave room for a trailing '/' plus the terminator; the buffer is
	         * zero-initialized, so the copy is always NUL-terminated. */
	        strncpy(input_dir, input_path, sizeof(input_dir) - 2);
	        size_t dlen = strlen(input_dir);
	        if (dlen > 0 && input_dir[dlen - 1] != '/') {
	            input_dir[dlen] = '/';
	            input_dir[dlen + 1] = '\0';
	        }
	    } else {
	        /* Input is a single file — wrap in STMultiFile */
	        STFile *sf = st_open(input_path);
	        if (!sf) {
	            fprintf(stderr, " ERROR: Failed to open '%s'\n", input_path);
	            return 1;
	        }
	        mf = (STMultiFile *)calloc(1, sizeof(STMultiFile));
	        if (!mf) {
	            /* Previously dereferenced unchecked; the process exits here, so
	             * the open shard is reclaimed by the OS. */
	            fprintf(stderr, " ERROR: Out of memory while loading '%s'\n", input_path);
	            return 1;
	        }
	        mf->shards[0] = sf;
	        mf->n_shards = 1;
	        for (int i = 0; i < sf->n_tensors && mf->n_tensors < ST_MAX_TENSORS; i++) {
	            /* calloc zeroed the map, so the name stays NUL-terminated. */
	            strncpy(mf->tensor_map[mf->n_tensors].name,
	                    sf->tensors[i].name, ST_MAX_NAME_LEN - 1);
	            mf->tensor_map[mf->n_tensors].shard_idx = 0;
	            mf->tensor_map[mf->n_tensors].tensor_idx = i;
	            mf->n_tensors++;
	        }

	        /* Extract directory from file path */
	        strncpy(input_dir, input_path, sizeof(input_dir) - 1);
	        char *last_slash = strrchr(input_dir, '/');
	        if (last_slash) {
	            *(last_slash + 1) = '\0';
	        } else {
	            strcpy(input_dir, "./");
	        }
	    }

	    if (!mf) {
	        fprintf(stderr, " ERROR: Failed to load model from '%s'\n", input_path);
	        return 1;
	    }

	    st_multi_print_summary(mf);

	    clock_t t_load = clock();
	    printf(" Loaded in %.3f seconds\n\n",
	           (double)(t_load - t_start) / CLOCKS_PER_SEC);

	    /* ── Phase 2: Detect architecture ── */
	    printf(" Phase 2: Detecting model architecture...\n");

	    char config_path[1024];
	    snprintf(config_path, sizeof(config_path), "%sconfig.json", input_dir);
	    const char *config_ptr = NULL;

	    /* An explicit --config takes precedence. It used to be parsed and echoed
	     * but never passed on to detect_architecture(). */
	    if (config_override) {
	        FILE *check = fopen(config_override, "rb");
	        if (check) {
	            fclose(check);
	            config_ptr = config_override;
	            printf(" Using config override: %s\n", config_override);
	        } else {
	            fprintf(stderr, " WARNING: Cannot open config '%s'; "
	                            "falling back to model directory\n", config_override);
	        }
	    }

	    /* Otherwise try to read config.json from model directory */
	    if (!config_ptr) {
	        FILE *check = fopen(config_path, "rb");
	        if (check) {
	            fclose(check);
	            config_ptr = config_path;
	            printf(" Found config.json: %s\n", config_path);
	        }
	    }

	    ModelArchitecture arch;
	    detect_architecture(mf, &arch, config_ptr);

	    /* --qwen override: force Qwen 3.5/3.6 architecture parameters */
	    if (force_qwen) {
	        strcpy(arch.architecture, "qwen2");
	        strcpy(arch.name, "Qwen3.6-HExState-Q2K");
	        printf(" [--qwen] Forcing qwen2-compatible architecture\n");
	    }

	    printf(" ╔═══════════════════════════════════════════════════════════════╗\n");
	    printf(" ║ Model Architecture ║\n");
	    printf(" ╠═══════════════════════════════════════════════════════════════╣\n");
	    printf(" ║ Architecture: %-40s ║\n", arch.architecture);
	    printf(" ║ Layers: %-40u ║\n", arch.block_count);
	    printf(" ║ Hidden size: %-40u ║\n", arch.embedding_length);
	    printf(" ║ Attention heads: %-40u ║\n", arch.head_count);
	    printf(" ║ KV heads: %-40u ║\n", arch.head_count_kv);
	    printf(" ║ Vocab size: %-40u ║\n", arch.vocab_size);
	    printf(" ║ FFN size: %-40u ║\n", arch.feed_forward_length);
	    printf(" ║ Context length: %-40u ║\n", arch.context_length);
	    printf(" ║ Has bias: %-40s ║\n", arch.has_bias ? "yes" : "no");
	    printf(" ║ Tied embeddings: %-40s ║\n", arch.tie_word_embeddings ? "yes" : "no");
	    printf(" ╚═══════════════════════════════════════════════════════════════╝\n\n");

	    /* ── Phase 2b: Load tokenizer ── */
	    printf(" Phase 2b: Loading tokenizer...\n");
	    TokenizerData *tokenizer = NULL;
	    {
	        /* Sized like config_path so a long input_dir plus the file name
	         * cannot be truncated. */
	        char tok_json[1024], tok_config[1024];
	        snprintf(tok_json, sizeof(tok_json), "%stokenizer.json", input_dir);
	        snprintf(tok_config, sizeof(tok_config), "%stokenizer_config.json", input_dir);

	        tokenizer = tok_load(tok_json, tok_config);
	        if (tokenizer) {
	            tok_print_summary(tokenizer);
	        } else {
	            printf(" No tokenizer found in '%s'\n", input_dir);
	            printf(" (Output GGUF will lack tokenizer data — not inference-ready)\n\n");
	        }
	    }

	    /* ── Phase 2c: Load importance matrix (optional) ── */
	    IMatrixData *imatrix = NULL;
	    if (imatrix_path) {
	        printf(" Phase 2c: Loading importance matrix...\n");
	        imatrix = imatrix_load(imatrix_path);
	        if (imatrix) {
	            imatrix_print_summary(imatrix);
	        } else {
	            /* Best-effort: a missing imatrix is not fatal. */
	            printf(" WARNING: Failed to load imatrix from '%s'\n", imatrix_path);
	            printf(" Proceeding without importance weighting.\n\n");
	        }
	    }

	    /* ── Phase 3-5: Quantize and write GGUF ── */
	    printf(" Phase 3: HPC-Optimized Q2_K Quantization + GGUF Output...\n");

	    int result = write_gguf(output_path, mf, &arch, tokenizer,
	                            opt_mode, imatrix, verbose);

	    /* clock() differences: reported time is measured from the start of
	     * Phase 1. */
	    clock_t t_end = clock();
	    printf(" Total time: %.3f seconds\n\n",
	           (double)(t_end - t_start) / CLOCKS_PER_SEC);

	    if (imatrix) imatrix_free(imatrix);
	    if (tokenizer) tok_free(tokenizer);
	    st_multi_close(mf);
	    return result;
	}
	#endif /* HEXSTATE_LIBRARY */