/* ═══════════════════════════════════════════════════════════════════════════ * hexstate_quantize.c — HExState GGUF Quantizer * * ╔═══════════════════════════════════════════════════════════════╗ * ║ HPC-Optimized GGUF Quantization Engine ║ * ║ ║ * ║ Architecture: HPCGraph Sensitivity Propagation ║ * ║ Optimization: Complex Amplitude BP + MCMC Scale Search ║ * ║ Enhancements: MSE Grid Search, Importance Matrix Weighting ║ * ║ Output: GGUF v3 (Q2_K) ║ * ║ ║ * ║ "The weight and the quantized are opposite faces." ║ * ╚═══════════════════════════════════════════════════════════════╝ * * This tool adapts the HExState HPC Ouroboros factoring engine for * LLM weight quantization. The core mathematical machinery is reused: * * Factoring Domain → Quantization Domain * ───────────────────────────────────────────────── * HPCGraph + CZ edges → Block sensitivity graph * Complex Amplitude BP → Importance propagation * MCMC period sampler → Optimal scale search * try_period() validation → Error bound checking * LLL lattice reduction → (future) Adaptive bit allocation * * Additional techniques ported from llm-compressor: * MSE grid search → Optimal min/max range shrinking * Importance matrix (imatrix) → Per-channel error weighting * * Build: * make -f Makefile.quantize * * Usage: * ./hexstate_quantize [options] * * Input can be: * - A single .safetensors file * - A model directory containing sharded .safetensors files * * Options: * --optimizer hpc|mse|hybrid Scale optimization strategy (default: hybrid) * --imatrix Importance matrix for weighted quantization * --verbose Per-block diagnostics * ═══════════════════════════════════════════════════════════════════════════ */ #include #ifdef _OPENMP #include #endif #include #include #include #include #include #include /* HExState headers — reused from the factoring engine */ #include "quhit_triality.h" #include "hpc_graph.h" #include "hpc_mobius.h" #include "s6_exotic.h" /* Quantization-specific headers */ #include "gguf_format.h" #include "safetensors_reader.h" #include "tokenizer_reader.h" #include "imatrix_reader.h" #define D 6 /* Preserved from HExState — the triality dimension */ /* ═══════════════════════════════════════════════════════════════════════════ * OPTIMIZER MODE * ═══════════════════════════════════════════════════════════════════════════ */ typedef enum { OPT_HPC, /* HExState BP only */ OPT_MSE, /* MSE grid search only */ OPT_HYBRID /* HPC sensitivity + MSE */ } OptimizerMode; /* ═══════════════════════════════════════════════════════════════════════════ * MODEL ARCHITECTURE AUTO-DETECTION * * Infers model architecture metadata from tensor names and shapes. * Supports: LLaMA, Mistral, Qwen2, Phi-3, Gemma, GPT-NeoX, Falcon, DeepSeek * ═══════════════════════════════════════════════════════════════════════════ */ typedef struct { char architecture[64]; /* "llama", "phi3", "gemma", etc. */ char name[256]; /* Human-readable model name */ uint32_t block_count; /* Number of transformer layers */ uint32_t embedding_length; /* Hidden dimension */ uint32_t head_count; /* Number of attention heads */ uint32_t head_count_kv; /* Number of KV heads (GQA) */ uint32_t vocab_size; /* Vocabulary size */ uint32_t context_length; /* Max context length (default) */ float rope_freq_base; /* RoPE frequency base */ uint32_t feed_forward_length; /* FFN intermediate size */ float rms_norm_eps; /* RMS norm epsilon */ int has_bias; /* Whether attention has biases */ int tie_word_embeddings; /* Whether output = embed_tokens */ } ModelArchitecture; /* Count tensor names matching a pattern prefix */ static int count_tensors_with_prefix(const STMultiFile *mf, const char *prefix) { int count = 0; int prefix_len = strlen(prefix); for (int i = 0; i < mf->n_tensors; i++) { if (strncmp(mf->tensor_map[i].name, prefix, prefix_len) == 0) count++; } return count; } /* Find max layer index from tensor names like "model.layers.N.xxx" */ static int find_max_layer_index(const STMultiFile *mf, const char *layer_prefix) { int max_idx = -1; int prefix_len = strlen(layer_prefix); for (int i = 0; i < mf->n_tensors; i++) { if (strncmp(mf->tensor_map[i].name, layer_prefix, prefix_len) == 0) { int idx = atoi(mf->tensor_map[i].name + prefix_len); if (idx > max_idx) max_idx = idx; } } return max_idx; } /* ── Config.json reader for definitive architecture parameters ── */ typedef struct { int valid; uint32_t hidden_size; uint32_t intermediate_size; uint32_t num_attention_heads; uint32_t num_key_value_heads; uint32_t num_hidden_layers; uint32_t vocab_size; uint32_t max_position_embeddings; float rope_theta; float rms_norm_eps; char model_type[64]; int tie_word_embeddings; } ConfigJson; static ConfigJson parse_config_json(const char *path) { ConfigJson cfg; memset(&cfg, 0, sizeof(cfg)); FILE *f = fopen(path, "rb"); if (!f) return cfg; fseek(f, 0, SEEK_END); long size = ftell(f); fseek(f, 0, SEEK_SET); char *json = (char *)malloc(size + 1); if (!json) { fclose(f); return cfg; } fread(json, 1, size, f); json[size] = '\0'; fclose(f); cfg.valid = 1; /* Simple key-value extraction */ const char *p; p = tok_find_key(json, "hidden_size"); if (p) cfg.hidden_size = (uint32_t)strtol(p, NULL, 10); p = tok_find_key(json, "intermediate_size"); if (p) cfg.intermediate_size = (uint32_t)strtol(p, NULL, 10); p = tok_find_key(json, "num_attention_heads"); if (p) cfg.num_attention_heads = (uint32_t)strtol(p, NULL, 10); p = tok_find_key(json, "num_key_value_heads"); if (p) cfg.num_key_value_heads = (uint32_t)strtol(p, NULL, 10); p = tok_find_key(json, "num_hidden_layers"); if (p) cfg.num_hidden_layers = (uint32_t)strtol(p, NULL, 10); p = tok_find_key(json, "vocab_size"); if (p) cfg.vocab_size = (uint32_t)strtol(p, NULL, 10); p = tok_find_key(json, "max_position_embeddings"); if (p) cfg.max_position_embeddings = (uint32_t)strtol(p, NULL, 10); p = tok_find_key(json, "rope_theta"); if (p) cfg.rope_theta = (float)strtod(p, NULL); p = tok_find_key(json, "rms_norm_eps"); if (p) cfg.rms_norm_eps = (float)strtod(p, NULL); p = tok_find_key(json, "model_type"); if (p && *p == '"') { char buf[64]; tok_extract_string(p, buf, sizeof(buf)); strncpy(cfg.model_type, buf, sizeof(cfg.model_type) - 1); } p = tok_find_key(json, "tie_word_embeddings"); if (p) cfg.tie_word_embeddings = (strncmp(p, "true", 4) == 0); /* ── Qwen 3.5/3.6: parameters are nested inside "text_config" ── */ if (cfg.hidden_size == 0) { const char *tc = strstr(json, "\"text_config\""); if (tc) { const char *tc_brace = strchr(tc, '{'); if (tc_brace) { p = tok_find_key(tc_brace, "hidden_size"); if (p) cfg.hidden_size = (uint32_t)strtol(p, NULL, 10); p = tok_find_key(tc_brace, "intermediate_size"); if (p) cfg.intermediate_size = (uint32_t)strtol(p, NULL, 10); p = tok_find_key(tc_brace, "num_attention_heads"); if (p) cfg.num_attention_heads = (uint32_t)strtol(p, NULL, 10); p = tok_find_key(tc_brace, "num_key_value_heads"); if (p) cfg.num_key_value_heads = (uint32_t)strtol(p, NULL, 10); p = tok_find_key(tc_brace, "num_hidden_layers"); if (p) cfg.num_hidden_layers = (uint32_t)strtol(p, NULL, 10); p = tok_find_key(tc_brace, "vocab_size"); if (p) cfg.vocab_size = (uint32_t)strtol(p, NULL, 10); p = tok_find_key(tc_brace, "max_position_embeddings"); if (p) cfg.max_position_embeddings = (uint32_t)strtol(p, NULL, 10); p = tok_find_key(tc_brace, "rms_norm_eps"); if (p) cfg.rms_norm_eps = (float)strtod(p, NULL); p = tok_find_key(tc_brace, "model_type"); if (p && *p == '"') { char buf2[64]; tok_extract_string(p, buf2, sizeof(buf2)); strncpy(cfg.model_type, buf2, sizeof(cfg.model_type) - 1); } p = tok_find_key(tc_brace, "tie_word_embeddings"); if (p) cfg.tie_word_embeddings = (strncmp(p, "true", 4) == 0); /* Qwen3.6 rope_theta is nested in rope_parameters */ const char *rp = strstr(tc_brace, "\"rope_parameters\""); if (rp) { p = tok_find_key(rp, "rope_theta"); if (p) cfg.rope_theta = (float)strtod(p, NULL); } } } } free(json); return cfg; } static void detect_architecture(const STMultiFile *mf, ModelArchitecture *arch, const char *config_json_path) { memset(arch, 0, sizeof(*arch)); /* Default values */ strcpy(arch->architecture, "llama"); strcpy(arch->name, "HExState-quantized"); arch->context_length = 4096; arch->rope_freq_base = 10000.0f; arch->rms_norm_eps = 1e-5f; /* ── Try config.json for definitive parameters ── */ ConfigJson cfg = {0}; if (config_json_path) { cfg = parse_config_json(config_json_path); } if (cfg.valid) { /* Map model_type to GGUF architecture name */ if (strcmp(cfg.model_type, "llama") == 0 || strcmp(cfg.model_type, "mistral") == 0) { strcpy(arch->architecture, "llama"); } else if (strcmp(cfg.model_type, "qwen2") == 0) { strcpy(arch->architecture, "qwen2"); } else if (strcmp(cfg.model_type, "qwen2_moe") == 0) { strcpy(arch->architecture, "qwen2moe"); } else if (strcmp(cfg.model_type, "qwen3_5") == 0 || strcmp(cfg.model_type, "qwen3_5_text") == 0 || strcmp(cfg.model_type, "qwen3_5_moe") == 0) { strcpy(arch->architecture, "qwen2"); /* GGUF arch: qwen2 compat */ } else if (strcmp(cfg.model_type, "phi3") == 0 || strcmp(cfg.model_type, "phi") == 0) { strcpy(arch->architecture, "phi3"); } else if (strcmp(cfg.model_type, "gemma") == 0 || strcmp(cfg.model_type, "gemma2") == 0) { strcpy(arch->architecture, "gemma"); } else if (strcmp(cfg.model_type, "deepseek_v2") == 0) { strcpy(arch->architecture, "llama"); } else if (strcmp(cfg.model_type, "gpt_neox") == 0) { strcpy(arch->architecture, "gpt_neox"); } else if (strcmp(cfg.model_type, "falcon") == 0) { strcpy(arch->architecture, "falcon"); } else if (cfg.model_type[0]) { /* Unknown — try llama as fallback */ strcpy(arch->architecture, "llama"); } if (cfg.hidden_size) arch->embedding_length = cfg.hidden_size; if (cfg.intermediate_size) arch->feed_forward_length = cfg.intermediate_size; if (cfg.num_attention_heads) arch->head_count = cfg.num_attention_heads; if (cfg.num_key_value_heads) arch->head_count_kv = cfg.num_key_value_heads; if (cfg.num_hidden_layers) arch->block_count = cfg.num_hidden_layers; if (cfg.vocab_size) arch->vocab_size = cfg.vocab_size; if (cfg.max_position_embeddings) arch->context_length = cfg.max_position_embeddings; if (cfg.rope_theta > 0) arch->rope_freq_base = cfg.rope_theta; if (cfg.rms_norm_eps > 0) arch->rms_norm_eps = cfg.rms_norm_eps; arch->tie_word_embeddings = cfg.tie_word_embeddings; printf(" Architecture determined from config.json: %s\n", cfg.model_type); } /* ── Fall back to tensor name pattern detection ── */ int has_model_layers = count_tensors_with_prefix(mf, "model.layers."); int has_gpt_neox = count_tensors_with_prefix(mf, "gpt_neox."); int has_transformer = count_tensors_with_prefix(mf, "transformer."); /* Architecture-specific detection */ int has_qkv_proj = count_tensors_with_prefix(mf, "model.layers.0.self_attn.qkv_proj"); int has_kv_a_proj = count_tensors_with_prefix(mf, "model.layers.0.self_attn.kv_a_proj_with_mqa"); int has_final_norm = (st_multi_find_tensor(mf, "model.final_norm.weight") >= 0); if (has_qkv_proj > 0 && !cfg.valid) { strcpy(arch->architecture, "phi3"); } else if (has_kv_a_proj > 0 && !cfg.valid) { strcpy(arch->architecture, "llama"); /* DeepSeek uses llama arch */ } else if (has_final_norm && !cfg.valid) { strcpy(arch->architecture, "gemma"); } if (has_model_layers > 0 && arch->block_count == 0) { arch->block_count = find_max_layer_index(mf, "model.layers.") + 1; } /* Infer dimensions from tensor shapes if not from config.json */ if (arch->embedding_length == 0 || arch->head_count == 0) { int qproj_idx = st_multi_find_tensor(mf, "model.layers.0.self_attn.q_proj.weight"); int kproj_idx = st_multi_find_tensor(mf, "model.layers.0.self_attn.k_proj.weight"); if (qproj_idx >= 0) { const STTensorInfo *ti = st_multi_tensor_info(mf, qproj_idx); int64_t q_out = ti->shape[0]; int64_t hidden = ti->shape[1]; if (arch->embedding_length == 0) arch->embedding_length = hidden; /* Try common head dimensions: 128, 64, 96 */ int head_dim = 128; if (q_out % 128 == 0) head_dim = 128; else if (q_out % 96 == 0) head_dim = 96; else if (q_out % 64 == 0) head_dim = 64; if (arch->head_count == 0) arch->head_count = q_out / head_dim; if (kproj_idx >= 0 && arch->head_count_kv == 0) { const STTensorInfo *kt = st_multi_tensor_info(mf, kproj_idx); arch->head_count_kv = kt->shape[0] / head_dim; } } } if (arch->vocab_size == 0) { int embed_idx = st_multi_find_tensor(mf, "model.embed_tokens.weight"); if (embed_idx >= 0) { const STTensorInfo *ti = st_multi_tensor_info(mf, embed_idx); arch->vocab_size = ti->shape[0]; } } if (arch->feed_forward_length == 0) { int gate_idx = st_multi_find_tensor(mf, "model.layers.0.mlp.gate_proj.weight"); if (gate_idx >= 0) { const STTensorInfo *ti = st_multi_tensor_info(mf, gate_idx); arch->feed_forward_length = ti->shape[0]; } else { int up_idx = st_multi_find_tensor(mf, "model.layers.0.mlp.up_proj.weight"); if (up_idx >= 0) { const STTensorInfo *ti = st_multi_tensor_info(mf, up_idx); arch->feed_forward_length = ti->shape[0]; } } } /* Check for attention bias */ arch->has_bias = (st_multi_find_tensor(mf, "model.layers.0.self_attn.q_proj.bias") >= 0); if (has_gpt_neox > 0 && arch->block_count == 0) { strcpy(arch->architecture, "gpt_neox"); arch->block_count = find_max_layer_index(mf, "gpt_neox.layers.") + 1; } if (has_transformer > 0 && arch->block_count == 0) { strcpy(arch->architecture, "falcon"); arch->block_count = find_max_layer_index(mf, "transformer.h.") + 1; } /* Fill in defaults for anything we couldn't detect */ if (arch->head_count == 0) arch->head_count = 32; if (arch->head_count_kv == 0) arch->head_count_kv = arch->head_count; if (arch->embedding_length == 0) arch->embedding_length = 4096; if (arch->vocab_size == 0) arch->vocab_size = 32000; if (arch->feed_forward_length == 0) arch->feed_forward_length = (arch->embedding_length * 8) / 3; /* SwiGLU default */ } /* ═══════════════════════════════════════════════════════════════════════════ * TENSOR NAME MAPPING: HuggingFace → GGUF Standard * * Maps SafeTensors tensor names to the standardized GGUF naming * convention used by llama.cpp for model loading. * * Enhanced with mappings for Phi-3, Gemma, DeepSeek, MoE, and bias tensors. * ═══════════════════════════════════════════════════════════════════════════ */ /* Returns 1 if this tensor should be skipped (not written to GGUF) */ static int should_skip_tensor(const char *hf_name) { /* Rotary embeddings are computed at runtime, not stored */ if (strstr(hf_name, "rotary_emb.inv_freq") != NULL) return 1; if (strstr(hf_name, "rotary_emb.cos_cached") != NULL) return 1; if (strstr(hf_name, "rotary_emb.sin_cached") != NULL) return 1; /* Qwen 3.6 vision encoder — skip all visual.* tensors */ if (strncmp(hf_name, "model.visual.", 13) == 0) return 1; if (strncmp(hf_name, "visual.", 7) == 0) return 1; /* MTP (multi-token prediction) layers — not needed for inference */ if (strstr(hf_name, "model.language_model.mtp_") != NULL) return 1; return 0; } static void map_tensor_name(const char *hf_name, char *gguf_name, int buflen) { /* Start with identity mapping */ strncpy(gguf_name, hf_name, buflen - 1); gguf_name[buflen - 1] = '\0'; /* Top-level mappings (common to all architectures) */ struct { const char *from; const char *to; } mappings[] = { {"model.embed_tokens.weight", "token_embd.weight"}, {"model.language_model.embed_tokens.weight","token_embd.weight"}, /* Qwen 3.6 */ {"model.norm.weight", "output_norm.weight"}, {"model.language_model.norm.weight", "output_norm.weight"}, /* Qwen 3.6 */ {"model.final_norm.weight", "output_norm.weight"}, /* Gemma */ {"lm_head.weight", "output.weight"}, {"model.embed_tokens.bias", "token_embd.bias"}, {"model.norm.bias", "output_norm.bias"}, {NULL, NULL} }; for (int m = 0; mappings[m].from; m++) { if (strcmp(hf_name, mappings[m].from) == 0) { strncpy(gguf_name, mappings[m].to, buflen - 1); return; } } /* Layer mappings: "model.layers.N.xxx" or "model.language_model.layers.N.xxx" → "blk.N.xxx" */ const char *layer_prefix = NULL; if (strncmp(hf_name, "model.layers.", 13) == 0) layer_prefix = hf_name + 13; else if (strncmp(hf_name, "model.language_model.layers.", 27) == 0) layer_prefix = hf_name + 27; if (layer_prefix) { int layer_idx; char rest[ST_MAX_NAME_LEN]; if (sscanf(layer_prefix, "%d.%255s", &layer_idx, rest) == 2) { /* Map sublayer names */ struct { const char *from; const char *to; } layer_maps[] = { /* Standard attention projections */ {"self_attn.q_proj.weight", "attn_q.weight"}, {"self_attn.k_proj.weight", "attn_k.weight"}, {"self_attn.v_proj.weight", "attn_v.weight"}, {"self_attn.o_proj.weight", "attn_output.weight"}, /* Attention biases */ {"self_attn.q_proj.bias", "attn_q.bias"}, {"self_attn.k_proj.bias", "attn_k.bias"}, {"self_attn.v_proj.bias", "attn_v.bias"}, {"self_attn.o_proj.bias", "attn_output.bias"}, /* Phi-3 fused QKV */ {"self_attn.qkv_proj.weight", "attn_qkv.weight"}, {"self_attn.qkv_proj.bias", "attn_qkv.bias"}, /* DeepSeek MLA */ {"self_attn.kv_a_proj_with_mqa.weight", "attn_kv_a_mqa.weight"}, {"self_attn.kv_b_proj.weight", "attn_kv_b.weight"}, /* Standard FFN (SwiGLU) */ {"mlp.gate_proj.weight", "ffn_gate.weight"}, {"mlp.up_proj.weight", "ffn_up.weight"}, {"mlp.down_proj.weight", "ffn_down.weight"}, /* FFN biases */ {"mlp.gate_proj.bias", "ffn_gate.bias"}, {"mlp.up_proj.bias", "ffn_up.bias"}, {"mlp.down_proj.bias", "ffn_down.bias"}, /* MoE gate */ {"mlp.gate.weight", "ffn_gate_inp.weight"}, /* MoE expert weights */ {"mlp.experts.gate_proj.weight", "ffn_gate_exps.weight"}, {"mlp.experts.up_proj.weight", "ffn_up_exps.weight"}, {"mlp.experts.down_proj.weight", "ffn_down_exps.weight"}, /* Norm layers */ {"input_layernorm.weight", "attn_norm.weight"}, {"post_attention_layernorm.weight", "ffn_norm.weight"}, {"input_layernorm.bias", "attn_norm.bias"}, {"post_attention_layernorm.bias", "ffn_norm.bias"}, /* Gemma pre/post feedforward norm */ {"pre_feedforward_layernorm.weight", "ffn_norm.weight"}, {"post_feedforward_layernorm.weight", "ffn_post_norm.weight"}, /* Qwen 3.6 full attention QK norms */ {"self_attn.q_norm.weight", "attn_q_norm.weight"}, {"self_attn.k_norm.weight", "attn_k_norm.weight"}, /* Qwen 3.6 DeltaNet (Gated Linear Attention) */ {"linear_attn.in_proj_qkv.weight", "ssm_in_qkv.weight"}, {"linear_attn.in_proj_z.weight", "ssm_in_z.weight"}, {"linear_attn.in_proj_a.weight", "ssm_in_a.weight"}, {"linear_attn.in_proj_b.weight", "ssm_in_b.weight"}, {"linear_attn.out_proj.weight", "ssm_out.weight"}, {"linear_attn.conv1d.weight", "ssm_conv1d.weight"}, {"linear_attn.norm.weight", "ssm_norm.weight"}, {"linear_attn.A_log", "ssm_a"}, {"linear_attn.dt_bias", "ssm_dt.bias"}, {NULL, NULL} }; for (int m = 0; layer_maps[m].from; m++) { if (strcmp(rest, layer_maps[m].from) == 0) { snprintf(gguf_name, buflen, "blk.%d.%s", layer_idx, layer_maps[m].to); return; } } /* MoE expert layer mapping: model.layers.N.mlp.experts.E.xxx */ int expert_idx; char expert_rest[ST_MAX_NAME_LEN]; if (sscanf(rest, "mlp.experts.%d.%255s", &expert_idx, expert_rest) == 2) { struct { const char *from; const char *to; } expert_maps[] = { {"gate_proj.weight", "ffn_gate_exp.weight"}, {"up_proj.weight", "ffn_up_exp.weight"}, {"down_proj.weight", "ffn_down_exp.weight"}, {NULL, NULL} }; for (int m = 0; expert_maps[m].from; m++) { if (strcmp(expert_rest, expert_maps[m].from) == 0) { snprintf(gguf_name, buflen, "blk.%d.%s.%d", layer_idx, expert_maps[m].to, expert_idx); return; } } } /* Fallback: keep original sub-path */ snprintf(gguf_name, buflen, "blk.%d.%s", layer_idx, rest); } } } /* ═══════════════════════════════════════════════════════════════════════════ * SHOULD THIS TENSOR BE QUANTIZED? * * Decision rules: * - Quantize: weight matrices (2D, large) * - Keep F32: norms, biases, embeddings, 1D tensors * ═══════════════════════════════════════════════════════════════════════════ */ static int should_quantize(const STTensorInfo *ti, const char *gguf_name) { /* Never quantize 1D tensors (norms, biases) */ if (ti->n_dims < 2) return 0; /* Never quantize embedding tables (row dimension = vocab) */ if (strstr(gguf_name, "token_embd") != NULL) return 0; /* Never quantize LM head output — use exact match, not substring, * to avoid matching "attn_output.weight" */ if (strcmp(gguf_name, "output.weight") == 0) return 0; /* Never quantize norm weights */ if (strstr(gguf_name, "norm") != NULL) return 0; /* Never quantize bias tensors */ if (strstr(gguf_name, ".bias") != NULL) return 0; /* Never quantize MoE gate routing weights */ if (strstr(gguf_name, "ffn_gate_inp") != NULL) return 0; /* Never quantize DeltaNet state-space parameters (1D or small) */ if (strstr(gguf_name, "ssm_a") != NULL) return 0; /* A_log */ if (strstr(gguf_name, "ssm_dt") != NULL) return 0; /* dt_bias */ if (strstr(gguf_name, "ssm_conv1d") != NULL) return 0; /* conv kernel */ /* Quantize everything else (attention projections, FFN weights, SSM projections) */ return 1; } /* Detect attention Q/K/V/O projection tensors. * These are the most sensitive to quantization — errors in attention scores * cascade through the entire sequence, causing self-correction loops. * Promoting these to Q4_0 (~4.5bpw) doubles their precision. */ static int is_attention_tensor(const char *gguf_name) { /* Gemma / LLaMA style GGUF names: blk.N.attn_q/k/v/output.weight */ if (strstr(gguf_name, "attn_q.weight") != NULL) return 1; if (strstr(gguf_name, "attn_k.weight") != NULL) return 1; if (strstr(gguf_name, "attn_v.weight") != NULL) return 1; if (strstr(gguf_name, "attn_output.weight") != NULL) return 1; if (strstr(gguf_name, "attn_qkv.weight") != NULL) return 1; /* Qwen 3.6 DeltaNet SSM projections — treat as attention-class (Q4_0) */ if (strstr(gguf_name, "ssm_in_qkv.weight") != NULL) return 1; if (strstr(gguf_name, "ssm_in_z.weight") != NULL) return 1; if (strstr(gguf_name, "ssm_out.weight") != NULL) return 1; /* HuggingFace style (fallthrough names) */ if (strstr(gguf_name, "self_attn.q_proj.weight") != NULL) return 1; if (strstr(gguf_name, "self_attn.k_proj.weight") != NULL) return 1; if (strstr(gguf_name, "self_attn.v_proj.weight") != NULL) return 1; if (strstr(gguf_name, "self_attn.o_proj.weight") != NULL) return 1; return 0; } /* ═══════════════════════════════════════════════════════════════════════════ * HPC SENSITIVITY GRAPH BUILDER * * Creates an HPCGraph where each node represents a weight block. * For Q2_K: 256-weight superblocks. * * The 6 values per site correspond to 6 candidate scale factors: * v=0: scale * 0.85 (aggressive, high compression) * v=1: scale * 0.90 * v=2: scale * 0.95 * v=3: scale * 1.00 (standard) * v=4: scale * 1.05 * v=5: scale * 1.10 (conservative, less compression error) * * BP propagates: "if your neighbor block is sensitive, you should be * conservative too" — creating coherent precision allocation. * ═══════════════════════════════════════════════════════════════════════════ */ #define SCALE_FACTOR_COUNT 6 static const float SCALE_MULTIPLIERS[SCALE_FACTOR_COUNT] = { 0.60f, 0.75f, 0.90f, 1.00f, 1.15f, 1.40f }; /* ── Multi-quhit expanded scale table ── * Search grid: 10×10 = 100 (d, dmin) candidates * Quhit encoding: bin 10 → 6 for D=6 quhits (BP operates on 6-state marginals) * Beam search: operates on all 100 candidates directly */ #define QUHITS_PER_BLOCK 2 #define N_CAND_D 16 /* d multiplier candidates (was 10) */ #define N_CAND_M 16 /* dmin multiplier candidates (was 10) */ #define TOTAL_SCALE_CANDIDATES (N_CAND_D * N_CAND_M) static float SCALE_TABLE[TOTAL_SCALE_CANDIDATES]; static int scale_table_initialized = 0; static void init_scale_table(void) { if (scale_table_initialized) return; /* 100 candidates: uniform spacing centered on 1.0 */ for (int i = 0; i < TOTAL_SCALE_CANDIDATES; i++) { SCALE_TABLE[i] = 0.50f + (float)i * (1.00f / (float)(TOTAL_SCALE_CANDIDATES - 1)); } scale_table_initialized = 1; } /* ═══════════════════════════════════════════════════════════════════════════ * THREAD-LOCAL HPCGRAPH REUSE — Eliminates 776K malloc/free cycles * * The sub-block Shor measurement uses a 16-node linear-chain graph that * is identical in topology every time. Instead of hpc_create()/hpc_destroy() * inside the OMP hot loop, we reset the same graph to a clean state. * * This function resets an existing HPCGraph with n_sites nodes to its * initial state: clears all edges, resets adjacency lists, reinitializes * locals. Zero allocations. * ═══════════════════════════════════════════════════════════════════════════ */ static void hpc_reset_for_subblock(HPCGraph *g, uint64_t n_sites) { /* Reset edge state */ g->n_edges = 0; g->cz_edges = 0; g->phase_edges = 0; g->syntheme_edges = 0; g->n_log = 0; g->min_fidelity = 1.0; g->avg_fidelity = 1.0; g->amp_evals = 0; g->prob_evals = 0; g->measurements = 0; /* Reset adjacency lists (just zero the counts, keep allocated buffers) */ for (uint64_t i = 0; i < n_sites; i++) { g->adj[i].count = 0; } /* Reinitialize local quhit states */ for (uint64_t i = 0; i < n_sites; i++) triality_init(&g->locals[i]); } /* ═══════════════════════════════════════════════════════════════════════════ * FAST POWER APPROXIMATION — Replaces powf(x, 2.4f) in MSE grid search * * powf() costs ~50-100 cycles. For norm=2.4: x^2.4 = x^2 × x^0.4 * where x^0.4 = (x^2)^0.2 = (x^2)^(1/5). Use cbrtf approximation: * x^0.4 ≈ sqrtf(cbrtf(x^2 × x^2)) but simpler: x^2 × sqrtf(sqrtf(x)) * is close enough for error norm purposes (~1% relative error). * ═══════════════════════════════════════════════════════════════════════════ */ static inline float fast_pow_2_4(float x) { /* x^2.4 = x^2 × x^0.4. For x^0.4: use x^(2/5) = sqrt(x^(4/5)) * x^(4/5) = (x^4)^(1/5). Approximation via sqrtf chain: * x^0.4 ≈ sqrtf(sqrtf(x)) × x^(-0.1) — too complex. * Simpler: x^2.4 = (x^12)^(1/5) = fifth_root(x^12) * Best: just use x*x * sqrtf(cbrtf(x*x)) since cbrtf is fast (~15 cycles) */ float x2 = x * x; return x2 * sqrtf(cbrtf(x2)); /* x^2 × (x^2)^(1/6) ≈ x^(2+1/3) ≈ x^2.333 */ } /* Compute the Q2_K sub-block reconstruction error for a block at a given * scale multiplier, optionally weighted by importance vector */ static float compute_block_error_q2k(const float *weights, int block_size, float scale_mult, const float *importance, int imp_offset) { float min_val = weights[0]; float max_val = weights[0]; for (int j = 1; j < block_size; j++) { if (weights[j] < min_val) min_val = weights[j]; if (weights[j] > max_val) max_val = weights[j]; } if (min_val > 0) min_val = 0; float range = (max_val - min_val) * scale_mult; if (range < 1e-15f) return 0.0f; float inv_range = 3.0f / range; float err = 0.0f; for (int j = 0; j < block_size; j++) { float x = weights[j]; int q = (int)((x - min_val * scale_mult) * inv_range + 0.5f); if (q < 0) q = 0; if (q > 3) q = 3; float deq = min_val * scale_mult + (float)q * range / 3.0f; float diff = x - deq; float w = (importance) ? importance[imp_offset + j] : 1.0f; err += diff * diff * w; } return err; } /* Build multi-quhit HPC sensitivity graph. * 2 quhits per block → 36 scale candidates per block. * * Graph layout: sites [0..2*n-1] where: * site 2*i = coarse quhit for block i * site 2*i + 1 = fine quhit for block i * * Edges: * Intra-block: CZ(2i, 2i+1) — coarse↔fine coupling * Inter-block: CZ(2i, 2(i+1)) — coarse↔coarse neighbor * CZ(2i+1, 2(i+1)+1) — fine↔fine neighbor */ static HPCGraph *build_sensitivity_graph(const float *weights, int64_t n_elements, int block_size, float temperature, const float *importance) { int64_t n_blocks = n_elements / block_size; if (n_blocks < 2) return NULL; init_scale_table(); int64_t graph_blocks = (n_blocks > 8192) ? 8192 : n_blocks; int64_t stride = n_blocks / graph_blocks; int64_t n_sites = graph_blocks * QUHITS_PER_BLOCK; HPCGraph *graph = hpc_create(n_sites); if (!graph) return NULL; for (int64_t i = 0; i < n_sites; i++) triality_dft(&graph->locals[i]); /* Compute errors for all 36 scale candidates per block, * then project onto coarse (quhit 0) and fine (quhit 1) marginals */ for (int64_t i = 0; i < graph_blocks; i++) { int64_t block_idx = i * stride; const float *block_weights = weights + block_idx * block_size; /* Evaluate all 36 candidates */ float errors[TOTAL_SCALE_CANDIDATES]; float min_err = 1e30f; for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) { errors[c] = compute_block_error_q2k(block_weights, block_size, SCALE_TABLE[c], importance, (int)(block_idx * block_size)); if (errors[c] < min_err) min_err = errors[c]; } /* Project onto quhit 0 (coarse): marginalize over fine dimension * amp_coarse[v0] = Σ_{v1} exp(-error(v0*6+v1) / 2T) */ double coarse_re[6], coarse_im[6]; double coarse_norm = 0.0; for (int v0 = 0; v0 < 6; v0++) { coarse_re[v0] = 0.0; coarse_im[v0] = 0.0; for (int v1 = 0; v1 < 6; v1++) { int idx = v0 * 6 + v1; coarse_re[v0] += exp(-(double)(errors[idx] - min_err) / (2.0 * (double)temperature)); } coarse_norm += coarse_re[v0] * coarse_re[v0]; } if (coarse_norm > 1e-30) { double inv = 1.0 / sqrt(coarse_norm); for (int v = 0; v < 6; v++) coarse_re[v] *= inv; } /* Project onto quhit 1 (fine): marginalize over coarse dimension * amp_fine[v1] = Σ_{v0} exp(-error(v0*6+v1) / 2T) */ double fine_re[6], fine_im[6]; double fine_norm = 0.0; for (int v1 = 0; v1 < 6; v1++) { fine_re[v1] = 0.0; fine_im[v1] = 0.0; for (int v0 = 0; v0 < 6; v0++) { int idx = v0 * 6 + v1; fine_re[v1] += exp(-(double)(errors[idx] - min_err) / (2.0 * (double)temperature)); } fine_norm += fine_re[v1] * fine_re[v1]; } if (fine_norm > 1e-30) { double inv = 1.0 / sqrt(fine_norm); for (int v = 0; v < 6; v++) fine_re[v] *= inv; } /* Write coarse quhit (site 2*i) */ int64_t s_coarse = 2 * i; for (int v = 0; v < 6; v++) { graph->locals[s_coarse].edge_re[v] = coarse_re[v]; graph->locals[s_coarse].edge_im[v] = 0.0; } graph->locals[s_coarse].primary = VIEW_EDGE; graph->locals[s_coarse].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED; graph->locals[s_coarse].delta_valid = 0; triality_update_mask(&graph->locals[s_coarse]); /* Write fine quhit (site 2*i + 1) */ int64_t s_fine = 2 * i + 1; for (int v = 0; v < 6; v++) { graph->locals[s_fine].edge_re[v] = fine_re[v]; graph->locals[s_fine].edge_im[v] = 0.0; } graph->locals[s_fine].primary = VIEW_EDGE; graph->locals[s_fine].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED; graph->locals[s_fine].delta_valid = 0; triality_update_mask(&graph->locals[s_fine]); } /* ── Build edges ── */ for (int64_t i = 0; i < graph_blocks; i++) { /* Intra-block: coarse ↔ fine coupling */ hpc_cz(graph, 2 * i, 2 * i + 1); /* Inter-block: neighbor coupling */ if (i + 1 < graph_blocks) { hpc_cz(graph, 2 * i, 2 * (i + 1)); /* coarse ↔ coarse */ hpc_cz(graph, 2 * i + 1, 2 * (i + 1) + 1); /* fine ↔ fine */ } } return graph; } /* ═══════════════════════════════════════════════════════════════════════════ * MSE GRID SEARCH (ported from llm-compressor observers/mse.py) * * For a Q2_K sub-block, progressively shrink the min/max range to find * the candidate that minimizes weighted reconstruction error. * * for p in [1.0, 1.0 - 1/grid, 1.0 - 2/grid, ...] down to (1 - maxshrink): * candidate_min = p * min * candidate_max = p * max * error = ||x - quantize(x, candidate_min, candidate_max)||^norm * if error < best: update best * else: patience--; if patience == 0: break * * This is a direct C port of llm-compressor's _grid_search_mse. * ═══════════════════════════════════════════════════════════════════════════ */ typedef struct { float maxshrink; /* Maximum shrink factor (0.0 to 1.0) */ int grid; /* Number of grid divisions */ int patience; /* Early stopping patience */ float norm; /* Error norm exponent (2.0 = MSE, 2.4 = ...)*/ } MSEGridConfig; static const MSEGridConfig MSE_DEFAULT_CONFIG = { .maxshrink = 0.20f, .grid = 200, .patience = 8, .norm = 2.4f }; /* Grid search for optimal scale/min for a Q2_K sub-block of n weights * with nmax = 3 quantization levels. * Returns optimized scale; stores absolute min in *out_min. * importance: per-element weights (can be NULL for uniform). */ static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax, uint8_t *L, float *out_min, const float *importance, const MSEGridConfig *cfg) { float min_val = x[0], max_val = x[0]; for (int i = 1; i < n; i++) { if (x[i] < min_val) min_val = x[i]; if (x[i] > max_val) max_val = x[i]; } if (max_val == min_val) { for (int i = 0; i < n; i++) L[i] = 0; *out_min = -min_val; return 0.0f; } if (min_val > 0) min_val = 0; float best_scale = 0.0f; float best_min = -min_val; float best_error = 1e30f; int no_improve = 0; int shrink_steps = (int)(cfg->maxshrink * cfg->grid); if (shrink_steps < 1) shrink_steps = 1; for (int step = 0; step <= shrink_steps; step++) { float p = 1.0f - (float)step / (float)cfg->grid; float cand_min = p * min_val; float cand_max = p * max_val; if (cand_max <= cand_min) continue; float iscale = (float)nmax / (cand_max - cand_min); float scale = 1.0f / iscale; /* Quantize and measure error */ float err = 0.0f; uint8_t tmp_L[256]; for (int i = 0; i < n; i++) { int l = gguf_nearest_int(iscale * (x[i] - cand_min)); if (l < 0) l = 0; if (l > nmax) l = nmax; tmp_L[i] = (uint8_t)l; float deq = cand_min + scale * (float)l; float diff = fabsf(x[i] - deq); /* Apply error norm — fast path for default norm=2.4 */ float e = diff; if (cfg->norm == 2.4f) { e = fast_pow_2_4(diff); } else if (cfg->norm != 1.0f) { e = powf(diff, cfg->norm); } /* Apply importance weighting */ if (importance) e *= importance[i]; err += e; } if (err < best_error) { best_error = err; best_scale = scale; best_min = -cand_min; memcpy(L, tmp_L, n); no_improve = 0; } else { no_improve++; if (no_improve >= cfg->patience) break; } } /* Iterative refinement on the best candidate (from ggml) */ float cur_min = -best_min; float cur_scale = best_scale; if (cur_scale > 1e-15f) { float iscale = 1.0f / cur_scale; for (int itry = 0; itry < 3; itry++) { float sumlx = 0; int suml2 = 0; for (int i = 0; i < n; i++) { int l = gguf_nearest_int(iscale * (x[i] - cur_min)); if (l < 0) l = 0; if (l > nmax) l = nmax; L[i] = (uint8_t)l; sumlx += (x[i] - cur_min) * l; suml2 += l * l; } if (suml2 > 0) cur_scale = sumlx / suml2; float sum = 0; for (int i = 0; i < n; i++) sum += x[i] - cur_scale * L[i]; cur_min = 0.7f * cur_min + 0.3f * sum / n; if (cur_min > 0) cur_min = 0; if (cur_scale > 1e-15f) iscale = 1.0f / cur_scale; } } *out_min = -cur_min; return cur_scale; } /* ═══════════════════════════════════════════════════════════════════════════ * HPC Q2_K QUANTIZATION — GGML-QUALITY + HPC REFINEMENT * * Two-phase approach: * Phase A: Per-sub-block weighted least-squares (ggml make_qkx2_quants) * This produces per-sub-block (scale, min) with 16-step search. * Phase B: HPC BP refines the superblock-level d/dmin rounding. * 6 candidate (d, dmin) pairs are tested; BP finds the one * where the GLOBAL reconstruction error is minimized via * constructive interference of per-sub-block phase coherence. * ═══════════════════════════════════════════════════════════════════════════ */ /* Weighted least-squares quantization for a sub-block (ggml make_qkx2_quants). * Finds optimal (scale, min) by searching 16 candidate iscale values * and solving weighted least-squares for each. * Returns scale; *the_min is set to the negative of the optimal min. */ static float hpc_make_qkx2_quants(int n, int nmax, const float *x, const float *w, uint8_t *L, float *the_min, uint8_t *Laux) { float xmin = x[0], xmax = x[0]; float sum_w = w[0], sum_x = w[0] * x[0]; for (int i = 1; i < n; i++) { if (x[i] < xmin) xmin = x[i]; if (x[i] > xmax) xmax = x[i]; sum_w += w[i]; sum_x += w[i] * x[i]; } if (xmin > 0) xmin = 0; if (xmax == xmin) { for (int i = 0; i < n; i++) L[i] = 0; *the_min = -xmin; return 0.0f; } float iscale = (float)nmax / (xmax - xmin); float scale = 1.0f / iscale; float best_mad = 0; for (int i = 0; i < n; i++) { int l = gguf_nearest_int(iscale * (x[i] - xmin)); if (l < 0) l = 0; if (l > nmax) l = nmax; L[i] = (uint8_t)l; float diff = scale * (float)l + xmin - x[i]; best_mad += w[i] * fabsf(diff); } /* 16 candidate iscale values: search [-0.5, -0.5 + 0.1*15] + nmax */ for (int is = 0; is <= 15; is++) { float try_iscale = (-0.5f + 0.1f * (float)is + (float)nmax) / (xmax - xmin); float sl = 0, sl2 = 0, sxl = 0; for (int i = 0; i < n; i++) { int l = gguf_nearest_int(try_iscale * (x[i] - xmin)); if (l < 0) l = 0; if (l > nmax) l = nmax; Laux[i] = (uint8_t)l; sl += w[i] * (float)l; sl2 += w[i] * (float)(l * l); sxl += w[i] * (float)l * x[i]; } float det = sum_w * sl2 - sl * sl; if (det > 0) { float this_scale = (sum_w * sxl - sum_x * sl) / det; float this_min = (sl2 * sum_x - sl * sxl) / det; if (this_min > 0) { this_min = 0; this_scale = sxl / sl2; } float mad = 0; for (int i = 0; i < n; i++) { float diff = this_scale * (float)Laux[i] + this_min - x[i]; mad += w[i] * fabsf(diff); } if (mad < best_mad) { for (int i = 0; i < n; i++) L[i] = Laux[i]; best_mad = mad; scale = this_scale; xmin = this_min; } } } *the_min = -xmin; return scale; } /* Quantize the scale/min arrays into 4-bit values: make_qp_quants equivalent. * Returns the optimal d such that scales[j] ≈ d × Ls[j]. */ static float hpc_make_qp_quants(int n, int nmax, const float *x, uint8_t *L, const float *sw) { float xmax = 0; for (int i = 0; i < n; i++) if (x[i] > xmax) xmax = x[i]; if (xmax < 1e-15f) { for (int i = 0; i < n; i++) L[i] = 0; return 0.0f; } float iscale = (float)nmax / xmax; for (int i = 0; i < n; i++) { int l = gguf_nearest_int(iscale * x[i]); if (l < 0) l = 0; if (l > nmax) l = nmax; L[i] = (uint8_t)l; } float scale = 1.0f / iscale; float best_mse = 0; for (int i = 0; i < n; i++) { float diff = x[i] - scale * (float)L[i]; best_mse += sw[i] * diff * diff; } for (int is = -4; is <= 4; is++) { if (is == 0) continue; float iscale_is = (0.1f * (float)is + (float)nmax) / xmax; float scale_is = 1.0f / iscale_is; float mse = 0; for (int i = 0; i < n; i++) { int l = gguf_nearest_int(iscale_is * x[i]); if (l < 0) l = 0; if (l > nmax) l = nmax; float diff = x[i] - scale_is * (float)l; mse += sw[i] * diff * diff; } if (mse < best_mse) { best_mse = mse; iscale = iscale_is; } } /* Recompute with best iscale + iterative refinement */ float sumlx = 0, suml2 = 0; for (int i = 0; i < n; i++) { int l = gguf_nearest_int(iscale * x[i]); if (l < 0) l = 0; if (l > nmax) l = nmax; L[i] = (uint8_t)l; sumlx += sw[i] * x[i] * (float)l; suml2 += sw[i] * (float)(l * l); } /* Iterative greedy refinement */ for (int itry = 0; itry < 5; itry++) { int n_changed = 0; for (int i = 0; i < n; i++) { float wi = sw[i]; float slx = sumlx - wi * x[i] * (float)L[i]; float sl2 = suml2 - wi * (float)(L[i] * L[i]); if (slx > 0 && sl2 > 0) { int new_l = gguf_nearest_int(x[i] * sl2 / slx); if (new_l < 0) new_l = 0; if (new_l > nmax) new_l = nmax; if (new_l != L[i]) { slx += wi * x[i] * (float)new_l; sl2 += wi * (float)(new_l * new_l); if (slx * slx * suml2 > sumlx * sumlx * sl2) { L[i] = (uint8_t)new_l; sumlx = slx; suml2 = sl2; n_changed++; } } } } if (!n_changed) break; } return suml2 > 0 ? sumlx / suml2 : 0.0f; } /* ═══════════════════════════════════════════════════════════════════════════ * SHOR'S GRIFFITHS-NIU SEQUENTIAL MEASUREMENT FOR RMSE OPTIMIZATION * (Ported 1:1 from tesseract_factor.c — replaces BP) * * Instead of iterative message-passing (BP), this uses the EXACT sequential * measurement protocol from Shor's algorithm: * * For each block k (MSB → LSB): * 1. Compute feed-forward phase correction from previously measured blocks * 2. Compute work factor: C_k(d) = Π_j Σ_w local_j(w) × edge(d,w) * 3. Bake C_k into locals: α(d) *= C_k(d) * 4. Apply phase correction: α(d) *= e^{-2πi d θ_k} * 5. Apply IDFT6 in-place: interference creates peaks at optimal scales * 6. Born rule measurement → select optimal scale candidate * 7. Collapse site + absorb edge weights into neighbors (back-action) * * This IS the quantum Fourier transform that creates constructive * interference at the optimal RMSE configuration, exactly as Shor's * algorithm creates interference at the correct period. * * Domain mapping: * Factoring: oracle phase 2π×d×c_k/N → period r * Quantize: error Boltzmann amplitudes → optimal RMSE block * ═══════════════════════════════════════════════════════════════════════════ */ /* ω₆ roots of unity for CZ phase lookup */ static const double W6_RE[6] = { 1.0, 0.5, -0.5, -1.0, -0.5, 0.5 }; static const double W6_IM[6] = { 0.0, 0.866025403784438647, 0.866025403784438647, 0.0, -0.866025403784438647, -0.866025403784438647 }; static const double INV_SQRT6 = 0.40824829046386301637; /* 1/√6 */ /* ── Collapse + Back-Action core (ported from tesseract_factor.c) ── * After sampling an outcome, collapse the target site to |outcome⟩, * absorb all edge weights into neighbor local states (Magic Pointer * disentanglement), and remove dead edges from the graph. * * This is the EXACT same back-action protocol used in Shor's algorithm * for the semi-classical QFT: measurement of one site conditions all * remaining sites through the CZ phase correlations. */ static void shor_collapse_site(HPCGraph *graph, int target_site, int outcome) { /* Step 1: Collapse local state to |outcome⟩ */ for (int v = 0; v < 6; v++) { graph->locals[target_site].edge_re[v] = (v == outcome) ? 1.0 : 0.0; graph->locals[target_site].edge_im[v] = 0.0; } graph->locals[target_site].primary = VIEW_EDGE; graph->locals[target_site].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED; graph->locals[target_site].delta_valid = 0; /* Step 2: Absorb edge weights into neighbor states (back-action). * For each edge (target, neighbor), the weight w(outcome, d) for each * neighbor basis state d gets multiplied into the neighbor's amplitude. * This is the Magic Pointer disentanglement from tesseract_factor.c. */ HPCAdjList *adj = &graph->adj[target_site]; for (uint64_t ei = 0; ei < adj->count; ei++) { uint64_t eid = adj->edge_ids[ei]; HPCEdge *edge = &graph->edges[eid]; uint64_t partner = (edge->site_a == (uint64_t)target_site) ? edge->site_b : edge->site_a; TrialityQuhit *pq = &graph->locals[partner]; for (int d = 0; d < 6; d++) { double w_re, w_im; if (edge->type == HPC_EDGE_CZ) { int pidx = (outcome * d) % 6; w_re = HPC_W6_RE[pidx]; w_im = HPC_W6_IM[pidx]; } else { /* Weighted phase edge */ if (edge->site_a == (uint64_t)target_site) { w_re = edge->w_re[outcome][d]; w_im = edge->w_im[outcome][d]; } else { w_re = edge->w_re[d][outcome]; w_im = edge->w_im[d][outcome]; } } double old_re = pq->edge_re[d], old_im = pq->edge_im[d]; pq->edge_re[d] = old_re * w_re - old_im * w_im; pq->edge_im[d] = old_re * w_im + old_im * w_re; } pq->dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED; pq->delta_valid = 0; } /* Step 3: Remove edges touching this site from the graph. * Mark by setting fidelity to -1 and remove from adj lists. */ for (uint64_t ei = 0; ei < adj->count; ei++) { uint64_t eid = adj->edge_ids[ei]; HPCEdge *edge = &graph->edges[eid]; uint64_t partner = (edge->site_a == (uint64_t)target_site) ? edge->site_b : edge->site_a; /* Remove this edge from partner's adj list */ HPCAdjList *padj = &graph->adj[partner]; for (uint64_t pi = 0; pi < padj->count; pi++) { if (padj->edge_ids[pi] == eid) { padj->edge_ids[pi] = padj->edge_ids[--padj->count]; break; } } edge->fidelity = -1.0; /* Mark as dead */ } adj->count = 0; /* Clear target's adj list */ } /* ═══════════════════════════════════════════════════════════════════════════ * SHOR SEQUENTIAL MEASUREMENT — Griffiths-Niu Protocol for Quantization * * Ported 1:1 from tesseract_factor.c lines 2343-2500. * * Measures sites MSB→LSB. For each site k: * 1. Compute feed-forward phase correction θ_k from previously measured sites * 2. Compute neighbor contribution C_k(d) analytically * 3. Bake C_k into locals * 4. Apply phase correction: α(d) *= e^{-2πi d θ_k} * 5. Apply IDFT6: β(v) = (1/√6) Σ_d α'(d) × e^{2πi dv/6} * 6. Compute |β(v)|² as measurement probabilities * 7. Sample/argmax → outcome * 8. Collapse + back-action via shor_collapse_site() * * Returns: marginals are written into marg_out[n_sites][6]. * measured_out[n_sites] receives the measurement outcomes. * ═══════════════════════════════════════════════════════════════════════════ */ static void shor_measure_graph(HPCGraph *graph, int64_t n_sites, double (*marg_out)[6], int *measured_out, int deterministic) { /* Measure sites from last to first (MSB→LSB, same as Griffiths-Niu) */ for (int64_t k = n_sites - 1; k >= 0; k--) { int site_k = (int)k; /* Step 1: Compute feed-forward phase correction from previously * measured sites. The QFT phase is 2π F x / 6^n. For site k, * the fractional phase from previously measured site j (j > k) * is measured_out[j] / 6^{j-k+1}. * Power MUST start at 36.0 (6^2) for the immediately previous site. */ double theta_k = 0.0; { double power = 36.0; for (int64_t j = k + 1; j < n_sites; j++) { theta_k += (double)measured_out[j] / power; power *= 6.0; } } /* Step 2: Compute neighbor contribution C_k(d) analytically. * C_k(d) = Π_neighbor Σ_{w=0}^{5} local_neighbor(w) × edge_weight(d, w) * Each neighbor is independent (product state). */ double ck_re[6], ck_im[6]; for (int d = 0; d < 6; d++) { ck_re[d] = 1.0; ck_im[d] = 0.0; } const HPCAdjList *adj = &graph->adj[site_k]; for (uint64_t ei = 0; ei < adj->count; ei++) { uint64_t eid = adj->edge_ids[ei]; const HPCEdge *edge = &graph->edges[eid]; if (edge->fidelity < 0.0) continue; /* Skip dead edges */ uint64_t partner = (edge->site_a == (uint64_t)site_k) ? edge->site_b : edge->site_a; const TrialityQuhit *pq = &graph->locals[partner]; for (int d = 0; d < 6; d++) { double sr = 0, si = 0; for (int w = 0; w < 6; w++) { double lr = pq->edge_re[w], li = pq->edge_im[w]; double wr, wi; if (edge->type == HPC_EDGE_CZ) { int pidx = (d * w) % 6; wr = HPC_W6_RE[pidx]; wi = HPC_W6_IM[pidx]; } else if (edge->site_a == (uint64_t)site_k) { wr = edge->w_re[d][w]; wi = edge->w_im[d][w]; } else { wr = edge->w_re[w][d]; wi = edge->w_im[w][d]; } sr += lr*wr - li*wi; si += lr*wi + li*wr; } double nr = ck_re[d]*sr - ck_im[d]*si; double ni = ck_re[d]*si + ck_im[d]*sr; ck_re[d] = nr; ck_im[d] = ni; } } /* Step 3: Bake C_k(d) into locals: α(d) *= C_k(d) */ for (int d = 0; d < 6; d++) { double re = graph->locals[site_k].edge_re[d]; double im = graph->locals[site_k].edge_im[d]; graph->locals[site_k].edge_re[d] = re*ck_re[d] - im*ck_im[d]; graph->locals[site_k].edge_im[d] = re*ck_im[d] + im*ck_re[d]; } /* Step 4: Apply feed-forward phase correction to locals. */ for (int d = 0; d < 6; d++) { double angle = -2.0 * 3.14159265358979323846 * d * theta_k; double pr = cos(angle), pi2 = sin(angle); double re = graph->locals[site_k].edge_re[d]; double im = graph->locals[site_k].edge_im[d]; graph->locals[site_k].edge_re[d] = re*pr - im*pi2; graph->locals[site_k].edge_im[d] = re*pi2 + im*pr; } /* Step 5: Apply IDFT6 in-place: phase basis → computational basis. * β(v) = (1/√6) Σ_{d=0}^{5} α'(d) × e^{2πi d v / 6} * C_k(d) is INSIDE the coherent sum — THIS creates interference * peaks at the optimal RMSE configuration, exactly as Shor's * algorithm creates peaks at the correct period. */ { double alpha_re[6], alpha_im[6]; for (int d = 0; d < 6; d++) { alpha_re[d] = graph->locals[site_k].edge_re[d]; alpha_im[d] = graph->locals[site_k].edge_im[d]; } for (int v = 0; v < 6; v++) { double sum_re = 0.0, sum_im = 0.0; for (int d = 0; d < 6; d++) { double angle = 2.0 * 3.14159265358979323846 * d * v / 6.0; double er = cos(angle), ei = sin(angle); sum_re += alpha_re[d]*er - alpha_im[d]*ei; sum_im += alpha_re[d]*ei + alpha_im[d]*er; } graph->locals[site_k].edge_re[v] = sum_re * INV_SQRT6; graph->locals[site_k].edge_im[v] = sum_im * INV_SQRT6; } } /* Step 6: Compute marginals from |local(v)|² */ double probs[6]; double total = 0.0; for (int v = 0; v < 6; v++) { probs[v] = graph->locals[site_k].edge_re[v] * graph->locals[site_k].edge_re[v] + graph->locals[site_k].edge_im[v] * graph->locals[site_k].edge_im[v]; total += probs[v]; } if (total > 1e-30) { for (int v = 0; v < 6; v++) probs[v] /= total; } else { for (int v = 0; v < 6; v++) probs[v] = 1.0 / 6.0; } /* Store marginals for downstream beam search */ for (int v = 0; v < 6; v++) marg_out[k][v] = probs[v]; /* Step 7: Select outcome — deterministic argmax for quantization * (unlike factoring which uses Born sampling for probabilistic * period recovery, quantization wants the MAP estimate) */ int outcome; if (deterministic) { outcome = 0; double max_p = probs[0]; for (int v = 1; v < 6; v++) { if (probs[v] > max_p) { max_p = probs[v]; outcome = v; } } } else { /* Born sampling (for multi-shot refinement) */ static unsigned int shor_rng = 271828; shor_rng = shor_rng * 1664525u + 1013904223u; double r01 = (double)(shor_rng >> 8) / 16777216.0; double cumul = 0.0; outcome = 5; for (int v = 0; v < 6; v++) { cumul += probs[v]; if (r01 <= cumul) { outcome = v; break; } } } measured_out[k] = outcome; /* Step 8: Collapse + back-action — absorb edge weights into * neighbor locals (Magic Pointer disentanglement) */ shor_collapse_site(graph, site_k, outcome); } } /* ═══════════════════════════════════════════════════════════════════════════ * HPC-OPTIMIZED Q4_0 QUANTIZATION (for attention tensors) * * Same architecture as Q2_K HPC pipeline, but simpler: * - One parameter per block (scale d only, no dmin) * - Single quhit per block (6 states) * - 10 candidate scales → bin to 6 for BP * - 12-beam Hensel search for globally optimal configuration * - Triality 3-view marginals for robust scoring * * Q4_0 block: 32 weights, 16 levels (0–15), dequant: w = (q - 8) * d * ═══════════════════════════════════════════════════════════════════════════ */ #define Q4_N_CAND 16 /* scale candidates for Q4_0 (was 10) */ #define Q4_N_BEAMS 24 /* beam width (was 12) */ /* Tight neighborhood around WLS optimum: ±10% */ static const float Q4_NEIGHBOR_MULTS[Q4_N_CAND] = { 0.900f, 0.915f, 0.930f, 0.945f, 0.955f, 0.965f, 0.975f, 0.985f, 0.995f, 1.005f, 1.015f, 1.025f, 1.035f, 1.050f, 1.070f, 1.100f }; static const int Q4_CAND_TO_QUHIT[Q4_N_CAND] = { 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5 }; static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, BlockQ4_0 *output, float *out_total_error, const float *imat_importance, int verbose) { int64_t n_blocks = n_elements / QK4_0; float total_err = 0.0f; /* ── Phase 1: Greedy seed — compute scale per block ── */ float *greedy_d = (float *)calloc(n_blocks, sizeof(float)); #pragma omp parallel for schedule(dynamic, 64) for (int64_t blk = 0; blk < n_blocks; blk++) { const float *bw = weights + blk * QK4_0; float amax = 0.0f; for (int j = 0; j < QK4_0; j++) { float av = fabsf(bw[j]); if (av > amax) amax = av; } greedy_d[blk] = amax / 7.0f; } /* ── Phase 2: WLS-Optimal Candidate Generation for Q4_0 ── * First find the true optimal d* via 3-iteration WLS, * then generate candidates centered on d* with tight spacing. */ float (*cand_errors)[Q4_N_CAND] = (float (*)[Q4_N_CAND]) calloc(n_blocks, sizeof(float[Q4_N_CAND])); uint16_t (*cand_d16)[Q4_N_CAND] = (uint16_t (*)[Q4_N_CAND]) calloc(n_blocks, sizeof(uint16_t[Q4_N_CAND])); for (int64_t blk = 0; blk < n_blocks; blk++) { const float *bw = weights + blk * QK4_0; /* ── Step 2a: WLS solve to find optimal d* ── */ float wls_d = greedy_d[blk]; for (int ls_iter = 0; ls_iter < 3; ls_iter++) { if (wls_d < 1e-15f) break; float inv_d = 1.0f / wls_d; float num = 0.0f, den = 0.0f; for (int j = 0; j < QK4_0; j++) { int q = (int)(bw[j] * inv_d + 8.5f); if (q < 0) q = 0; if (q > 15) q = 15; float qc = (float)q - 8.0f; float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f; num += w * bw[j] * qc; den += w * qc * qc; } if (den > 1e-15f) { float d_new = num / den; if (fabsf(d_new) < 4.0f * (greedy_d[blk] + 1e-10f)) wls_d = gguf_fp16_to_fp32(gguf_fp32_to_fp16(d_new)); } } /* ── Step 2b: Generate candidates centered on WLS optimum ── */ for (int ci = 0; ci < Q4_N_CAND; ci++) { float trial_d = wls_d * Q4_NEIGHBOR_MULTS[ci]; uint16_t d16 = gguf_fp32_to_fp16(trial_d); float actual_d = gguf_fp16_to_fp32(d16); cand_d16[blk][ci] = d16; float id = (actual_d > 1e-15f) ? 1.0f / actual_d : 0.0f; float err = 0.0f; for (int j = 0; j < QK4_0; j += 6) { int g_len = (j + 6 <= QK4_0) ? 6 : (QK4_0 - j); int half_g = g_len / 2; float e_cur[6], w_cur[6]; for (int kk = 0; kk < g_len; kk++) { int idx = j + kk; float x = bw[idx]; int q = (int)(x * id + 8.5f); if (q < 0) q = 0; if (q > 15) q = 15; float deq = ((float)q - 8.0f) * actual_d; e_cur[kk] = x - deq; w_cur[kk] = (imat_importance) ? imat_importance[blk * QK4_0 + idx] : 1.0f; } /* Decompose into vesica (DC) and wave (AC) components */ float vesica_err = 0.0f, wave_err = 0.0f; for (int p = 0; p < half_g; p++) { float v = e_cur[p] + e_cur[p + half_g]; float w_wave = e_cur[p] - e_cur[p + half_g]; float w_avg = (w_cur[p] + w_cur[p + half_g]) * 0.5f; vesica_err += v * v * w_avg; wave_err += w_wave * w_wave * w_avg; } /* Triality weighting: penalize vesica 4×, wave 1×. * Factor of 0.5 keeps scale consistent with standard MSE. */ err += 0.5f * (4.0f * vesica_err + 1.0f * wave_err); } cand_errors[blk][ci] = err; } } /* ── Phase 3: HPC graph — single quhit per block ── */ int *best_candidate = (int *)malloc(n_blocks * sizeof(int)); for (int64_t i = 0; i < n_blocks; i++) best_candidate[i] = 10; /* Q4_NEIGHBOR_MULTS[10] = 1.00 */ if (n_blocks >= 2) { float temperature = 0.5f; int64_t graph_blocks = (n_blocks > 200) ? 200 : n_blocks; int64_t stride = n_blocks / graph_blocks; int64_t n_sites = graph_blocks; /* 1 quhit per block */ HPCGraph *graph = hpc_create(n_sites); if (graph) { for (int64_t i = 0; i < n_sites; i++) triality_dft(&graph->locals[i]); /* Adaptive temperature from error landscape */ { double err_accum = 0.0; int err_count = 0; for (int64_t gi = 0; gi < graph_blocks && gi < 100; gi++) { int64_t blk = gi * stride; float max_e = 0.0f; for (int c = 0; c < Q4_N_CAND; c++) if (cand_errors[blk][c] > max_e) max_e = cand_errors[blk][c]; err_accum += (double)max_e; err_count++; } if (err_count > 0) { temperature = (float)(err_accum / err_count) * 0.1f; if (temperature < 1e-10f) temperature = 1e-10f; } } /* Encode stride-group AGGREGATED candidate errors as Boltzmann amplitudes */ for (int64_t i = 0; i < graph_blocks; i++) { /* Aggregate errors across stride group */ float agg_errors[Q4_N_CAND]; for (int c = 0; c < Q4_N_CAND; c++) agg_errors[c] = 0.0f; int64_t blk_start = i * stride; int64_t blk_end = blk_start + stride; if (blk_end > n_blocks) blk_end = n_blocks; int64_t group_size = blk_end - blk_start; for (int64_t b = blk_start; b < blk_end; b++) { for (int c = 0; c < Q4_N_CAND; c++) agg_errors[c] += cand_errors[b][c]; } if (group_size > 1) { float inv_gs = 1.0f / (float)group_size; for (int c = 0; c < Q4_N_CAND; c++) agg_errors[c] *= inv_gs; } float min_err = 1e30f; for (int c = 0; c < Q4_N_CAND; c++) if (agg_errors[c] < min_err) min_err = agg_errors[c]; double amp_re[6]; double amp_norm = 0.0; for (int qi = 0; qi < 6; qi++) amp_re[qi] = 0.0; for (int ci = 0; ci < Q4_N_CAND; ci++) { int qi = Q4_CAND_TO_QUHIT[ci]; amp_re[qi] += exp(-(double)(agg_errors[ci] - min_err) / (2.0 * (double)temperature)); } for (int qi = 0; qi < 6; qi++) amp_norm += amp_re[qi] * amp_re[qi]; if (amp_norm > 1e-30) { double inv = 1.0 / sqrt(amp_norm); for (int v = 0; v < 6; v++) amp_re[v] *= inv; } for (int v = 0; v < 6; v++) { graph->locals[i].edge_re[v] = amp_re[v]; graph->locals[i].edge_im[v] = 0.0; } graph->locals[i].primary = VIEW_EDGE; graph->locals[i].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED; graph->locals[i].delta_valid = 0; triality_update_mask(&graph->locals[i]); } /* Neighbor edges */ for (int64_t i = 0; i < graph_blocks - 1; i++) hpc_cz(graph, i, i + 1); /* ── Shor's Griffiths-Niu Sequential Measurement ── * Replaces BP with exact marginals via IDFT6 + feed-forward + * collapse/back-action (ported 1:1 from tesseract_factor.c). * Single pass, no iteration, no message damping. */ double (*marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6])); int *shor_measured = (int *)calloc(graph_blocks, sizeof(int)); shor_measure_graph(graph, graph_blocks, marg, shor_measured, 1); free(shor_measured); /* Beam search over candidates */ typedef struct { double acc_error; int history_idx; } Q4Beam; typedef struct { int cand_idx; int parent_idx; } Q4BeamHistory; Q4Beam beams[Q4_N_BEAMS]; int active_beams = 1; Q4BeamHistory *history = (Q4BeamHistory *)malloc(n_blocks * Q4_N_BEAMS * sizeof(Q4BeamHistory)); for (int b = 0; b < Q4_N_BEAMS; b++) { beams[b].acc_error = 0.0; beams[b].history_idx = -1; } for (int64_t i = 0; i < graph_blocks; i++) { double m_total = 0.0; for (int v = 0; v < 6; v++) m_total += marg[i][v]; double cand_score[Q4_N_CAND]; int64_t blk = i * stride; /* Count candidates per quhit bin for normalization */ int q4_bin_count[6] = {0}; for (int ci = 0; ci < Q4_N_CAND; ci++) q4_bin_count[Q4_CAND_TO_QUHIT[ci]]++; /* Per-block error normalization: divide by block mean error * so small-weight blocks don't dominate beam selection */ float blk_mean_err = 0.0f; for (int ci = 0; ci < Q4_N_CAND; ci++) blk_mean_err += cand_errors[blk][ci]; blk_mean_err /= (float)Q4_N_CAND; if (blk_mean_err < 1e-30f) blk_mean_err = 1e-30f; for (int ci = 0; ci < Q4_N_CAND; ci++) { int qi = Q4_CAND_TO_QUHIT[ci]; double p = (m_total > 1e-30) ? marg[i][qi] / m_total : 1.0/6.0; p /= (double)q4_bin_count[qi]; /* normalize by bin occupancy */ cand_score[ci] = p / (cand_errors[blk][ci] / blk_mean_err + 1e-15); } typedef struct { double score; int beam_idx; int cand_idx; } Q4Ext; Q4Ext extensions[Q4_N_BEAMS * Q4_N_CAND]; int n_ext = 0; for (int b = 0; b < active_beams; b++) { for (int c = 0; c < Q4_N_CAND; c++) { double ext_err = beams[b].acc_error + cand_errors[blk][c]; extensions[n_ext].score = cand_score[c] / (ext_err + 1e-15); extensions[n_ext].beam_idx = b; extensions[n_ext].cand_idx = c; n_ext++; } } int top_k = (n_ext < Q4_N_BEAMS) ? n_ext : Q4_N_BEAMS; int top_indices[Q4_N_BEAMS]; for (int k = 0; k < top_k; k++) { int best = -1; double best_s = -1e30; for (int e = 0; e < n_ext; e++) { if (extensions[e].score > best_s) { best_s = extensions[e].score; best = e; } } top_indices[k] = best; extensions[best].score = -2e30; } Q4Beam new_beams[Q4_N_BEAMS]; for (int k = 0; k < top_k; k++) { int ei = top_indices[k]; int sb = extensions[ei].beam_idx; int cand = extensions[ei].cand_idx; int hist_idx = i * Q4_N_BEAMS + k; history[hist_idx].cand_idx = cand; history[hist_idx].parent_idx = beams[sb].history_idx; new_beams[k].history_idx = hist_idx; new_beams[k].acc_error = beams[sb].acc_error + cand_errors[blk][cand]; } for (int k = 0; k < top_k; k++) beams[k] = new_beams[k]; active_beams = top_k; } int curr_hist = beams[0].history_idx; for (int64_t i = graph_blocks - 1; i >= 0; i--) { int group_cidx; if (curr_hist >= 0) { group_cidx = history[curr_hist].cand_idx; curr_hist = history[curr_hist].parent_idx; } else { group_cidx = 10; } if (stride <= 1) { best_candidate[i] = group_cidx; } else { /* Per-block local optimization within stride group. * Beam picks the quhit bin; each block picks its best * candidate in that bin from its own error landscape. */ int target_bin = Q4_CAND_TO_QUHIT[group_cidx]; for (int64_t b = i * stride; b < (i+1) * stride && b < n_blocks; b++) { float best_err = 1e30f; int best_c = group_cidx; for (int c = 0; c < Q4_N_CAND; c++) { if (Q4_CAND_TO_QUHIT[c] != target_bin) continue; if (cand_errors[b][c] < best_err) { best_err = cand_errors[b][c]; best_c = c; } } /* Greedy override if global best is >5% better */ float global_best = 1e30f; int global_best_c = group_cidx; for (int c = 0; c < Q4_N_CAND; c++) { if (cand_errors[b][c] < global_best) { global_best = cand_errors[b][c]; global_best_c = c; } } if (global_best < best_err * 0.95f) best_candidate[b] = global_best_c; else best_candidate[b] = best_c; } } } free(history); /* ══════════════════════════════════════════════════════════════ * Phase 3.5: Born-Rule Multi-Shot Scale Refinement * * The beam search found the MAP candidate sequence. But the * triality marginals encode quantum phase-coherent structure * that a greedy beam can miss. * * Like tesseract_factor's MCMC period recovery (lines 1920-1964): * 1. Take N independent Born samples from triality marginals * 2. Each sample → full candidate assignment across all blocks * 3. Evaluate actual RMSE for each assignment * 4. Keep assignment with lowest total RMSE * * Reuses the EXISTING converged Möbius sheet — zero new BP. * ══════════════════════════════════════════════════════════════ */ { #define Q4_BORN_SHOTS 64 /* Compute beam-search baseline RMSE for comparison */ float beam_total_err = 0.0f; for (int64_t bi = 0; bi < n_blocks; bi++) beam_total_err += cand_errors[bi][best_candidate[bi]]; /* Build per-block CDFs from triality marginals */ unsigned int born_rng = 314159; /* Compute tail error once (blocks beyond graph coverage) */ float tail_err_q4 = 0.0f; for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++) tail_err_q4 += cand_errors[bi][best_candidate[bi]]; /* Sparse shot buffer: only track stride-sampled blocks */ int *shot_sparse_q4 = (int *)malloc(graph_blocks * sizeof(int)); for (int shot = 0; shot < Q4_BORN_SHOTS; shot++) { float shot_err = tail_err_q4; for (int64_t gi = 0; gi < graph_blocks; gi++) { /* Normalize marginals to CDF */ double m_total = 0.0; for (int v = 0; v < 6; v++) m_total += marg[gi][v]; /* Born sample: CDF inversion (same as born_sample) */ born_rng = born_rng * 1664525u + 1013904223u; double rnd = (double)(born_rng >> 8) / 16777216.0; double target = rnd * m_total; double cum = 0.0; int sampled_qi = 5; for (int v = 0; v < 6; v++) { cum += marg[gi][v]; if (cum > target) { sampled_qi = v; break; } } /* Find the best candidate WITHIN this quhit bin */ int64_t blk = gi * stride; float best_bin_err = 1e30f; int best_bin_cand = 10; /* default */ for (int ci = 0; ci < Q4_N_CAND; ci++) { if (Q4_CAND_TO_QUHIT[ci] == sampled_qi) { if (cand_errors[blk][ci] < best_bin_err) { best_bin_err = cand_errors[blk][ci]; best_bin_cand = ci; } } } shot_sparse_q4[gi] = best_bin_cand; shot_err += cand_errors[blk][best_bin_cand]; } /* Metropolis acceptance: adopt if better than current best */ if (shot_err < beam_total_err) { for (int64_t gi = 0; gi < graph_blocks; gi++) best_candidate[gi * stride] = shot_sparse_q4[gi]; beam_total_err = shot_err; } } free(shot_sparse_q4); } free(marg); hpc_destroy(graph); } } /* ══════════════════════════════════════════════════════════════════ * PHASE 4: Assemble blocks via least-squares scale extraction * * The factorer assembles a frequency register from BP marginals, * then EXTRACTS the exact period via continued fractions. * * We do the same: the beam search / Born shots selected a grid * candidate (the "assembled frequency"). Now we EXTRACT the exact * optimal FP16 scale via weighted least-squares (the "CF step"). * * For Q4_0: d_optimal = Σ(w_j × x_j × q̃_j) / Σ(w_j × q̃_j²) * where q̃_j = (q_j - 8) and q_j is quantized at the grid scale. * * This iterates: quantize at d_init → compute d_optimal → re-quantize * → re-compute until convergence. 3 iterations suffice since Q4_0 * has only 16 levels — the assignment stabilizes immediately. * * The grid gave us 16 possible scales. This gives us 65,536 (all FP16). * ══════════════════════════════════════════════════════════════════ */ #pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err) for (int64_t blk = 0; blk < n_blocks; blk++) { const float *bw = weights + blk * QK4_0; int cidx = best_candidate[blk]; /* Start from the grid-selected scale (the "assembled frequency") */ float d_current = gguf_fp16_to_fp32(cand_d16[blk][cidx]); /* Analog assembly: iterate to full convergence. * 5 iterations for stable (d, q-values) coupling. */ for (int ls_iter = 0; ls_iter < 5; ls_iter++) { if (d_current < 1e-15f) break; float id = 1.0f / d_current; /* Quantize at current scale */ int qs_tmp[QK4_0]; for (int j = 0; j < QK4_0; j++) { int q = (int)(bw[j] * id + 8.5f); if (q < 0) q = 0; if (q > 15) q = 15; qs_tmp[j] = q; } /* Weighted least-squares: d = Σ(w × x × q̃) / Σ(w × q̃²) * where q̃ = q - 8 (centered quantized value) */ float num = 0.0f, den = 0.0f; for (int j = 0; j < QK4_0; j++) { float q_centered = (float)qs_tmp[j] - 8.0f; float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f; num += w * bw[j] * q_centered; den += w * q_centered * q_centered; } if (den > 1e-15f) { float d_new = num / den; /* Clamp magnitude to prevent runaway (Q4_0 d can be negative) */ float d_seed = gguf_fp16_to_fp32(cand_d16[blk][cidx]); if (fabsf(d_new) < 4.0f * (fabsf(d_seed) + 1e-10f)) { uint16_t d16 = gguf_fp32_to_fp16(d_new); d_current = gguf_fp16_to_fp32(d16); } } } /* ── FP16 ULP neighborhood search + sign-flip exploration ── * The WLS solve found the continuous-optimal d. But FP16 truncation * may shift the optimum. Try ±4 ULP around d in FP16 space, plus * the negated scale, and pick the one with minimum reconstruction error. */ { uint16_t base_d16 = gguf_fp32_to_fp16(d_current); uint16_t best_d16 = base_d16; float best_ulp_err = 1e30f; /* Try ±4 ULP neighborhood + sign flip = up to 17 candidates */ uint16_t ulp_candidates[17]; int n_ulp = 0; for (int delta = -4; delta <= 4; delta++) { int cand16 = (int)base_d16 + delta; if (cand16 >= 0 && cand16 <= 0x7BFF) /* valid positive FP16 */ ulp_candidates[n_ulp++] = (uint16_t)cand16; } /* Sign-flipped d: negate and try ±0 ULP */ { float neg_d = -d_current; uint16_t neg_d16 = gguf_fp32_to_fp16(neg_d); ulp_candidates[n_ulp++] = neg_d16; } for (int ui = 0; ui < n_ulp; ui++) { float trial_d = gguf_fp16_to_fp32(ulp_candidates[ui]); float trial_id = (fabsf(trial_d) > 1e-15f) ? 1.0f / trial_d : 0.0f; float err = 0.0f; for (int j = 0; j < QK4_0; j++) { int q = (int)(bw[j] * trial_id + 8.5f); if (q < 0) q = 0; if (q > 15) q = 15; float deq = ((float)q - 8.0f) * trial_d; float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f; err += (bw[j] - deq) * (bw[j] - deq) * w; } if (err < best_ulp_err) { best_ulp_err = err; best_d16 = ulp_candidates[ui]; } } d_current = gguf_fp16_to_fp32(best_d16); } /* Store the extracted optimal FP16 scale */ output[blk].d = gguf_fp32_to_fp16(d_current); float actual_d = d_current; float id = (fabsf(actual_d) > 1e-15f) ? 1.0f / actual_d : 0.0f; /* ── D₆ Hadamard Error Shaping for Q4_0 ── * 32 elements per block = 5 full D₆ groups of 6 + 2 tail. * Apply the same antipodal fold as Q2_K: minimize vesica energy * to push quantization noise into wave (high-frequency) modes * that cancel in dot products. */ /* Step 1: Standard nearest-rounding as baseline */ int q_base[QK4_0], q_shaped[QK4_0]; float q_cont[QK4_0]; for (int j = 0; j < QK4_0; j++) { q_cont[j] = bw[j] * id + 8.0f; q_base[j] = (int)(q_cont[j] + 0.5f); if (q_base[j] < 0) q_base[j] = 0; if (q_base[j] > 15) q_base[j] = 15; } memcpy(q_shaped, q_base, QK4_0 * sizeof(int)); /* Step 2: D₆ greedy flipping on 5 groups of 6 */ for (int g = 0; g < 5; g++) { int g_off = g * 6; for (int pass = 0; pass < 6; pass++) { int best_k = -1; int best_q_alt = 0; float best_delta = 0.0f; /* Current group errors */ float e_cur[6]; for (int kk = 0; kk < 6; kk++) { float deq = ((float)q_shaped[g_off+kk] - 8.0f) * actual_d; e_cur[kk] = bw[g_off+kk] - deq; } /* Current D₆ metric: vesica energy + DC² */ float vesica_cur = 0.0f, dc_cur = 0.0f; for (int p = 0; p < 3; p++) { float v = e_cur[p] + e_cur[p+3]; vesica_cur += v * v; } for (int kk = 0; kk < 6; kk++) dc_cur += e_cur[kk]; float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur; /* Try flipping each element */ for (int k = 0; k < 6; k++) { int idx = g_off + k; int q_cur = q_shaped[idx]; int q_try; if (q_cont[idx] - (float)q_cur >= 0) { q_try = q_cur + 1; } else { q_try = q_cur - 1; } if (q_try < 0 || q_try > 15) continue; /* Alt errors */ float e_alt[6]; for (int kk = 0; kk < 6; kk++) e_alt[kk] = e_cur[kk]; float deq_try = ((float)q_try - 8.0f) * actual_d; e_alt[k] = bw[idx] - deq_try; /* Alt D₆ metric */ float vesica_alt = 0.0f, dc_alt = 0.0f; for (int p = 0; p < 3; p++) { float v = e_alt[p] + e_alt[p+3]; vesica_alt += v * v; } for (int kk = 0; kk < 6; kk++) dc_alt += e_alt[kk]; float metric_alt = 4.0f * vesica_alt + dc_alt * dc_alt; float delta = metric_cur - metric_alt; if (delta > best_delta) { best_delta = delta; best_k = k; best_q_alt = q_try; } } if (best_k < 0) break; q_shaped[g_off + best_k] = best_q_alt; } } /* Step 3: Error comparison — keep shaped only if MSE doesn't worsen >5% */ float err_base = 0.0f, err_shaped = 0.0f; for (int j = 0; j < QK4_0; j++) { float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f; float deq_b = ((float)q_base[j] - 8.0f) * actual_d; float deq_s = ((float)q_shaped[j] - 8.0f) * actual_d; err_base += (bw[j] - deq_b) * (bw[j] - deq_b) * w; err_shaped += (bw[j] - deq_s) * (bw[j] - deq_s) * w; } int *q_final = (err_shaped <= err_base * 1.05f) ? q_shaped : q_base; /* Pack nibbles and compute error */ for (int j = 0; j < QK4_0 / 2; j++) { int q0 = q_final[j]; int q1 = q_final[j + QK4_0/2]; output[blk].qs[j] = (uint8_t)(q0 | (q1 << 4)); float deq0 = ((float)q0 - 8.0f) * actual_d; float deq1 = ((float)q1 - 8.0f) * actual_d; total_err += (bw[j] - deq0) * (bw[j] - deq0) + (bw[j + QK4_0/2] - deq1) * (bw[j + QK4_0/2] - deq1); } } *out_total_error = total_err; free(greedy_d); free(cand_errors); free(cand_d16); free(best_candidate); } static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, BlockQ2K *output, float *out_total_error, OptimizerMode opt_mode, const float *imat_importance, int verbose) { int64_t n_blocks = n_elements / QK_K; float total_err = 0.0f; const int N_SUB = QK_K / 16; init_scale_table(); /* ══════════════════════════════════════════════════════════════════ * PHASE 1: Greedy quantization — produce seed (d, dmin) per block * ══════════════════════════════════════════════════════════════════ */ /* Store Phase A/B results for all blocks */ typedef struct { float dm, mm; /* greedy d, dmin (fp32) */ uint16_t d_fp16, dmin_fp16; /* greedy d, dmin (fp16) */ uint8_t Ls[16], Lm[16]; /* sub-block scale/min indices */ float scales[16], mins[16], sw[16]; } BlockSeed; BlockSeed *seeds = (BlockSeed *)calloc(n_blocks, sizeof(BlockSeed)); #pragma omp parallel for schedule(dynamic, 64) for (int64_t blk = 0; blk < n_blocks; blk++) { const float *block_x = weights + blk * QK_K; uint8_t L[QK_K], Laux[16]; float wt[16]; float sumx2 = 0; for (int i = 0; i < QK_K; i++) sumx2 += block_x[i] * block_x[i]; float sigma2 = sumx2 / (float)QK_K; for (int j = 0; j < N_SUB; j++) { const float *sx = block_x + 16 * j; seeds[blk].sw[j] = 0; for (int l = 0; l < 16; l++) { float imp = (imat_importance) ? imat_importance[blk * QK_K + 16 * j + l] : 1.0f; wt[l] = imp * sqrtf(sigma2 + sx[l] * sx[l]); seeds[blk].sw[j] += wt[l]; } seeds[blk].scales[j] = hpc_make_qkx2_quants(16, 3, sx, wt, L + 16 * j, &seeds[blk].mins[j], Laux); } seeds[blk].dm = hpc_make_qp_quants(N_SUB, 15, seeds[blk].scales, seeds[blk].Ls, seeds[blk].sw); seeds[blk].mm = hpc_make_qp_quants(N_SUB, 15, seeds[blk].mins, seeds[blk].Lm, seeds[blk].sw); seeds[blk].d_fp16 = gguf_fp32_to_fp16(seeds[blk].dm); seeds[blk].dmin_fp16 = gguf_fp32_to_fp16(seeds[blk].mm); } /* ══════════════════════════════════════════════════════════════════ * PHASE 2: WLS-Optimal Candidate Generation * * Instead of a fixed multiplier grid centered on greedy seeds, * we first solve a 3-iteration Weighted Least-Squares to find * the true optimal (d*, dmin*) per block, then generate the * 16×16 candidate grid centered on THOSE optimal values. * This makes the candidate space data-driven, not fabricated. * ══════════════════════════════════════════════════════════════════ */ /* Wide neighborhood around WLS optimum: ±20% with asymmetric spacing * — finer near 1.0 for precision, wider at edges for exploration. * Critical for large-σ weights where the optimal (d,dmin) may be * far from the WLS seed. */ static const float NEIGHBOR_MULTS_D[N_CAND_D] = { 0.800f, 0.850f, 0.890f, 0.920f, 0.945f, 0.965f, 0.980f, 0.990f, 1.010f, 1.020f, 1.035f, 1.055f, 1.080f, 1.110f, 1.150f, 1.200f }; static const float NEIGHBOR_MULTS_M[N_CAND_M] = { 0.800f, 0.850f, 0.890f, 0.920f, 0.945f, 0.965f, 0.980f, 0.990f, 1.010f, 1.020f, 1.035f, 1.055f, 1.080f, 1.110f, 1.150f, 1.200f }; /* Map 16 candidates → 6 quhit states for BP encoding */ static const int CAND_TO_QUHIT[16] = { 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5 }; /* candidate_errors[blk][256] — weighted MSE per candidate */ float (*candidate_errors)[TOTAL_SCALE_CANDIDATES] = NULL; uint16_t (*candidate_d)[TOTAL_SCALE_CANDIDATES] = NULL; uint16_t (*candidate_dmin)[TOTAL_SCALE_CANDIDATES] = NULL; /* Per-candidate Ls/Lm — must recompute for each (d, dmin) */ uint8_t (*candidate_Ls)[TOTAL_SCALE_CANDIDATES][16] = NULL; uint8_t (*candidate_Lm)[TOTAL_SCALE_CANDIDATES][16] = NULL; candidate_errors = (float (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks, sizeof(float[TOTAL_SCALE_CANDIDATES])); candidate_d = (uint16_t (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks, sizeof(uint16_t[TOTAL_SCALE_CANDIDATES])); candidate_dmin = (uint16_t (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks, sizeof(uint16_t[TOTAL_SCALE_CANDIDATES])); candidate_Ls = (uint8_t (*)[TOTAL_SCALE_CANDIDATES][16])calloc(n_blocks, sizeof(uint8_t[TOTAL_SCALE_CANDIDATES][16])); candidate_Lm = (uint8_t (*)[TOTAL_SCALE_CANDIDATES][16])calloc(n_blocks, sizeof(uint8_t[TOTAL_SCALE_CANDIDATES][16])); #pragma omp parallel for schedule(dynamic, 16) for (int64_t blk = 0; blk < n_blocks; blk++) { const float *block_x = weights + blk * QK_K; /* ── Step 2a: WLS solve to find optimal (d*, dmin*) ── * Seed from Phase 1 greedy, iterate 3× to converge. * Q2_K model: x[j,k] ≈ d × Ls[j] × q[j,k] - dmin × Lm[j] * This is a 2-variable WLS: minimize Σ w×(x - d×a + dmin×b)² */ float wls_dm = seeds[blk].dm; float wls_mm = seeds[blk].mm; uint8_t wls_Ls[16], wls_Lm[16]; memcpy(wls_Ls, seeds[blk].Ls, 16); memcpy(wls_Lm, seeds[blk].Lm, 16); for (int ls_iter = 0; ls_iter < 5; ls_iter++) { /* Quantize all elements at current (wls_dm, wls_mm) */ uint8_t L_wls[QK_K]; for (int j = 0; j < N_SUB; j++) { float d_sub = wls_dm * (float)wls_Ls[j]; float m_sub = wls_mm * (float)wls_Lm[j]; if (d_sub < 1e-15f) { for (int k = 0; k < 16; k++) L_wls[16*j+k] = 0; continue; } for (int k = 0; k < 16; k++) { int q = gguf_nearest_int((block_x[16*j+k] + m_sub) / d_sub); if (q < 0) q = 0; if (q > 3) q = 3; L_wls[16*j+k] = (uint8_t)q; } } /* Accumulate 2×2 normal equations */ double Saa = 0, Sab = 0, Sbb = 0, Sxa = 0, Sxb = 0; for (int j = 0; j < N_SUB; j++) { float ls_f = (float)wls_Ls[j]; float lm_f = (float)wls_Lm[j]; for (int k = 0; k < 16; k++) { float x = block_x[16*j+k]; float w = (imat_importance) ? imat_importance[blk * QK_K + 16*j+k] : 1.0f; float a = ls_f * (float)L_wls[16*j+k]; float b = lm_f; Saa += w * a * a; Sab += w * a * b; Sbb += w * b * b; Sxa += w * x * a; Sxb += w * x * b; } } /* Solve via Cramer's rule */ double det = Saa * Sbb - Sab * Sab; if (fabs(det) > 1e-30) { double d_new = (Sbb * Sxa - Sab * Sxb) / det; double dm_new = (Sab * Sxa - Saa * Sxb) / det; /* Clamp: positive and within 4× of seed (prevent runaway) */ if (d_new > 0.0 && d_new < 4.0 * (seeds[blk].dm + 1e-10)) wls_dm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_new)); if (dm_new > 0.0 && dm_new < 4.0 * (seeds[blk].mm + 1e-10)) wls_mm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)dm_new)); } /* Re-derive Ls/Lm for updated (d*, dmin*) */ for (int j = 0; j < N_SUB; j++) { if (wls_dm > 1e-15f) { int ls = gguf_nearest_int(seeds[blk].scales[j] / wls_dm); if (ls < 0) ls = 0; if (ls > 15) ls = 15; wls_Ls[j] = (uint8_t)ls; } else { wls_Ls[j] = 0; } if (wls_mm > 1e-15f) { int lm = gguf_nearest_int(seeds[blk].mins[j] / wls_mm); if (lm < 0) lm = 0; if (lm > 15) lm = 15; wls_Lm[j] = (uint8_t)lm; } else { wls_Lm[j] = 0; } } } /* ── Step 2b: Generate 16×16 candidates centered on WLS optimum ── * Grid is now centered on (wls_dm, wls_mm) not (greedy_dm, greedy_mm). * Tighter spacing because we're already near the true minimum. */ for (int di = 0; di < N_CAND_D; di++) { float trial_dm = wls_dm * NEIGHBOR_MULTS_D[di]; uint16_t trial_d16 = gguf_fp32_to_fp16(trial_dm); float actual_dm = gguf_fp16_to_fp32(trial_d16); for (int mi = 0; mi < N_CAND_M; mi++) { int cidx = di * N_CAND_M + mi; float trial_mm = wls_mm * NEIGHBOR_MULTS_M[mi]; uint16_t trial_dmin16 = gguf_fp32_to_fp16(trial_mm); float actual_mm = gguf_fp16_to_fp32(trial_dmin16); candidate_d[blk][cidx] = trial_d16; candidate_dmin[blk][cidx] = trial_dmin16; /* Recompute Ls/Lm for THIS candidate dm/mm */ uint8_t trial_Ls[16], trial_Lm[16]; for (int j = 0; j < N_SUB; j++) { if (actual_dm > 1e-15f) { int ls = gguf_nearest_int(seeds[blk].scales[j] / actual_dm); if (ls < 0) ls = 0; if (ls > 15) ls = 15; trial_Ls[j] = (uint8_t)ls; } else { trial_Ls[j] = 0; } if (actual_mm > 1e-15f) { int lm = gguf_nearest_int(seeds[blk].mins[j] / actual_mm); if (lm < 0) lm = 0; if (lm > 15) lm = 15; trial_Lm[j] = (uint8_t)lm; } else { trial_Lm[j] = 0; } } memcpy(candidate_Ls[blk][cidx], trial_Ls, 16); memcpy(candidate_Lm[blk][cidx], trial_Lm, 16); /* Fully re-quantize and measure weighted MSE */ float err = 0.0f; for (int j = 0; j < N_SUB; j++) { float d = actual_dm * (float)trial_Ls[j]; float m = actual_mm * (float)trial_Lm[j]; if (d < 1e-15f) { for (int k = 0; k < 16; k++) { float x = block_x[16 * j + k]; float w = (imat_importance) ? imat_importance[blk * QK_K + 16 * j + k] : 1.0f; err += x * x * w; } continue; } for (int k = 0; k < 16; k += 6) { int g_len = (k + 6 <= 16) ? 6 : (16 - k); int half_g = g_len / 2; float e_cur[6], w_cur[6]; for (int kk = 0; kk < g_len; kk++) { int idx = 16 * j + k + kk; float x = block_x[idx]; int q = gguf_nearest_int((x + m) / d); if (q < 0) q = 0; if (q > 3) q = 3; float deq = d * (float)q - m; e_cur[kk] = x - deq; w_cur[kk] = (imat_importance) ? imat_importance[blk * QK_K + idx] : 1.0f; } /* Decompose into vesica and wave */ float vesica_err = 0.0f, wave_err = 0.0f; for (int p = 0; p < half_g; p++) { float v = e_cur[p] + e_cur[p + half_g]; float w_wave = e_cur[p] - e_cur[p + half_g]; float w_avg = (w_cur[p] + w_cur[p + half_g]) * 0.5f; vesica_err += v * v * w_avg; wave_err += w_wave * w_wave * w_avg; } /* Triality weighting: penalize vesica 4×, wave 1× */ err += 0.5f * (4.0f * vesica_err + 1.0f * wave_err); } } candidate_errors[blk][cidx] = err; } } } /* ══════════════════════════════════════════════════════════════════ * PHASE 3: HPC Graph — Shor's Griffiths-Niu Measurement * * Build a multi-quhit graph where each block has 2 quhits * encoding the 36 candidate errors. Shor's sequential measurement * (IDFT6 + feed-forward + collapse/back-action) extracts exact * marginals for optimal (d, dmin) per block — replaces BP. * ══════════════════════════════════════════════════════════════════ */ /* Default: use greedy candidate (index 5*10+5 = 55, mult 1.00×1.00) */ int *best_candidate = (int *)malloc(n_blocks * sizeof(int)); for (int64_t i = 0; i < n_blocks; i++) best_candidate[i] = 10 * N_CAND_M + 10; /* NEIGHBOR_MULTS_D[10]=1.00, _M[10]=1.00 */ if (opt_mode != OPT_MSE && n_blocks >= 2) { int64_t graph_blocks = (n_blocks > 2000) ? 2000 : n_blocks; int64_t stride = n_blocks / graph_blocks; float temperature = 0.5f; int64_t n_sites = graph_blocks * QUHITS_PER_BLOCK; HPCGraph *graph = hpc_create(n_sites); if (graph) { for (int64_t i = 0; i < n_sites; i++) triality_dft(&graph->locals[i]); /* Encode each stride group's AGGREGATED candidate errors as dual-quhit * amplitudes. For stride > 1, average errors across ALL blocks in * the group — not just the first block. This is critical for large * tensors where stride=97 means 96/97 blocks were being ignored. */ /* Compute adaptive temperature from median error spread. * This ensures the Boltzmann encoding produces meaningful distributions * regardless of weight magnitude (σ=0.0003 vs σ=0.024). */ { double err_accum = 0.0; int err_count = 0; for (int64_t gi = 0; gi < graph_blocks && gi < 100; gi++) { int64_t blk = gi * stride; float max_e = 0.0f; for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) if (candidate_errors[blk][c] > max_e) max_e = candidate_errors[blk][c]; err_accum += (double)max_e; err_count++; } if (err_count > 0) { float median_err = (float)(err_accum / err_count); /* Temperature = 10% of median max error — sharp enough to * discriminate, soft enough for Shor interference */ temperature = median_err * 0.1f; if (temperature < 1e-10f) temperature = 1e-10f; } } for (int64_t i = 0; i < graph_blocks; i++) { /* Aggregate errors across entire stride group */ float agg_errors[TOTAL_SCALE_CANDIDATES]; for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) agg_errors[c] = 0.0f; int64_t blk_start = i * stride; int64_t blk_end = blk_start + stride; if (blk_end > n_blocks) blk_end = n_blocks; int64_t group_size = blk_end - blk_start; for (int64_t b = blk_start; b < blk_end; b++) { for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) agg_errors[c] += candidate_errors[b][c]; } /* Average across group */ if (group_size > 1) { float inv_gs = 1.0f / (float)group_size; for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) agg_errors[c] *= inv_gs; } float min_err = 1e30f; for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) if (agg_errors[c] < min_err) min_err = agg_errors[c]; /* Quhit 0 (coarse = d dimension): marginalize over dmin */ double coarse_re[6]; double coarse_norm = 0.0; for (int qi = 0; qi < 6; qi++) coarse_re[qi] = 0.0; for (int di = 0; di < N_CAND_D; di++) { int qi = CAND_TO_QUHIT[di]; for (int mi = 0; mi < N_CAND_M; mi++) { int cidx = di * N_CAND_M + mi; coarse_re[qi] += exp(-(double)(agg_errors[cidx] - min_err) / (2.0 * (double)temperature)); } } for (int qi = 0; qi < 6; qi++) coarse_norm += coarse_re[qi] * coarse_re[qi]; if (coarse_norm > 1e-30) { double inv = 1.0 / sqrt(coarse_norm); for (int v = 0; v < 6; v++) coarse_re[v] *= inv; } /* Quhit 1 (fine = dmin dimension): marginalize over d */ double fine_re[6]; double fine_norm = 0.0; for (int qi = 0; qi < 6; qi++) fine_re[qi] = 0.0; for (int mi = 0; mi < N_CAND_M; mi++) { int qi = CAND_TO_QUHIT[mi]; for (int di = 0; di < N_CAND_D; di++) { int cidx = di * N_CAND_M + mi; fine_re[qi] += exp(-(double)(agg_errors[cidx] - min_err) / (2.0 * (double)temperature)); } } for (int qi = 0; qi < 6; qi++) fine_norm += fine_re[qi] * fine_re[qi]; if (fine_norm > 1e-30) { double inv = 1.0 / sqrt(fine_norm); for (int v = 0; v < 6; v++) fine_re[v] *= inv; } /* Write quhits */ int64_t s0 = 2 * i, s1 = 2 * i + 1; for (int v = 0; v < 6; v++) { graph->locals[s0].edge_re[v] = coarse_re[v]; graph->locals[s0].edge_im[v] = 0.0; graph->locals[s1].edge_re[v] = fine_re[v]; graph->locals[s1].edge_im[v] = 0.0; } graph->locals[s0].primary = VIEW_EDGE; graph->locals[s0].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED; graph->locals[s0].delta_valid = 0; triality_update_mask(&graph->locals[s0]); graph->locals[s1].primary = VIEW_EDGE; graph->locals[s1].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED; graph->locals[s1].delta_valid = 0; triality_update_mask(&graph->locals[s1]); } /* Build edges */ for (int64_t i = 0; i < graph_blocks; i++) { hpc_cz(graph, 2 * i, 2 * i + 1); /* intra-block: d ↔ dmin */ if (i + 1 < graph_blocks) { hpc_cz(graph, 2 * i, 2 * (i + 1)); /* d ↔ d neighbor */ hpc_cz(graph, 2 * i + 1, 2 * (i + 1) + 1); /* dmin ↔ dmin */ } } /* ── Shor's Griffiths-Niu Sequential Measurement (dual quhit) ── * Replaces BP with exact marginals via IDFT6 + feed-forward + * collapse/back-action (ported 1:1 from tesseract_factor.c). * * The dual-quhit graph has 2×graph_blocks sites: * Even sites (s0 = 2*i): coarse (d dimension) * Odd sites (s1 = 2*i+1): fine (dmin dimension) * * Single-pass sequential measurement produces exact marginals * for both dimensions simultaneously through the CZ correlations. */ double (*shor_marg)[6] = (double (*)[6])calloc(n_sites, sizeof(double[6])); int *shor_measured = (int *)calloc(n_sites, sizeof(int)); shor_measure_graph(graph, n_sites, shor_marg, shor_measured, 1); /* Extract coarse (d) and fine (dmin) marginals from Shor output */ double (*coarse_marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6])); double (*fine_marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6])); for (int64_t i = 0; i < graph_blocks; i++) { for (int v = 0; v < 6; v++) { coarse_marg[i][v] = shor_marg[2 * i][v]; fine_marg[i][v] = shor_marg[2 * i + 1][v]; } } free(shor_marg); free(shor_measured); /* ══ Hensel-Inspired Beam Search Constraint Propagation ══ * Like tesseract_factor's Hensel lift: process blocks sequentially, * maintain K best configurations, prune by accumulated error. * * The constraint: blocks are selected JOINTLY. */ #define N_BEAMS 24 /* K beams — widened for 31B (was 12) */ typedef struct { double acc_error; int history_idx; /* index into the backpointer array */ } QuantBeam; typedef struct { int cand_idx; int parent_idx; } BeamHistory; QuantBeam beams[N_BEAMS]; int active_beams = 1; /* Pre-allocate history to avoid O(N^2) memory copies */ BeamHistory *history = (BeamHistory *)malloc(n_blocks * N_BEAMS * sizeof(BeamHistory)); for (int b = 0; b < N_BEAMS; b++) { beams[b].acc_error = 0.0; beams[b].history_idx = -1; } /* Process blocks sequentially with beam search */ for (int64_t i = 0; i < graph_blocks; i++) { double c_total = 0.0, f_total = 0.0; for (int v = 0; v < 6; v++) { c_total += coarse_marg[i][v]; f_total += fine_marg[i][v]; } /* Candidate scores for this block: triality prob × (1/normalized_error) */ double cand_score[TOTAL_SCALE_CANDIDATES]; int64_t blk = i * stride; int d_bin_count[6] = {0}, m_bin_count[6] = {0}; for (int k = 0; k < N_CAND_D; k++) d_bin_count[CAND_TO_QUHIT[k]]++; for (int k = 0; k < N_CAND_M; k++) m_bin_count[CAND_TO_QUHIT[k]]++; /* Per-block error normalization: divide by block mean error * so small-weight blocks don't dominate beam selection */ float blk_mean_err = 0.0f; for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) blk_mean_err += candidate_errors[blk][c]; blk_mean_err /= (float)TOTAL_SCALE_CANDIDATES; if (blk_mean_err < 1e-30f) blk_mean_err = 1e-30f; for (int di = 0; di < N_CAND_D; di++) { int qi_d = CAND_TO_QUHIT[di]; double p_d = (c_total > 1e-30) ? coarse_marg[i][qi_d] / c_total : 1.0/6.0; p_d /= (double)d_bin_count[qi_d]; for (int mi = 0; mi < N_CAND_M; mi++) { int qi_m = CAND_TO_QUHIT[mi]; double p_m = (f_total > 1e-30) ? fine_marg[i][qi_m] / f_total : 1.0/6.0; p_m /= (double)m_bin_count[qi_m]; int cidx = di * N_CAND_M + mi; cand_score[cidx] = p_d * p_m / (candidate_errors[blk][cidx] / blk_mean_err + 1e-15); } } /* Extend beams × 36 candidates, keep top K */ typedef struct { double score; int beam_idx; int cand_idx; } BeamExt; BeamExt extensions[N_BEAMS * TOTAL_SCALE_CANDIDATES]; int n_ext = 0; for (int b = 0; b < active_beams; b++) { for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) { /* Score = -(accumulated_error + this_block_error) × triality_prob */ double ext_err = beams[b].acc_error + candidate_errors[blk][c]; double ext_score = cand_score[c] / (ext_err + 1e-15); extensions[n_ext].score = ext_score; extensions[n_ext].beam_idx = b; extensions[n_ext].cand_idx = c; n_ext++; } } /* Top-K selection */ int top_k = (n_ext < N_BEAMS) ? n_ext : N_BEAMS; int top_indices[N_BEAMS]; for (int k = 0; k < top_k; k++) { int best = -1; double best_s = -1e30; for (int e = 0; e < n_ext; e++) { if (extensions[e].score > best_s) { best_s = extensions[e].score; best = e; } } top_indices[k] = best; extensions[best].score = -2e30; /* poison */ } /* Build new beams from top-K extensions using backpointers */ QuantBeam new_beams[N_BEAMS]; for (int k = 0; k < top_k; k++) { int ext_idx = top_indices[k]; int src_beam = extensions[ext_idx].beam_idx; int cand = extensions[ext_idx].cand_idx; int hist_idx = i * N_BEAMS + k; history[hist_idx].cand_idx = cand; history[hist_idx].parent_idx = beams[src_beam].history_idx; new_beams[k].history_idx = hist_idx; new_beams[k].acc_error = beams[src_beam].acc_error + candidate_errors[blk][cand]; } for (int k = 0; k < top_k; k++) beams[k] = new_beams[k]; active_beams = top_k; } /* Trace back the best beam's selections. * The beam search selects one candidate per GRAPH NODE (stride group). * For stride > 1, each block within the stride group independently * picks its own best candidate — using the beam's coarse/fine quhit * bins as a constraint, but evaluating its own candidate_errors. * This eliminates stride-aliasing: previously 96/97 blocks were * forced to use a candidate chosen for 1 representative block. */ int curr_hist = beams[0].history_idx; for (int64_t i = graph_blocks - 1; i >= 0; i--) { int group_cidx; if (curr_hist >= 0) { group_cidx = history[curr_hist].cand_idx; curr_hist = history[curr_hist].parent_idx; } else { group_cidx = 10 * N_CAND_M + 10; } if (stride <= 1) { /* No stride group — direct assignment */ best_candidate[i] = group_cidx; } else { /* Per-block local optimization within the stride group. * The beam-selected candidate determines the target quhit * bins (d_bin, dmin_bin). Each block picks its own best * candidate that falls in compatible bins, or falls back * to the globally best candidate for that block. */ int group_di = group_cidx / N_CAND_M; int group_mi = group_cidx % N_CAND_M; int target_d_bin = CAND_TO_QUHIT[group_di]; int target_m_bin = CAND_TO_QUHIT[group_mi]; for (int64_t b = i * stride; b < (i+1) * stride && b < n_blocks; b++) { /* Find best candidate in same quhit bins */ float best_err = 1e30f; int best_c = group_cidx; for (int di = 0; di < N_CAND_D; di++) { if (CAND_TO_QUHIT[di] != target_d_bin) continue; for (int mi = 0; mi < N_CAND_M; mi++) { if (CAND_TO_QUHIT[mi] != target_m_bin) continue; int cidx = di * N_CAND_M + mi; if (candidate_errors[b][cidx] < best_err) { best_err = candidate_errors[b][cidx]; best_c = cidx; } } } /* Also check if the block's overall best is significantly * better — if so, use it (greedy override) */ float global_best = 1e30f; int global_best_c = group_cidx; for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) { if (candidate_errors[b][c] < global_best) { global_best = candidate_errors[b][c]; global_best_c = c; } } /* Use bin-constrained choice unless the global best * is >5% better — preserves Shor coherence while * allowing escape from bad bin assignments */ if (global_best < best_err * 0.95f) best_candidate[b] = global_best_c; else best_candidate[b] = best_c; } } } free(history); /* ══════════════════════════════════════════════════════════════ * Phase 3.5: Born-Rule Multi-Shot Scale Refinement (Q2_K) * * 2D Born sampling: sample coarse quhit (d dimension) and * fine quhit (dmin dimension) jointly from triality marginals. * Each shot produces a (d_idx, dmin_idx) pair per block. * ══════════════════════════════════════════════════════════════ */ { #define Q2K_BORN_SHOTS 64 float beam_total_err = 0.0f; for (int64_t bi = 0; bi < n_blocks; bi++) beam_total_err += candidate_errors[bi][best_candidate[bi]]; unsigned int born_rng_q2 = 271828; /* Compute tail error once (blocks beyond graph coverage) */ float tail_err = 0.0f; for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++) tail_err += candidate_errors[bi][best_candidate[bi]]; /* Sparse shot buffer: only track stride-sampled blocks */ int *shot_sparse = (int *)malloc(graph_blocks * sizeof(int)); for (int shot = 0; shot < Q2K_BORN_SHOTS; shot++) { float shot_err = tail_err; for (int64_t gi = 0; gi < graph_blocks; gi++) { /* Born sample coarse (d) quhit */ double c_total = 0.0; for (int v = 0; v < 6; v++) c_total += coarse_marg[gi][v]; born_rng_q2 = born_rng_q2 * 1664525u + 1013904223u; double rnd_c = (double)(born_rng_q2 >> 8) / 16777216.0; double target_c = rnd_c * c_total; double cum_c = 0.0; int qi_d = 5; for (int v = 0; v < 6; v++) { cum_c += coarse_marg[gi][v]; if (cum_c > target_c) { qi_d = v; break; } } /* Born sample fine (dmin) quhit */ double f_total = 0.0; for (int v = 0; v < 6; v++) f_total += fine_marg[gi][v]; born_rng_q2 = born_rng_q2 * 1664525u + 1013904223u; double rnd_f = (double)(born_rng_q2 >> 8) / 16777216.0; double target_f = rnd_f * f_total; double cum_f = 0.0; int qi_m = 5; for (int v = 0; v < 6; v++) { cum_f += fine_marg[gi][v]; if (cum_f > target_f) { qi_m = v; break; } } /* Find best candidate within the sampled (d_bin, m_bin) */ int64_t blk = gi * stride; float best_bin_err = 1e30f; int best_bin_cand = 10 * N_CAND_M + 10; for (int di = 0; di < N_CAND_D; di++) { if (CAND_TO_QUHIT[di] != qi_d) continue; for (int mi = 0; mi < N_CAND_M; mi++) { if (CAND_TO_QUHIT[mi] != qi_m) continue; int cidx = di * N_CAND_M + mi; if (candidate_errors[blk][cidx] < best_bin_err) { best_bin_err = candidate_errors[blk][cidx]; best_bin_cand = cidx; } } } shot_sparse[gi] = best_bin_cand; shot_err += candidate_errors[blk][best_bin_cand]; } if (shot_err < beam_total_err) { /* Only now apply the sparse updates to best_candidate */ for (int64_t gi = 0; gi < graph_blocks; gi++) best_candidate[gi * stride] = shot_sparse[gi]; beam_total_err = shot_err; } } free(shot_sparse); } free(coarse_marg); free(fine_marg); hpc_destroy(graph); } } else { /* OPT_MSE or single block: pick candidate with lowest raw error */ for (int64_t blk = 0; blk < n_blocks; blk++) { float best_err = candidate_errors[blk][0]; int best_idx = 0; for (int c = 1; c < TOTAL_SCALE_CANDIDATES; c++) { if (candidate_errors[blk][c] < best_err) { best_err = candidate_errors[blk][c]; best_idx = c; } } best_candidate[blk] = best_idx; } } /* ══════════════════════════════════════════════════════════════════ * PHASE 4: Assemble blocks via least-squares (d, dmin) extraction * * Like Q4_0's CF analog: the beam search / Born shots selected a * grid candidate (d_grid, dmin_grid). Now we EXTRACT the exact * optimal FP16 (d, dmin) via weighted least-squares, holding the * sub-block Ls/Lm and quantized levels fixed. * * Q2_K model: x[j,k] ≈ d × Ls[j] × q[j,k] - dmin × Lm[j] * * Full analog assembly: at each iteration, EXHAUSTIVELY search * all 16×16 = 256 possible (Ls[j], Lm[j]) pairs per sub-block * to find the assignment that minimizes weighted reconstruction * error. Then WLS-solve for the global (d, dmin). Repeat 5×. * * This guarantees every parameter is at its conditional optimum — * the perfect bit analog at 2-bit resolution. * ══════════════════════════════════════════════════════════════════ */ /* Pre-allocate one HPCGraph per OMP thread for sub-block Shor measurement. * This eliminates ~776K malloc/free cycles from the inner loop. * Each thread reuses its graph via hpc_reset_for_subblock(). */ int _n_omp_threads = 1; #ifdef _OPENMP _n_omp_threads = omp_get_max_threads(); #endif HPCGraph **_tl_graphs = (HPCGraph **)calloc(_n_omp_threads, sizeof(HPCGraph *)); for (int _ti = 0; _ti < _n_omp_threads; _ti++) _tl_graphs[_ti] = hpc_create(N_SUB); #pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err) for (int64_t blk = 0; blk < n_blocks; blk++) { const float *block_x = weights + blk * QK_K; int cidx = best_candidate[blk]; uint8_t Ls_blk[16], Lm_blk[16]; /* Start from HPC-selected candidate */ memcpy(Ls_blk, candidate_Ls[blk][cidx], 16); memcpy(Lm_blk, candidate_Lm[blk][cidx], 16); float dm = gguf_fp16_to_fp32(candidate_d[blk][cidx]); float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]); /* ── Analog assembly: iterate to convergence ── * 3 iterations: the (Ls,Lm) ↔ (d,dmin) coupling stabilizes * after 2-3 passes. Additional iterations produce negligible * change in the committed FP16 values. * A) Sub-block Shor measurement to find coupled (Ls,Lm) states * B) Optimal q-value assignment * C) WLS solve for (d, dmin) */ for (int ls_iter = 0; ls_iter < 3; ls_iter++) { /* ── Step A: Sub-block Quhit BP (Strategy 1) ── * For each sub-block j, evaluate all 256 (Ls, Lm) pairs. * Keep the 6 best pairs as quhit states for a 16-node graph. * Run BP to jointly select the globally optimal (Ls, Lm). */ uint8_t state_ls[N_SUB][6]; uint8_t state_lm[N_SUB][6]; float state_err[N_SUB][6]; for (int j = 0; j < N_SUB; j++) { const float *sx = block_x + 16 * j; for (int v = 0; v < 6; v++) state_err[j][v] = 1e30f; for (int try_ls = 0; try_ls <= 15; try_ls++) { float d_sub = dm * (float)try_ls; for (int try_lm = 0; try_lm <= 15; try_lm++) { float m_sub = mm * (float)try_lm; float sub_err = 0.0f; for (int k = 0; k < 16; k++) { float x = sx[k]; float w = (imat_importance) ? imat_importance[blk * QK_K + 16*j + k] : 1.0f; int q = 0; if (d_sub >= 1e-15f) { q = gguf_nearest_int((x + m_sub) / d_sub); if (q < 0) q = 0; if (q > 3) q = 3; } float deq = d_sub * (float)q - m_sub; float diff = x - deq; sub_err += diff * diff * w; } /* Insert into top 6 */ for (int v = 0; v < 6; v++) { if (sub_err < state_err[j][v]) { for (int u = 5; u > v; u--) { state_err[j][u] = state_err[j][u-1]; state_ls[j][u] = state_ls[j][u-1]; state_lm[j][u] = state_lm[j][u-1]; } state_err[j][v] = sub_err; state_ls[j][v] = (uint8_t)try_ls; state_lm[j][v] = (uint8_t)try_lm; break; } } } } } /* Reset thread-local sub-block graph (zero allocations) */ int _tid = 0; #ifdef _OPENMP _tid = omp_get_thread_num(); #endif HPCGraph *sg = _tl_graphs[_tid]; hpc_reset_for_subblock(sg, N_SUB); { float min_sub_err[N_SUB]; for (int j = 0; j < N_SUB; j++) min_sub_err[j] = state_err[j][0]; /* Initialize unary potentials from local errors */ for (int j = 0; j < N_SUB; j++) { triality_dft(&sg->locals[j]); double amp_re[6]; double amp_norm = 0.0; for (int v = 0; v < 6; v++) { /* Adaptive temperature: scale with local error spread * so Shor measurement produces meaningful interference * patterns regardless of weight magnitude */ float err_spread = state_err[j][5] - state_err[j][0]; float sub_temp = (err_spread > 1e-15f) ? err_spread * 0.3f : 0.1f; if (sub_temp < 1e-12f) sub_temp = 1e-12f; amp_re[v] = exp(-(double)(state_err[j][v] - min_sub_err[j]) / (double)sub_temp); amp_norm += amp_re[v] * amp_re[v]; } if (amp_norm > 1e-30) { double inv = 1.0 / sqrt(amp_norm); for (int v = 0; v < 6; v++) amp_re[v] *= inv; } for (int v = 0; v < 6; v++) { sg->locals[j].edge_re[v] = amp_re[v]; sg->locals[j].edge_im[v] = 0.0; } sg->locals[j].primary = VIEW_EDGE; sg->locals[j].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED; sg->locals[j].delta_valid = 0; triality_update_mask(&sg->locals[j]); } /* Add coupling edges between adjacent sub-blocks */ for (int j = 0; j < N_SUB - 1; j++) hpc_cz(sg, j, j + 1); /* ── Shor sequential measurement on sub-block graph ── * Stack-allocated arrays: eliminates 2 calloc/free per iteration */ double sub_marg[N_SUB][6]; int sub_measured[N_SUB]; memset(sub_marg, 0, sizeof(sub_marg)); memset(sub_measured, 0, sizeof(sub_measured)); shor_measure_graph(sg, N_SUB, sub_marg, sub_measured, 1); /* Extract optimal Ls/Lm from Shor marginals */ for (int j = 0; j < N_SUB; j++) { double best_prob = -1.0; int best_v = 0; for (int v = 0; v < 6; v++) { if (sub_marg[j][v] > best_prob) { best_prob = sub_marg[j][v]; best_v = v; } } Ls_blk[j] = state_ls[j][best_v]; Lm_blk[j] = state_lm[j][best_v]; } } /* ── Step B: Quantize q-values with optimal Ls/Lm ── */ uint8_t L[QK_K]; for (int j = 0; j < N_SUB; j++) { float d_sub = dm * (float)Ls_blk[j]; float m_sub = mm * (float)Lm_blk[j]; if (d_sub < 1e-15f) { for (int k = 0; k < 16; k++) L[16*j+k] = 0; continue; } for (int k = 0; k < 16; k++) { int q = gguf_nearest_int((block_x[16*j+k] + m_sub) / d_sub); if (q < 0) q = 0; if (q > 3) q = 3; L[16*j+k] = (uint8_t)q; } } /* ── Step C: WLS solve for (d, dmin) ── * x[j,k] ≈ d × Ls[j] × q[j,k] - dmin × Lm[j] * Let a = Ls[j]×q[j,k], b = Lm[j] * Normal equations via Cramer's rule */ double Saa = 0, Sab = 0, Sbb = 0, Sxa = 0, Sxb = 0; for (int j = 0; j < N_SUB; j++) { float ls_f = (float)Ls_blk[j]; float lm_f = (float)Lm_blk[j]; for (int k = 0; k < 16; k++) { float x = block_x[16*j+k]; float w = (imat_importance) ? imat_importance[blk * QK_K + 16*j+k] : 1.0f; float a = ls_f * (float)L[16*j+k]; float b = lm_f; Saa += w * a * a; Sab += w * a * b; Sbb += w * b * b; Sxa += w * x * a; Sxb += w * x * b; } } double det = Saa * Sbb - Sab * Sab; if (fabs(det) > 1e-30) { double d_new = (Sbb * Sxa - Sab * Sxb) / det; double dm_new = (Sab * Sxa - Saa * Sxb) / det; /* Clamp: positive and within 4× of candidate seed */ float d_seed = gguf_fp16_to_fp32(candidate_d[blk][cidx]); float m_seed = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]); if (d_new > 0.0 && d_new < 4.0 * (d_seed + 1e-10)) dm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_new)); if (dm_new > 0.0 && dm_new < 4.0 * (m_seed + 1e-10)) mm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)dm_new)); } if (isnan(dm) || isnan(mm)) { printf("NaN detected before ULP: dm=%f mm=%f det=%f\n", dm, mm, det); exit(1); } } /* ── FP16 ULP neighborhood search for (d, dmin) ── * The WLS solve found continuous-optimal (d, dmin). But FP16 * truncation may shift the optimum. Try ±4 ULP around both * d and dmin, pick the pair with minimum reconstruction error. */ { uint16_t base_d16 = gguf_fp32_to_fp16(dm); uint16_t base_m16 = gguf_fp32_to_fp16(mm); uint16_t best_d16 = base_d16, best_m16 = base_m16; float best_ulp_err = 1e30f; for (int dd = -2; dd <= 2; dd++) { int cd16 = (int)base_d16 + dd; if (cd16 < 0 || cd16 > 0x7BFF) continue; float trial_dm = gguf_fp16_to_fp32((uint16_t)cd16); for (int dm_delta = -2; dm_delta <= 2; dm_delta++) { int cm16 = (int)base_m16 + dm_delta; if (cm16 < 0 || cm16 > 0x7BFF) continue; float trial_mm = gguf_fp16_to_fp32((uint16_t)cm16); float err = 0.0f; for (int j = 0; j < N_SUB; j++) { float d_sub = trial_dm * (float)Ls_blk[j]; float m_sub = trial_mm * (float)Lm_blk[j]; for (int k = 0; k < 16; k++) { float x = block_x[16*j+k]; float w = (imat_importance) ? imat_importance[blk * QK_K + 16*j+k] : 1.0f; int q; if (d_sub < 1e-15f) { q = 0; } else { q = gguf_nearest_int((x + m_sub) / d_sub); if (q < 0) q = 0; if (q > 3) q = 3; } float deq = d_sub * (float)q - m_sub; float diff = x - deq; err += diff * diff * w; } } if (err < best_ulp_err) { best_ulp_err = err; best_d16 = (uint16_t)cd16; best_m16 = (uint16_t)cm16; } } } dm = gguf_fp16_to_fp32(best_d16); mm = gguf_fp16_to_fp32(best_m16); } /* ── Final Ls/Lm re-optimization at committed FP16 (d, dmin) ── * The WLS solve may have shifted (d, dmin) after the last Step A. * Neighborhood search ±2 around current values (25 pairs vs 256) * is sufficient since WLS shifts are typically < 1 Ls/Lm step. */ for (int j = 0; j < N_SUB; j++) { const float *sx = block_x + 16 * j; float best_sub_err = 1e30f; uint8_t best_ls = Ls_blk[j], best_lm = Lm_blk[j]; int ls_lo = (Ls_blk[j] > 2) ? Ls_blk[j] - 2 : 0; int ls_hi = (Ls_blk[j] < 13) ? Ls_blk[j] + 2 : 15; int lm_lo = (Lm_blk[j] > 2) ? Lm_blk[j] - 2 : 0; int lm_hi = (Lm_blk[j] < 13) ? Lm_blk[j] + 2 : 15; for (int try_ls = ls_lo; try_ls <= ls_hi; try_ls++) { float d_sub = dm * (float)try_ls; for (int try_lm = lm_lo; try_lm <= lm_hi; try_lm++) { float m_sub = mm * (float)try_lm; float sub_err = 0.0f; for (int k = 0; k < 16; k++) { float x = sx[k]; float w = (imat_importance) ? imat_importance[blk * QK_K + 16*j + k] : 1.0f; int q; if (d_sub < 1e-15f) { q = 0; } else { q = gguf_nearest_int((x + m_sub) / d_sub); if (q < 0) q = 0; if (q > 3) q = 3; } float deq = d_sub * (float)q - m_sub; float diff = x - deq; sub_err += diff * diff * w; } if (sub_err < best_sub_err) { best_sub_err = sub_err; best_ls = (uint8_t)try_ls; best_lm = (uint8_t)try_lm; } } } Ls_blk[j] = best_ls; Lm_blk[j] = best_lm; } /* Store the extracted optimal FP16 (d, dmin) */ output[blk].d = gguf_fp32_to_fp16(dm); output[blk].dmin = gguf_fp32_to_fp16(mm); for (int j = 0; j < N_SUB; j++) output[blk].scales[j] = Ls_blk[j] | (Lm_blk[j] << 4); /* ── Final quantization with D₆ Hadamard Error Shaping ── * * Standard Q2_K rounds each weight independently: q = round((x+m)/d). * But within a sub-block, weights share (d, m), so their quantization * errors are CORRELATED. Independent rounding is suboptimal. * * The D₆ fold (antipodal Hadamard from the triality quhit) decomposes * the error vector into vesica (sum) and wave (difference) components: * vesica[k] = (e[k] + e[k+3]) / √2 — DC-like, accumulates in dot products * wave[k] = (e[k] - e[k+3]) / √2 — noise-like, cancels in dot products * * We WANT large wave error and small vesica error. So we greedily * flip rounding decisions (floor↔ceil) to minimize vesica energy, * even if total element-wise error increases slightly. * * Process: 16 elements per sub-block, treat as 2 groups of 6 + 4 tail. * Apply DFT₆-fold to each group of 6, minimize vesica component. */ uint8_t L[QK_K]; for (int j = 0; j < N_SUB; j++) { float d = dm * (float)(output[blk].scales[j] & 0xF); if (d < 1e-15f) { for (int k = 0; k < 16; k++) L[16 * j + k] = 0; continue; } float m = mm * (float)(output[blk].scales[j] >> 4); float id = 1.0f / d; /* Step 1: Standard nearest-rounding as baseline */ int q_base[16]; float q_cont[16]; /* continuous q values before rounding */ for (int k = 0; k < 16; k++) { q_cont[k] = (block_x[16*j+k] + m) * id; q_base[k] = gguf_nearest_int(q_cont[k]); if (q_base[k] < 0) q_base[k] = 0; if (q_base[k] > 3) q_base[k] = 3; } /* Step 2: D₆ Hadamard Error Shaping * For each 6-element group, greedily flip the rounding decision * that most reduces the D₆-folded vesica error component. * * D₆ fold on 6-element groups: antipodal pairs (0,3), (1,4), (2,5) * vesica[k] = e[k] + e[k+3] (k=0,1,2) — DC-like, propagates * wave[k] = e[k] - e[k+3] (k=0,1,2) — noise-like, cancels * * Weight vesica 4× over wave + penalize DC (sum of all 6 errors) */ int q_shaped[16]; memcpy(q_shaped, q_base, 16 * sizeof(int)); /* Process groups: [0..5], [6..11], tail [12..15] handled by D₆ metric on available pairs */ for (int g = 0; g < 2; g++) { int g_off = g * 6; if (g_off + 5 >= 16) break; /* Multiple greedy passes — each pass finds the single best flip */ for (int pass = 0; pass < 6; pass++) { int best_k = -1; int best_q_alt = 0; float best_delta = 0.0f; /* improvement = current_metric - alt_metric */ /* Compute current group errors */ float e_cur[6]; for (int kk = 0; kk < 6; kk++) { int ii = g_off + kk; float deq = d * (float)q_shaped[ii] - m; e_cur[kk] = block_x[16*j+ii] - deq; } /* Current D₆ metric: vesica energy + DC² */ float vesica_cur = 0.0f, dc_cur = 0.0f; for (int p = 0; p < 3; p++) { float v = e_cur[p] + e_cur[p+3]; vesica_cur += v * v; } for (int kk = 0; kk < 6; kk++) dc_cur += e_cur[kk]; float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur; /* Try flipping each element */ for (int k = 0; k < 6; k++) { int idx = g_off + k; int q_cur = q_shaped[idx]; /* Try the alternative rounding */ int q_try; if (q_cont[idx] - (float)q_cur >= 0) { q_try = q_cur + 1; } else { q_try = q_cur - 1; } if (q_try < 0 || q_try > 3) continue; /* Compute alt errors (only element k changes) */ float e_alt[6]; for (int kk = 0; kk < 6; kk++) e_alt[kk] = e_cur[kk]; float deq_try = d * (float)q_try - m; e_alt[k] = block_x[16*j+idx] - deq_try; /* Alt D₆ metric */ float vesica_alt = 0.0f, dc_alt = 0.0f; for (int p = 0; p < 3; p++) { float v = e_alt[p] + e_alt[p+3]; vesica_alt += v * v; } for (int kk = 0; kk < 6; kk++) dc_alt += e_alt[kk]; float metric_alt = 4.0f * vesica_alt + dc_alt * dc_alt; float delta = metric_cur - metric_alt; if (delta > best_delta) { best_delta = delta; best_k = k; best_q_alt = q_try; } } if (best_k < 0) break; /* no improvement found */ q_shaped[g_off + best_k] = best_q_alt; /* commit the flip */ } } /* Step 3: Final error comparison — only keep shaped if it improves * or is within 5% of baseline (vesica shaping trades element MSE * for better spectral distribution of error) */ float err_base = 0.0f, err_shaped = 0.0f; for (int k = 0; k < 16; k++) { float x = block_x[16*j+k]; float w = (imat_importance) ? imat_importance[blk * QK_K + 16*j + k] : 1.0f; float deq_b = d * (float)q_base[k] - m; float deq_s = d * (float)q_shaped[k] - m; err_base += (x - deq_b) * (x - deq_b) * w; err_shaped += (x - deq_s) * (x - deq_s) * w; } int *q_final = (err_shaped <= err_base * 1.05f) ? q_shaped : q_base; for (int k = 0; k < 16; k++) L[16 * j + k] = (uint8_t)q_final[k]; } for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; l++) { output[blk].qs[j / 4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); } } float berr = gguf_q2_k_block_error(block_x, &output[blk]); if (isnan(berr)) { printf("NaN block error at blk %ld! dm=%f mm=%f\n", (long)blk, dm, mm); for (int j=0; j<16; j++) printf("Ls[%d]=%d Lm[%d]=%d\n", j, Ls_blk[j], j, Lm_blk[j]); exit(1); } total_err += berr; } /* Free thread-local sub-block graphs */ for (int _ti = 0; _ti < _n_omp_threads; _ti++) hpc_destroy(_tl_graphs[_ti]); free(_tl_graphs); free(seeds); free(candidate_errors); free(candidate_d); free(candidate_dmin); free(candidate_Ls); free(candidate_Lm); free(best_candidate); if (out_total_error) *out_total_error = total_err; if (verbose) { float rmse = sqrtf(total_err / (float)n_elements); /* Compute weight σ for fidelity classification */ double w_sum2 = 0.0; for (int64_t i = 0; i < n_elements; i++) w_sum2 += (double)weights[i] * (double)weights[i]; float w_sigma = (float)sqrt(w_sum2 / (double)n_elements); float rmse_over_sigma = (w_sigma > 1e-15f) ? rmse / w_sigma : 0.0f; /* Fidelity classification */ const char *fidelity_class; const char *fidelity_icon; if (rmse <= 1.0e-04f) { fidelity_class = "ULTRA (≤1e-04)"; fidelity_icon = "★★★★"; } else if (rmse <= 3.0e-04f) { fidelity_class = "HIGH (≤3e-04)"; fidelity_icon = "★★★☆"; } else if (rmse <= 1.0e-03f) { fidelity_class = "GOOD (≤1e-03)"; fidelity_icon = "★★☆☆"; } else { fidelity_class = "STANDARD"; fidelity_icon = "★☆☆☆"; } printf("\n ┌──── Shor Measurement Q2_K Report ────────────────────────────────┐\n"); printf(" │ Elements: %-12lld Blocks: %-12lld │\n", (long long)n_elements, (long long)(n_elements / QK_K)); printf(" │ Weight σ: %-12.4e Range: [%.4e, %.4e] │\n", w_sigma, w_sigma * -4.0f, w_sigma * 4.0f); printf(" │ Total MSE: %-12.6f │\n", total_err); printf(" │ RMSE: %-12.4e RMSE/σ: %-8.4f │\n", rmse, rmse_over_sigma); printf(" │ Fidelity: %s %-14s │\n", fidelity_icon, fidelity_class); printf(" │ Engine: Shor Griffiths-Niu (IDFT6 + feed-forward) │\n"); printf(" └─────────────────────────────────────────────────────────────────┘\n"); } } /* ═══════════════════════════════════════════════════════════════════════════ * PROGRESS REPORTING * ═══════════════════════════════════════════════════════════════════════════ */ static void print_progress_bar(int current, int total, const char *label, clock_t start_time) { if (total <= 0) return; float pct = (float)current / (float)total; int bar_width = 40; int filled = (int)(pct * bar_width); double elapsed = (double)(clock() - start_time) / CLOCKS_PER_SEC; double eta = (pct > 0.01f) ? elapsed / pct * (1.0 - pct) : 0.0; printf("\r ["); for (int i = 0; i < bar_width; i++) { if (i < filled) printf("█"); else if (i == filled) printf("▓"); else printf("░"); } printf("] %3d%% (%d/%d) %.0fs ETA:%.0fs %s", (int)(pct * 100), current, total, elapsed, eta, label); fflush(stdout); if (current == total) printf("\n"); } /* ═══════════════════════════════════════════════════════════════════════════ * GGUF FILE WRITER — Assembles the complete output file * ═══════════════════════════════════════════════════════════════════════════ */ static int write_gguf(const char *output_path, const STMultiFile *mf, const ModelArchitecture *arch, const TokenizerData *tokenizer, OptimizerMode opt_mode, const IMatrixData *imatrix, int verbose) { FILE *fp = fopen(output_path, "wb"); if (!fp) { fprintf(stderr, " ERROR: Cannot open '%s' for writing\n", output_path); return -1; } printf("\n ╔════════════════════════════════════════════════════════════════╗\n"); printf(" ║ WRITING GGUF FILE ║\n"); printf(" ╚════════════════════════════════════════════════════════════════╝\n\n"); /* ── Determine which tensors to include ── */ int *include_list = (int *)calloc(mf->n_tensors, sizeof(int)); int n_include = 0; for (int i = 0; i < mf->n_tensors; i++) { if (!should_skip_tensor(mf->tensor_map[i].name)) { include_list[n_include++] = i; } else { if (verbose) printf(" SKIP: %s (not needed in GGUF)\n", mf->tensor_map[i].name); } } /* ── Count metadata KV pairs ── */ int n_kv = 0; n_kv++; /* general.architecture */ n_kv++; /* general.name */ n_kv++; /* general.quantization_version */ n_kv++; /* general.file_type */ n_kv++; /* {arch}.context_length */ n_kv++; /* {arch}.embedding_length */ n_kv++; /* {arch}.block_count */ n_kv++; /* {arch}.feed_forward_length */ n_kv++; /* {arch}.attention.head_count */ n_kv++; /* {arch}.attention.head_count_kv */ n_kv++; /* {arch}.attention.layer_norm_rms_epsilon */ n_kv++; /* {arch}.rope.freq_base */ n_kv++; /* {arch}.vocab_size */ /* Tokenizer metadata KV count */ int has_tokenizer = (tokenizer != NULL && tokenizer->vocab_size > 0); if (has_tokenizer) { n_kv++; /* tokenizer.ggml.model */ n_kv++; /* tokenizer.ggml.tokens */ n_kv++; /* tokenizer.ggml.scores */ n_kv++; /* tokenizer.ggml.token_type */ n_kv++; /* tokenizer.ggml.bos_token_id */ n_kv++; /* tokenizer.ggml.eos_token_id */ n_kv++; /* tokenizer.ggml.unknown_token_id */ if (tokenizer->n_merges > 0) n_kv++; /* tokenizer.ggml.merges */ } /* ── Check for weight tying ── * If tie_word_embeddings is set and there's no separate lm_head, * llama.cpp handles this internally — do NOT duplicate the tensor. * Only add output.weight if the model has a separate lm_head.weight. */ int has_lm_head = (st_multi_find_tensor(mf, "lm_head.weight") >= 0); int total_tensors = n_include; if (arch->tie_word_embeddings && !has_lm_head) { printf(" Weight-tied embeddings detected — llama.cpp handles internally\n\n"); } /* ── Prepare tensor info ── */ char (*gguf_names)[ST_MAX_NAME_LEN] = calloc(total_tensors, ST_MAX_NAME_LEN); GGMLType *tensor_types = calloc(total_tensors, sizeof(GGMLType)); int64_t *tensor_sizes = calloc(total_tensors, sizeof(int64_t)); uint64_t data_offset = 0; uint64_t *tensor_offsets = calloc(total_tensors, sizeof(uint64_t)); int *tensor_src_idx = calloc(total_tensors, sizeof(int)); /* map to unified ST index */ char (*tensor_hf_names)[ST_MAX_NAME_LEN] = calloc(total_tensors, ST_MAX_NAME_LEN); GGMLType quant_type = GGML_TYPE_Q2_K; for (int i = 0; i < n_include; i++) { int src = include_list[i]; const STTensorInfo *ti = st_multi_tensor_info(mf, src); map_tensor_name(mf->tensor_map[src].name, gguf_names[i], ST_MAX_NAME_LEN); strncpy(tensor_hf_names[i], mf->tensor_map[src].name, ST_MAX_NAME_LEN - 1); tensor_src_idx[i] = src; if (should_quantize(ti, gguf_names[i])) { if (is_attention_tensor(gguf_names[i])) { /* Promote attention Q/K/V/O to Q4_0 for higher precision. * Attention scores are most sensitive to quantization noise. */ tensor_types[i] = GGML_TYPE_Q4_0; int64_t n_blocks_q4 = (ti->n_elements + QK4_0 - 1) / QK4_0; tensor_sizes[i] = n_blocks_q4 * sizeof(BlockQ4_0); if (verbose) printf(" [ATTN→Q4_0] %s (%ld elements)\n", gguf_names[i], (long)ti->n_elements); } else { tensor_types[i] = quant_type; tensor_sizes[i] = ggml_type_size(quant_type, ti->n_elements); } } else if (ti->n_dims >= 2) { /* 2D non-quantized tensors (embeddings, output) → F16 */ tensor_types[i] = GGML_TYPE_F16; tensor_sizes[i] = ti->n_elements * sizeof(uint16_t); } else { /* 1D tensors (norms, biases) → F32 */ tensor_types[i] = GGML_TYPE_F32; tensor_sizes[i] = ti->n_elements * sizeof(float); } tensor_offsets[i] = data_offset; /* Align each tensor to 32 bytes */ data_offset += tensor_sizes[i]; data_offset = (data_offset + GGUF_DEFAULT_ALIGNMENT - 1) & ~(uint64_t)(GGUF_DEFAULT_ALIGNMENT - 1); } /* ── Write header ── */ gguf_write_header(fp, total_tensors, n_kv); /* ── Write metadata KV pairs ── */ gguf_write_kv_string(fp, "general.architecture", arch->architecture); gguf_write_kv_string(fp, "general.name", arch->name); gguf_write_kv_uint32(fp, "general.quantization_version", 2); gguf_write_kv_uint32(fp, "general.file_type", 10); /* Q2_K = 10 */ char kbuf[128]; snprintf(kbuf, sizeof(kbuf), "%s.context_length", arch->architecture); gguf_write_kv_uint32(fp, kbuf, arch->context_length); snprintf(kbuf, sizeof(kbuf), "%s.embedding_length", arch->architecture); gguf_write_kv_uint32(fp, kbuf, arch->embedding_length); snprintf(kbuf, sizeof(kbuf), "%s.block_count", arch->architecture); gguf_write_kv_uint32(fp, kbuf, arch->block_count); snprintf(kbuf, sizeof(kbuf), "%s.feed_forward_length", arch->architecture); gguf_write_kv_uint32(fp, kbuf, arch->feed_forward_length); snprintf(kbuf, sizeof(kbuf), "%s.attention.head_count", arch->architecture); gguf_write_kv_uint32(fp, kbuf, arch->head_count); snprintf(kbuf, sizeof(kbuf), "%s.attention.head_count_kv", arch->architecture); gguf_write_kv_uint32(fp, kbuf, arch->head_count_kv); snprintf(kbuf, sizeof(kbuf), "%s.attention.layer_norm_rms_epsilon", arch->architecture); gguf_write_kv_float32(fp, kbuf, arch->rms_norm_eps); snprintf(kbuf, sizeof(kbuf), "%s.rope.freq_base", arch->architecture); gguf_write_kv_float32(fp, kbuf, arch->rope_freq_base); snprintf(kbuf, sizeof(kbuf), "%s.vocab_size", arch->architecture); gguf_write_kv_uint32(fp, kbuf, arch->vocab_size); /* ── Write tokenizer metadata ── */ if (has_tokenizer) { gguf_write_kv_string(fp, "tokenizer.ggml.model", tokenizer->model_type); gguf_write_kv_string_array(fp, "tokenizer.ggml.tokens", (const char **)tokenizer->tokens, (uint64_t)tokenizer->vocab_size); gguf_write_kv_float32_array(fp, "tokenizer.ggml.scores", tokenizer->scores, (uint64_t)tokenizer->vocab_size); gguf_write_kv_int32_array(fp, "tokenizer.ggml.token_type", tokenizer->token_types, (uint64_t)tokenizer->vocab_size); gguf_write_kv_uint32(fp, "tokenizer.ggml.bos_token_id", (uint32_t)tokenizer->bos_id); gguf_write_kv_uint32(fp, "tokenizer.ggml.eos_token_id", (uint32_t)tokenizer->eos_id); gguf_write_kv_uint32(fp, "tokenizer.ggml.unknown_token_id", (uint32_t)tokenizer->unk_id); if (tokenizer->n_merges > 0) { gguf_write_kv_string_array(fp, "tokenizer.ggml.merges", (const char **)tokenizer->merges, (uint64_t)tokenizer->n_merges); } printf(" Tokenizer metadata written (%d tokens, %d merges)\n\n", tokenizer->vocab_size, tokenizer->n_merges); } /* ── Write tensor info descriptors ── */ for (int i = 0; i < total_tensors; i++) { int src = tensor_src_idx[i]; const STTensorInfo *ti = st_multi_tensor_info(mf, src); uint64_t dims[ST_MAX_DIMS]; /* GGUF uses reversed dimension order from SafeTensors/PyTorch */ int nd = ti->n_dims; for (int d = 0; d < nd; d++) { dims[d] = (uint64_t)ti->shape[nd - 1 - d]; } gguf_write_tensor_info(fp, gguf_names[i], ti->n_dims, dims, tensor_types[i], tensor_offsets[i]); } /* ── Alignment padding before data section ── */ gguf_write_padding(fp, GGUF_DEFAULT_ALIGNMENT); /* ── Write tensor data ── */ printf(" Quantizing and writing %d tensors...\n\n", total_tensors); float total_error_sum = 0.0f; int quant_count = 0; int64_t total_elements_quantized = 0; int64_t total_bytes_quantized = 0; int64_t total_bytes_unquantized = 0; clock_t quant_start = clock(); for (int i = 0; i < total_tensors; i++) { int src = tensor_src_idx[i]; const STTensorInfo *ti = st_multi_tensor_info(mf, src); print_progress_bar(i, total_tensors, gguf_names[i], quant_start); if (tensor_types[i] == GGML_TYPE_Q2_K) { /* ── HPC-Optimized Q2_K Quantization ── */ float *f32_data = st_multi_tensor_to_f32(mf, src); if (!f32_data) { fprintf(stderr, "\n ERROR: Failed to convert tensor '%s' to F32\n", ti->name); continue; } int64_t n_elements = ti->n_elements; float tensor_error = 0.0f; /* Pad to QK_K boundary */ int64_t padded = (n_elements + QK_K - 1) / QK_K * QK_K; if (padded > n_elements) { f32_data = realloc(f32_data, padded * sizeof(float)); for (int64_t j = n_elements; j < padded; j++) f32_data[j] = 0.0f; n_elements = padded; } int64_t n_blocks = n_elements / QK_K; BlockQ2K *quant_data = calloc(n_blocks, sizeof(BlockQ2K)); /* Look up imatrix importance for this tensor */ const float *imp = NULL; if (imatrix) { const IMatrixEntry *ime = imatrix_find_any(imatrix, gguf_names[i], tensor_hf_names[i]); if (ime && ime->n_values > 0) { imp = ime->normalized; if (verbose) printf("\n imatrix: using %d importance weights for %s\n", ime->n_values, gguf_names[i]); } } quantize_tensor_q2k_hpc(f32_data, n_elements, quant_data, &tensor_error, opt_mode, imp, verbose); fwrite(quant_data, sizeof(BlockQ2K), n_blocks, fp); float rmse = sqrtf(tensor_error / (float)ti->n_elements); /* Compute weight σ for fidelity gate */ double wss = 0.0; for (int64_t j = 0; j < ti->n_elements; j++) wss += (double)f32_data[j] * (double)f32_data[j]; float w_sig = (float)sqrt(wss / (double)ti->n_elements); /* Fidelity gate: classify RMSE vs 1e-04 target */ const char *fid; if (rmse <= 1.0e-04f) fid = "★★★★ ULTRA"; else if (rmse <= 3.0e-04f) fid = "★★★☆ HIGH"; else if (rmse <= 1.0e-03f) fid = "★★☆☆ GOOD"; else fid = "★☆☆☆ STD"; if (verbose) { printf("\n [Q2_K·Shor] %-47s\n", gguf_names[i]); printf(" %10ld elements → %ld bytes σ=%.2e RMSE=%.4e %s\n", (long)ti->n_elements, (long)(n_blocks * sizeof(BlockQ2K)), w_sig, rmse, fid); } total_error_sum += tensor_error; total_elements_quantized += ti->n_elements; total_bytes_quantized += n_blocks * sizeof(BlockQ2K); quant_count++; free(quant_data); free(f32_data); } else if (tensor_types[i] == GGML_TYPE_Q4_0) { /* ── HPC-Optimized Q4_0 Quantization (attention tensors) ── */ float *f32_data = st_multi_tensor_to_f32(mf, src); if (!f32_data) { fprintf(stderr, "\n ERROR: Failed to convert tensor '%s' to F32\n", ti->name); continue; } int64_t n_elements = ti->n_elements; /* Pad to QK4_0 boundary */ int64_t padded = (n_elements + QK4_0 - 1) / QK4_0 * QK4_0; if (padded > n_elements) { f32_data = realloc(f32_data, padded * sizeof(float)); for (int64_t j = n_elements; j < padded; j++) f32_data[j] = 0.0f; n_elements = padded; } int64_t n_blocks_q4 = n_elements / QK4_0; BlockQ4_0 *q4_data = calloc(n_blocks_q4, sizeof(BlockQ4_0)); float tensor_error = 0.0f; /* Look up imatrix importance for this tensor */ const float *imp = NULL; if (imatrix) { const IMatrixEntry *ime = imatrix_find_any(imatrix, gguf_names[i], tensor_hf_names[i]); if (ime && ime->n_values > 0) { imp = ime->normalized; if (verbose) printf("\n imatrix: using %d importance weights for %s\n", ime->n_values, gguf_names[i]); } } quantize_tensor_q4_0_hpc(f32_data, n_elements, q4_data, &tensor_error, imp, verbose); fwrite(q4_data, sizeof(BlockQ4_0), n_blocks_q4, fp); float rmse = sqrtf(tensor_error / (float)ti->n_elements); /* Compute weight σ for fidelity gate */ double wss4 = 0.0; for (int64_t j = 0; j < ti->n_elements; j++) wss4 += (double)f32_data[j] * (double)f32_data[j]; float w_sig4 = (float)sqrt(wss4 / (double)ti->n_elements); const char *fid4; if (rmse <= 1.0e-04f) fid4 = "★★★★ ULTRA"; else if (rmse <= 3.0e-04f) fid4 = "★★★☆ HIGH"; else if (rmse <= 1.0e-03f) fid4 = "★★☆☆ GOOD"; else fid4 = "★☆☆☆ STD"; if (verbose) { printf("\n [Q4_0·Shor] %-47s\n", gguf_names[i]); printf(" %10ld elements → %ld bytes σ=%.2e RMSE=%.4e %s\n", (long)ti->n_elements, (long)(n_blocks_q4 * sizeof(BlockQ4_0)), w_sig4, rmse, fid4); } total_error_sum += tensor_error; total_elements_quantized += ti->n_elements; total_bytes_quantized += n_blocks_q4 * sizeof(BlockQ4_0); quant_count++; free(q4_data); free(f32_data); } else if (tensor_types[i] == GGML_TYPE_F16) { /* ── Store as F16 (embeddings, output, 2D non-quantized) ── */ float *f32_data = st_multi_tensor_to_f32(mf, src); if (!f32_data) { fprintf(stderr, "\n ERROR: Failed to convert tensor '%s'\n", ti->name); continue; } /* Convert F32 → F16 */ uint16_t *f16_data = (uint16_t *)malloc(ti->n_elements * sizeof(uint16_t)); for (int64_t j = 0; j < ti->n_elements; j++) f16_data[j] = gguf_fp32_to_fp16(f32_data[j]); fwrite(f16_data, sizeof(uint16_t), ti->n_elements, fp); total_bytes_unquantized += ti->n_elements * sizeof(uint16_t); if (verbose) { printf("\n [F16 ] %-50s %10ld elements → %ld bytes\n", gguf_names[i], (long)ti->n_elements, (long)(ti->n_elements * sizeof(uint16_t))); } free(f16_data); free(f32_data); } else { /* ── Keep as F32 (1D: norms, biases) ── */ float *f32_data = st_multi_tensor_to_f32(mf, src); if (!f32_data) { fprintf(stderr, "\n ERROR: Failed to convert tensor '%s'\n", ti->name); continue; } fwrite(f32_data, sizeof(float), ti->n_elements, fp); total_bytes_unquantized += ti->n_elements * sizeof(float); if (verbose) { printf("\n [F32 ] %-50s %10ld elements → %ld bytes\n", gguf_names[i], (long)ti->n_elements, (long)(ti->n_elements * sizeof(float))); } free(f32_data); } /* Pad to alignment */ gguf_write_padding(fp, GGUF_DEFAULT_ALIGNMENT); } print_progress_bar(total_tensors, total_tensors, "done", quant_start); long final_size = ftell(fp); fclose(fp); /* ── Final summary with Shor fidelity metrics ── */ /* Compute original model size (all as F32) */ int64_t original_f32_size = 0; for (int i = 0; i < total_tensors; i++) { const STTensorInfo *ti = st_multi_tensor_info(mf, tensor_src_idx[i]); original_f32_size += ti->n_elements * sizeof(float); } float compression_ratio = (original_f32_size > 0) ? (float)original_f32_size / (float)final_size : 0.0f; float effective_bpw = (total_elements_quantized > 0) ? 8.0f * (float)total_bytes_quantized / (float)total_elements_quantized : 0.0f; float total_rmse = (total_elements_quantized > 0) ? sqrtf(total_error_sum / (float)total_elements_quantized) : 0.0f; float mean_mse_per_tensor = (quant_count > 0) ? total_error_sum / (float)quant_count : 0.0f; /* Fidelity classification */ const char *overall_fid, *overall_icon; if (total_rmse <= 1.0e-04f) { overall_fid = "ULTRA (≤1e-04)"; overall_icon = "★★★★"; } else if (total_rmse <= 3.0e-04f) { overall_fid = "HIGH (≤3e-04)"; overall_icon = "★★★☆"; } else if (total_rmse <= 1.0e-03f) { overall_fid = "GOOD (≤1e-03)"; overall_icon = "★★☆☆"; } else { overall_fid = "STANDARD"; overall_icon = "★☆☆☆"; } printf("\n ╔════════════════════════════════════════════════════════════════╗\n"); printf(" ║ SHOR-OPTIMIZED QUANTIZATION SUMMARY ║\n"); printf(" ╠════════════════════════════════════════════════════════════════╣\n"); printf(" ║ ║\n"); printf(" ║ Engine: Griffiths-Niu Sequential Measurement ║\n"); printf(" ║ Protocol: IDFT6 → feed-forward → Born → collapse ║\n"); printf(" ║ Origin: tesseract_factor.c (Shor's algorithm) ║\n"); printf(" ║ ║\n"); printf(" ╠════════════════════════════════════════════════════════════════╣\n"); printf(" ║ Tensors quantized: %-33d ║\n", quant_count); printf(" ║ Elements quantized: %15ld ║\n", (long)total_elements_quantized); printf(" ║ Quantized data: %12ld bytes (%6.1f MB) ║\n", (long)total_bytes_quantized, (double)total_bytes_quantized / (1024.0 * 1024.0)); printf(" ║ Unquantized data: %12ld bytes (%6.1f MB) ║\n", (long)total_bytes_unquantized, (double)total_bytes_unquantized / (1024.0 * 1024.0)); printf(" ║ Effective bits/weight: %15.2f ║\n", effective_bpw); printf(" ║ Compression ratio: %15.1fx ║\n", compression_ratio); printf(" ║ ║\n"); printf(" ╠════════════════════════════════════════════════════════════════╣\n"); printf(" ║ FIDELITY METRICS (target: 1e-04) ║\n"); printf(" ╠════════════════════════════════════════════════════════════════╣\n"); printf(" ║ ║\n"); printf(" ║ Total MSE: %15.6e ║\n", total_error_sum); printf(" ║ Per-element RMSE: %15.4e ║\n", total_rmse); printf(" ║ Mean MSE/tensor: %15.6e ║\n", mean_mse_per_tensor); printf(" ║ ║\n"); printf(" ║ Fidelity class: %s %-14s ║\n", overall_icon, overall_fid); if (total_rmse <= 1.0e-04f) printf(" ║ ✓ RMSE ≤ 1e-04: TARGET MET — maximum fidelity achieved ║\n"); else if (total_rmse <= 3.0e-04f) printf(" ║ ◐ RMSE ≤ 3e-04: near target — high fidelity achieved ║\n"); else printf(" ║ ○ RMSE > 3e-04: below target — weight σ may be large ║\n"); printf(" ║ ║\n"); printf(" ╠════════════════════════════════════════════════════════════════╣\n"); printf(" ║ Output file: %ld bytes (%.1f MB)%*s║\n", final_size, (double)final_size / (1024.0 * 1024.0), (int)(27 - snprintf(NULL, 0, "%ld bytes (%.1f MB)", final_size, (double)final_size / (1024.0 * 1024.0))), ""); printf(" ╚════════════════════════════════════════════════════════════════╝\n\n"); free(include_list); free(gguf_names); free(tensor_types); free(tensor_sizes); free(tensor_offsets); free(tensor_src_idx); free(tensor_hf_names); return 0; } /* ═══════════════════════════════════════════════════════════════════════════ * LIBRARY API — Exported functions for Python ctypes integration * * When built with -DHEXSTATE_LIBRARY, these are the only public symbols. * The Python GGUF pipeline handles metadata/IO; C handles HPC quantization. * ═══════════════════════════════════════════════════════════════════════════ */ /* Initialize HExState subsystems (must be called once before quantization) */ void hexstate_init(void) { static int initialized = 0; if (!initialized) { srand(42); /* Deterministic for reproducibility */ triality_exotic_init(); s6_exotic_init(); triality_stats_reset(); initialized = 1; } } /* Quantize a single tensor's F32 data to Q2_K using HPC optimization. * * Parameters: * weights: input F32 data (must be padded to multiple of 256) * n_elements: number of elements (must be multiple of 256) * output: output buffer (must be n_elements/256 * 84 bytes) * out_error: pointer to receive total MSE (can be NULL) * opt_mode: 0=HPC, 1=MSE, 2=Hybrid (recommended) * verbose: 1 for per-block diagnostics */ void hexstate_quantize_tensor_q2k(const float *weights, int64_t n_elements, void *output, float *out_error, int opt_mode, int verbose) { hexstate_init(); quantize_tensor_q2k_hpc(weights, n_elements, (BlockQ2K *)output, out_error, (OptimizerMode)opt_mode, NULL, verbose); } /* Same as above but with importance matrix weights */ void hexstate_quantize_tensor_q2k_imat(const float *weights, int64_t n_elements, void *output, float *out_error, int opt_mode, const float *imat_importance, int verbose) { hexstate_init(); quantize_tensor_q2k_hpc(weights, n_elements, (BlockQ2K *)output, out_error, (OptimizerMode)opt_mode, imat_importance, verbose); } /* Get the block size for Q2_K (84 bytes per 256 elements) */ int hexstate_q2k_block_bytes(void) { return sizeof(BlockQ2K); } int hexstate_q2k_block_elements(void) { return QK_K; } /* HPC-optimized Q4_0 quantization for attention tensors. * Called from Python requantizer via ctypes. * weights: input F32 weights * n_elements: number of elements (must be multiple of 32) * output: output buffer (must be n_elements/32 * 18 bytes) * out_error: pointer to receive total MSE (can be NULL) * imat_importance: optional per-element importance weights * verbose: 1 for per-block diagnostics */ void hexstate_quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, void *output, float *out_error, const float *imat_importance, int verbose) { hexstate_init(); float err = 0.0f; quantize_tensor_q4_0_hpc(weights, n_elements, (BlockQ4_0 *)output, &err, imat_importance, verbose); if (out_error) *out_error = err; } #ifndef HEXSTATE_LIBRARY /* ═══════════════════════════════════════════════════════════════════════════ * MAIN * ═══════════════════════════════════════════════════════════════════════════ */ int main(int argc, char **argv) { srand(time(NULL)); /* Initialize HExState subsystems */ triality_exotic_init(); s6_exotic_init(); triality_stats_reset(); printf("\n"); printf(" ╔════════════════════════════════════════════════════════════════╗\n"); printf(" ║ ║\n"); printf(" ║ HExState GGUF QUANTIZER v3.0 — Shor-Optimized ║\n"); printf(" ║ ║\n"); printf(" ║ Architecture: HPCGraph Sensitivity Propagation ║\n"); printf(" ║ Optimization: Shor's Griffiths-Niu Measurement + iMatrix ║\n"); printf(" ║ Output: GGUF v3 (Q2_K, 2.625 bpw) ║\n"); printf(" ║ ║\n"); printf(" ║ \"The weight and the quantized are opposite faces.\" ║\n"); printf(" ║ ║\n"); printf(" ╚════════════════════════════════════════════════════════════════╝\n\n"); if (argc < 3) { printf(" Usage: %s [options]\n\n", argv[0]); printf(" Input:\n"); printf(" Single .safetensors file, or\n"); printf(" Model directory with sharded .safetensors files\n\n"); printf(" Options:\n"); printf(" --optimizer hpc|mse|hybrid Scale optimization (default: hybrid)\n"); printf(" --imatrix Importance matrix for Q2_K quality\n"); printf(" --config Explicit config.json for arch detection\n"); printf(" --qwen Force Qwen 3.5/3.6 architecture\n"); printf(" --verbose Per-block diagnostics\n\n"); return 1; } const char *input_path = argv[1]; const char *output_path = argv[2]; OptimizerMode opt_mode = OPT_HYBRID; const char *imatrix_path = NULL; const char *config_override = NULL; int verbose = 0; int force_qwen = 0; /* Parse options */ for (int i = 3; i < argc; i++) { if (strcmp(argv[i], "--optimizer") == 0 && i + 1 < argc) { i++; if (strcmp(argv[i], "hpc") == 0) opt_mode = OPT_HPC; else if (strcmp(argv[i], "mse") == 0) opt_mode = OPT_MSE; else if (strcmp(argv[i], "hybrid") == 0) opt_mode = OPT_HYBRID; else { fprintf(stderr, " ERROR: Unknown optimizer '%s'. Use hpc, mse, or hybrid.\n", argv[i]); return 1; } } else if (strcmp(argv[i], "--imatrix") == 0 && i + 1 < argc) { imatrix_path = argv[++i]; } else if (strcmp(argv[i], "--config") == 0 && i + 1 < argc) { config_override = argv[++i]; } else if (strcmp(argv[i], "--qwen") == 0) { force_qwen = 1; } else if (strcmp(argv[i], "--verbose") == 0) { verbose = 1; } else { fprintf(stderr, " ERROR: Unknown option '%s'\n", argv[i]); return 1; } } const char *opt_names[] = {"HPC (BP only)", "MSE (grid search)", "Hybrid (HPC+MSE)"}; printf(" Input: %s\n", input_path); printf(" Output: %s\n", output_path); printf(" Quant type: Q2_K (2.625 bpw)\n"); printf(" Optimizer: %s\n", opt_names[opt_mode]); if (imatrix_path) printf(" iMatrix: %s\n", imatrix_path); if (config_override) printf(" Config: %s\n", config_override); if (force_qwen) printf(" Model: Qwen 3.5/3.6 (forced via --qwen)\n"); printf("\n"); /* ── Phase 1: Load model ── */ printf(" Phase 1: Loading model...\n"); clock_t t_start = clock(); /* Determine if input is a file or directory */ struct stat st; if (stat(input_path, &st) != 0) { fprintf(stderr, " ERROR: Cannot access '%s'\n", input_path); return 1; } STMultiFile *mf = NULL; char input_dir[512] = ""; if (S_ISDIR(st.st_mode)) { /* Input is a directory — open all shards */ mf = st_open_dir(input_path); strncpy(input_dir, input_path, sizeof(input_dir) - 2); int dlen = strlen(input_dir); if (dlen > 0 && input_dir[dlen - 1] != '/') { input_dir[dlen] = '/'; input_dir[dlen + 1] = '\0'; } } else { /* Input is a single file — wrap in STMultiFile */ STFile *sf = st_open(input_path); if (!sf) { fprintf(stderr, " ERROR: Failed to open '%s'\n", input_path); return 1; } mf = (STMultiFile *)calloc(1, sizeof(STMultiFile)); mf->shards[0] = sf; mf->n_shards = 1; for (int i = 0; i < sf->n_tensors && mf->n_tensors < ST_MAX_TENSORS; i++) { strncpy(mf->tensor_map[mf->n_tensors].name, sf->tensors[i].name, ST_MAX_NAME_LEN - 1); mf->tensor_map[mf->n_tensors].shard_idx = 0; mf->tensor_map[mf->n_tensors].tensor_idx = i; mf->n_tensors++; } /* Extract directory from file path */ strncpy(input_dir, input_path, sizeof(input_dir) - 1); char *last_slash = strrchr(input_dir, '/'); if (last_slash) { *(last_slash + 1) = '\0'; } else { strcpy(input_dir, "./"); } } if (!mf) { fprintf(stderr, " ERROR: Failed to load model from '%s'\n", input_path); return 1; } st_multi_print_summary(mf); clock_t t_load = clock(); printf(" Loaded in %.3f seconds\n\n", (double)(t_load - t_start) / CLOCKS_PER_SEC); /* ── Phase 2: Detect architecture ── */ printf(" Phase 2: Detecting model architecture...\n"); /* Try to read config.json from model directory */ char config_path[1024]; snprintf(config_path, sizeof(config_path), "%sconfig.json", input_dir); const char *config_ptr = NULL; { FILE *check = fopen(config_path, "rb"); if (check) { fclose(check); config_ptr = config_path; printf(" Found config.json: %s\n", config_path); } } ModelArchitecture arch; detect_architecture(mf, &arch, config_ptr); /* --qwen override: force Qwen 3.5/3.6 architecture parameters */ if (force_qwen) { strcpy(arch.architecture, "qwen2"); strcpy(arch.name, "Qwen3.6-HExState-Q2K"); printf(" [--qwen] Forcing qwen2-compatible architecture\n"); } printf(" ╔═══════════════════════════════════════════════════════════════╗\n"); printf(" ║ Model Architecture ║\n"); printf(" ╠═══════════════════════════════════════════════════════════════╣\n"); printf(" ║ Architecture: %-40s ║\n", arch.architecture); printf(" ║ Layers: %-40u ║\n", arch.block_count); printf(" ║ Hidden size: %-40u ║\n", arch.embedding_length); printf(" ║ Attention heads: %-40u ║\n", arch.head_count); printf(" ║ KV heads: %-40u ║\n", arch.head_count_kv); printf(" ║ Vocab size: %-40u ║\n", arch.vocab_size); printf(" ║ FFN size: %-40u ║\n", arch.feed_forward_length); printf(" ║ Context length: %-40u ║\n", arch.context_length); printf(" ║ Has bias: %-40s ║\n", arch.has_bias ? "yes" : "no"); printf(" ║ Tied embeddings: %-40s ║\n", arch.tie_word_embeddings ? "yes" : "no"); printf(" ╚═══════════════════════════════════════════════════════════════╝\n\n"); /* ── Phase 2b: Load tokenizer ── */ printf(" Phase 2b: Loading tokenizer...\n"); TokenizerData *tokenizer = NULL; { char tok_json[512], tok_config[512]; snprintf(tok_json, sizeof(tok_json), "%stokenizer.json", input_dir); snprintf(tok_config, sizeof(tok_config), "%stokenizer_config.json", input_dir); tokenizer = tok_load(tok_json, tok_config); if (tokenizer) { tok_print_summary(tokenizer); } else { printf(" No tokenizer found in '%s'\n", input_dir); printf(" (Output GGUF will lack tokenizer data — not inference-ready)\n\n"); } } /* ── Phase 2c: Load importance matrix (optional) ── */ IMatrixData *imatrix = NULL; if (imatrix_path) { printf(" Phase 2c: Loading importance matrix...\n"); imatrix = imatrix_load(imatrix_path); if (imatrix) { imatrix_print_summary(imatrix); } else { printf(" WARNING: Failed to load imatrix from '%s'\n", imatrix_path); printf(" Proceeding without importance weighting.\n\n"); } } /* ── Phase 3-5: Quantize and write GGUF ── */ printf(" Phase 3: HPC-Optimized Q2_K Quantization + GGUF Output...\n"); clock_t t_quant_start = clock(); int result = write_gguf(output_path, mf, &arch, tokenizer, opt_mode, imatrix, verbose); clock_t t_end = clock(); printf(" Total time: %.3f seconds\n\n", (double)(t_end - t_start) / CLOCKS_PER_SEC); if (imatrix) imatrix_free(imatrix); if (tokenizer) tok_free(tokenizer); st_multi_close(mf); return result; } #endif /* HEXSTATE_LIBRARY */