| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #include <stdio.h> |
| #ifdef _OPENMP |
| #include <omp.h> |
| #endif |
| #include <stdlib.h> |
| #include <string.h> |
| #include <math.h> |
| #include <time.h> |
| #include <sys/stat.h> |
| #include <mpfr.h> |
|
|
| |
| #include "quhit_triality.h" |
| #include "hpc_graph.h" |
| #include "hpc_mobius.h" |
| #include "s6_exotic.h" |
|
|
| |
| #include "gguf_format.h" |
| #include "safetensors_reader.h" |
| #include "tokenizer_reader.h" |
| #include "imatrix_reader.h" |
|
|
/* Local dimension constant.
 * NOTE(review): the per-site amplitude arrays handled in this file are
 * hard-coded to length 6; presumably D names that same size -- confirm. */
#define D 6
|
|
| |
| |
| |
|
|
/* Optimizer selection for the quantizer.
 * NOTE(review): the enumerator names suggest graph-based (HPC), MSE grid
 * search, and a combination of both; the dispatch code is outside this
 * chunk -- confirm there. Numeric values are the implicit 0, 1, 2. */
typedef enum {
    OPT_HPC,
    OPT_MSE,
    OPT_HYBRID
} OptimizerMode;
|
|
| |
| |
| |
| |
| |
| |
|
|
/* Model hyper-parameters resolved by detect_architecture(), either from
 * config.json or inferred from tensor names and shapes. */
typedef struct {
    char     architecture[64];    /* architecture tag, e.g. "llama", "qwen2", "phi3" */
    char     name[256];           /* model display name */
    uint32_t block_count;         /* number of transformer layers */
    uint32_t embedding_length;    /* hidden size */
    uint32_t head_count;          /* attention heads */
    uint32_t head_count_kv;       /* key/value heads (defaults to head_count) */
    uint32_t vocab_size;          /* vocabulary rows */
    uint32_t context_length;      /* maximum position embeddings */
    float    rope_freq_base;      /* RoPE theta */
    uint32_t feed_forward_length; /* MLP intermediate size */
    float    rms_norm_eps;        /* RMSNorm epsilon */
    int      has_bias;            /* non-zero when layer 0 q_proj has a bias tensor */
    int      tie_word_embeddings; /* copied from config.json when available */
} ModelArchitecture;
|
|
| |
| static int count_tensors_with_prefix(const STMultiFile *mf, const char *prefix) |
| { |
| int count = 0; |
| int prefix_len = strlen(prefix); |
| for (int i = 0; i < mf->n_tensors; i++) { |
| if (strncmp(mf->tensor_map[i].name, prefix, prefix_len) == 0) |
| count++; |
| } |
| return count; |
| } |
|
|
| |
| static int find_max_layer_index(const STMultiFile *mf, const char *layer_prefix) |
| { |
| int max_idx = -1; |
| int prefix_len = strlen(layer_prefix); |
| for (int i = 0; i < mf->n_tensors; i++) { |
| if (strncmp(mf->tensor_map[i].name, layer_prefix, prefix_len) == 0) { |
| int idx = atoi(mf->tensor_map[i].name + prefix_len); |
| if (idx > max_idx) max_idx = idx; |
| } |
| } |
| return max_idx; |
| } |
|
|
| |
|
|
/* Subset of a Hugging Face style config.json extracted by parse_config_json().
 * Each member mirrors the JSON key of the same name; numeric members stay 0
 * when the key is absent. */
typedef struct {
    int      valid;                   /* 1 once the file was read and scanned */
    uint32_t hidden_size;
    uint32_t intermediate_size;
    uint32_t num_attention_heads;
    uint32_t num_key_value_heads;
    uint32_t num_hidden_layers;
    uint32_t vocab_size;
    uint32_t max_position_embeddings;
    float    rope_theta;
    float    rms_norm_eps;
    char     model_type[64];          /* empty string when not present */
    int      tie_word_embeddings;     /* 1 when the JSON value starts with "true" */
} ConfigJson;
|
|
| static ConfigJson parse_config_json(const char *path) |
| { |
| ConfigJson cfg; |
| memset(&cfg, 0, sizeof(cfg)); |
|
|
| FILE *f = fopen(path, "rb"); |
| if (!f) return cfg; |
|
|
| fseek(f, 0, SEEK_END); |
| long size = ftell(f); |
| fseek(f, 0, SEEK_SET); |
|
|
| char *json = (char *)malloc(size + 1); |
| if (!json) { fclose(f); return cfg; } |
| fread(json, 1, size, f); |
| json[size] = '\0'; |
| fclose(f); |
|
|
| cfg.valid = 1; |
|
|
| |
| const char *p; |
|
|
| p = tok_find_key(json, "hidden_size"); |
| if (p) cfg.hidden_size = (uint32_t)strtol(p, NULL, 10); |
|
|
| p = tok_find_key(json, "intermediate_size"); |
| if (p) cfg.intermediate_size = (uint32_t)strtol(p, NULL, 10); |
|
|
| p = tok_find_key(json, "num_attention_heads"); |
| if (p) cfg.num_attention_heads = (uint32_t)strtol(p, NULL, 10); |
|
|
| p = tok_find_key(json, "num_key_value_heads"); |
| if (p) cfg.num_key_value_heads = (uint32_t)strtol(p, NULL, 10); |
|
|
| p = tok_find_key(json, "num_hidden_layers"); |
| if (p) cfg.num_hidden_layers = (uint32_t)strtol(p, NULL, 10); |
|
|
| p = tok_find_key(json, "vocab_size"); |
| if (p) cfg.vocab_size = (uint32_t)strtol(p, NULL, 10); |
|
|
| p = tok_find_key(json, "max_position_embeddings"); |
| if (p) cfg.max_position_embeddings = (uint32_t)strtol(p, NULL, 10); |
|
|
| p = tok_find_key(json, "rope_theta"); |
| if (p) cfg.rope_theta = (float)strtod(p, NULL); |
|
|
| p = tok_find_key(json, "rms_norm_eps"); |
| if (p) cfg.rms_norm_eps = (float)strtod(p, NULL); |
|
|
| p = tok_find_key(json, "model_type"); |
| if (p && *p == '"') { |
| char buf[64]; |
| tok_extract_string(p, buf, sizeof(buf)); |
| strncpy(cfg.model_type, buf, sizeof(cfg.model_type) - 1); |
| } |
|
|
| p = tok_find_key(json, "tie_word_embeddings"); |
| if (p) cfg.tie_word_embeddings = (strncmp(p, "true", 4) == 0); |
|
|
| |
| if (cfg.hidden_size == 0) { |
| const char *tc = strstr(json, "\"text_config\""); |
| if (tc) { |
| const char *tc_brace = strchr(tc, '{'); |
| if (tc_brace) { |
| p = tok_find_key(tc_brace, "hidden_size"); |
| if (p) cfg.hidden_size = (uint32_t)strtol(p, NULL, 10); |
| p = tok_find_key(tc_brace, "intermediate_size"); |
| if (p) cfg.intermediate_size = (uint32_t)strtol(p, NULL, 10); |
| p = tok_find_key(tc_brace, "num_attention_heads"); |
| if (p) cfg.num_attention_heads = (uint32_t)strtol(p, NULL, 10); |
| p = tok_find_key(tc_brace, "num_key_value_heads"); |
| if (p) cfg.num_key_value_heads = (uint32_t)strtol(p, NULL, 10); |
| p = tok_find_key(tc_brace, "num_hidden_layers"); |
| if (p) cfg.num_hidden_layers = (uint32_t)strtol(p, NULL, 10); |
| p = tok_find_key(tc_brace, "vocab_size"); |
| if (p) cfg.vocab_size = (uint32_t)strtol(p, NULL, 10); |
| p = tok_find_key(tc_brace, "max_position_embeddings"); |
| if (p) cfg.max_position_embeddings = (uint32_t)strtol(p, NULL, 10); |
| p = tok_find_key(tc_brace, "rms_norm_eps"); |
| if (p) cfg.rms_norm_eps = (float)strtod(p, NULL); |
| p = tok_find_key(tc_brace, "model_type"); |
| if (p && *p == '"') { |
| char buf2[64]; |
| tok_extract_string(p, buf2, sizeof(buf2)); |
| strncpy(cfg.model_type, buf2, sizeof(cfg.model_type) - 1); |
| } |
| p = tok_find_key(tc_brace, "tie_word_embeddings"); |
| if (p) cfg.tie_word_embeddings = (strncmp(p, "true", 4) == 0); |
| |
| const char *rp = strstr(tc_brace, "\"rope_parameters\""); |
| if (rp) { |
| p = tok_find_key(rp, "rope_theta"); |
| if (p) cfg.rope_theta = (float)strtod(p, NULL); |
| } |
| } |
| } |
| } |
|
|
| free(json); |
| return cfg; |
| } |
|
|
| static void detect_architecture(const STMultiFile *mf, ModelArchitecture *arch, |
| const char *config_json_path) |
| { |
| memset(arch, 0, sizeof(*arch)); |
|
|
| |
| strcpy(arch->architecture, "llama"); |
| strcpy(arch->name, "HExState-quantized"); |
| arch->context_length = 4096; |
| arch->rope_freq_base = 10000.0f; |
| arch->rms_norm_eps = 1e-5f; |
|
|
| |
| ConfigJson cfg = {0}; |
| if (config_json_path) { |
| cfg = parse_config_json(config_json_path); |
| } |
|
|
| if (cfg.valid) { |
| |
| if (strcmp(cfg.model_type, "llama") == 0 || |
| strcmp(cfg.model_type, "mistral") == 0) { |
| strcpy(arch->architecture, "llama"); |
| } else if (strcmp(cfg.model_type, "qwen2") == 0) { |
| strcpy(arch->architecture, "qwen2"); |
| } else if (strcmp(cfg.model_type, "qwen2_moe") == 0) { |
| strcpy(arch->architecture, "qwen2moe"); |
| } else if (strcmp(cfg.model_type, "qwen3_5") == 0 || |
| strcmp(cfg.model_type, "qwen3_5_text") == 0 || |
| strcmp(cfg.model_type, "qwen3_5_moe") == 0) { |
| strcpy(arch->architecture, "qwen2"); |
| } else if (strcmp(cfg.model_type, "phi3") == 0 || |
| strcmp(cfg.model_type, "phi") == 0) { |
| strcpy(arch->architecture, "phi3"); |
| } else if (strcmp(cfg.model_type, "gemma") == 0 || |
| strcmp(cfg.model_type, "gemma2") == 0) { |
| strcpy(arch->architecture, "gemma"); |
| } else if (strcmp(cfg.model_type, "deepseek_v2") == 0) { |
| strcpy(arch->architecture, "llama"); |
| } else if (strcmp(cfg.model_type, "gpt_neox") == 0) { |
| strcpy(arch->architecture, "gpt_neox"); |
| } else if (strcmp(cfg.model_type, "falcon") == 0) { |
| strcpy(arch->architecture, "falcon"); |
| } else if (cfg.model_type[0]) { |
| |
| strcpy(arch->architecture, "llama"); |
| } |
|
|
| if (cfg.hidden_size) arch->embedding_length = cfg.hidden_size; |
| if (cfg.intermediate_size) arch->feed_forward_length = cfg.intermediate_size; |
| if (cfg.num_attention_heads) arch->head_count = cfg.num_attention_heads; |
| if (cfg.num_key_value_heads) arch->head_count_kv = cfg.num_key_value_heads; |
| if (cfg.num_hidden_layers) arch->block_count = cfg.num_hidden_layers; |
| if (cfg.vocab_size) arch->vocab_size = cfg.vocab_size; |
| if (cfg.max_position_embeddings) arch->context_length = cfg.max_position_embeddings; |
| if (cfg.rope_theta > 0) arch->rope_freq_base = cfg.rope_theta; |
| if (cfg.rms_norm_eps > 0) arch->rms_norm_eps = cfg.rms_norm_eps; |
| arch->tie_word_embeddings = cfg.tie_word_embeddings; |
|
|
| printf(" Architecture determined from config.json: %s\n", cfg.model_type); |
| } |
|
|
| |
| int has_model_layers = count_tensors_with_prefix(mf, "model.layers."); |
| int has_gpt_neox = count_tensors_with_prefix(mf, "gpt_neox."); |
| int has_transformer = count_tensors_with_prefix(mf, "transformer."); |
|
|
| |
| int has_qkv_proj = count_tensors_with_prefix(mf, "model.layers.0.self_attn.qkv_proj"); |
| int has_kv_a_proj = count_tensors_with_prefix(mf, "model.layers.0.self_attn.kv_a_proj_with_mqa"); |
| int has_final_norm = (st_multi_find_tensor(mf, "model.final_norm.weight") >= 0); |
|
|
| if (has_qkv_proj > 0 && !cfg.valid) { |
| strcpy(arch->architecture, "phi3"); |
| } else if (has_kv_a_proj > 0 && !cfg.valid) { |
| strcpy(arch->architecture, "llama"); |
| } else if (has_final_norm && !cfg.valid) { |
| strcpy(arch->architecture, "gemma"); |
| } |
|
|
| if (has_model_layers > 0 && arch->block_count == 0) { |
| arch->block_count = find_max_layer_index(mf, "model.layers.") + 1; |
| } |
|
|
| |
| if (arch->embedding_length == 0 || arch->head_count == 0) { |
| int qproj_idx = st_multi_find_tensor(mf, "model.layers.0.self_attn.q_proj.weight"); |
| int kproj_idx = st_multi_find_tensor(mf, "model.layers.0.self_attn.k_proj.weight"); |
|
|
| if (qproj_idx >= 0) { |
| const STTensorInfo *ti = st_multi_tensor_info(mf, qproj_idx); |
| int64_t q_out = ti->shape[0]; |
| int64_t hidden = ti->shape[1]; |
| if (arch->embedding_length == 0) arch->embedding_length = hidden; |
|
|
| |
| int head_dim = 128; |
| if (q_out % 128 == 0) head_dim = 128; |
| else if (q_out % 96 == 0) head_dim = 96; |
| else if (q_out % 64 == 0) head_dim = 64; |
|
|
| if (arch->head_count == 0) arch->head_count = q_out / head_dim; |
|
|
| if (kproj_idx >= 0 && arch->head_count_kv == 0) { |
| const STTensorInfo *kt = st_multi_tensor_info(mf, kproj_idx); |
| arch->head_count_kv = kt->shape[0] / head_dim; |
| } |
| } |
| } |
|
|
| if (arch->vocab_size == 0) { |
| int embed_idx = st_multi_find_tensor(mf, "model.embed_tokens.weight"); |
| if (embed_idx >= 0) { |
| const STTensorInfo *ti = st_multi_tensor_info(mf, embed_idx); |
| arch->vocab_size = ti->shape[0]; |
| } |
| } |
|
|
| if (arch->feed_forward_length == 0) { |
| int gate_idx = st_multi_find_tensor(mf, "model.layers.0.mlp.gate_proj.weight"); |
| if (gate_idx >= 0) { |
| const STTensorInfo *ti = st_multi_tensor_info(mf, gate_idx); |
| arch->feed_forward_length = ti->shape[0]; |
| } else { |
| int up_idx = st_multi_find_tensor(mf, "model.layers.0.mlp.up_proj.weight"); |
| if (up_idx >= 0) { |
| const STTensorInfo *ti = st_multi_tensor_info(mf, up_idx); |
| arch->feed_forward_length = ti->shape[0]; |
| } |
| } |
| } |
|
|
| |
| arch->has_bias = (st_multi_find_tensor(mf, "model.layers.0.self_attn.q_proj.bias") >= 0); |
|
|
| if (has_gpt_neox > 0 && arch->block_count == 0) { |
| strcpy(arch->architecture, "gpt_neox"); |
| arch->block_count = find_max_layer_index(mf, "gpt_neox.layers.") + 1; |
| } |
| if (has_transformer > 0 && arch->block_count == 0) { |
| strcpy(arch->architecture, "falcon"); |
| arch->block_count = find_max_layer_index(mf, "transformer.h.") + 1; |
| } |
|
|
| |
| if (arch->head_count == 0) arch->head_count = 32; |
| if (arch->head_count_kv == 0) arch->head_count_kv = arch->head_count; |
| if (arch->embedding_length == 0) arch->embedding_length = 4096; |
| if (arch->vocab_size == 0) arch->vocab_size = 32000; |
| if (arch->feed_forward_length == 0) |
| arch->feed_forward_length = (arch->embedding_length * 8) / 3; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
/* Returns 1 when the tensor named `hf_name` must not be exported, else 0.
 *
 * Skipped: rotary-embedding caches (matched anywhere in the name), tensors
 * under a "visual." / "model.visual." prefix, and names containing
 * "model.language_model.mtp_". */
static int should_skip_tensor(const char *hf_name)
{
    static const char *const skip_substrings[] = {
        "rotary_emb.inv_freq",
        "rotary_emb.cos_cached",
        "rotary_emb.sin_cached",
        "model.language_model.mtp_",
    };
    static const char *const skip_prefixes[] = {
        "model.visual.",
        "visual.",
    };

    for (size_t k = 0; k < sizeof(skip_substrings) / sizeof(skip_substrings[0]); k++) {
        if (strstr(hf_name, skip_substrings[k]))
            return 1;
    }
    for (size_t k = 0; k < sizeof(skip_prefixes) / sizeof(skip_prefixes[0]); k++) {
        if (strncmp(hf_name, skip_prefixes[k], strlen(skip_prefixes[k])) == 0)
            return 1;
    }
    return 0;
}
|
|
/* Translates a Hugging Face tensor name into its GGUF name.
 *
 * hf_name   : source tensor name (NUL-terminated).
 * gguf_name : output buffer, always NUL-terminated on return when buflen > 0.
 * buflen    : size of gguf_name in bytes; nothing is written when <= 0.
 *
 * Rules, in order:
 *   1. exact match against the global (non-layer) table;
 *   2. "model.layers.N.<rest>" or "model.language_model.layers.N.<rest>":
 *        a. <rest> in the per-layer table   -> "blk.N.<mapped>"
 *        b. "mlp.experts.E.<proj>" in table -> "blk.N.<mapped>.E"
 *        c. otherwise                       -> "blk.N.<rest>"
 *   3. anything else is copied through unchanged (truncated to fit).
 *
 * Prefix lengths are taken from the prefix strings themselves; the previous
 * hand-counted 27 for "model.language_model.layers." (28 characters) left a
 * leading '.' in front of the layer number, so those tensors never matched.
 */
static void map_tensor_name(const char *hf_name, char *gguf_name, int buflen)
{
    typedef struct { const char *from; const char *to; } NameMap;

    static const NameMap global_maps[] = {
        {"model.embed_tokens.weight",                "token_embd.weight"},
        {"model.language_model.embed_tokens.weight", "token_embd.weight"},
        {"model.norm.weight",                        "output_norm.weight"},
        {"model.language_model.norm.weight",         "output_norm.weight"},
        {"model.final_norm.weight",                  "output_norm.weight"},
        {"lm_head.weight",                           "output.weight"},
        {"model.embed_tokens.bias",                  "token_embd.bias"},
        {"model.norm.bias",                          "output_norm.bias"},
        {NULL, NULL}
    };

    static const NameMap layer_maps[] = {
        /* separate attention projections */
        {"self_attn.q_proj.weight",             "attn_q.weight"},
        {"self_attn.k_proj.weight",             "attn_k.weight"},
        {"self_attn.v_proj.weight",             "attn_v.weight"},
        {"self_attn.o_proj.weight",             "attn_output.weight"},
        {"self_attn.q_proj.bias",               "attn_q.bias"},
        {"self_attn.k_proj.bias",               "attn_k.bias"},
        {"self_attn.v_proj.bias",               "attn_v.bias"},
        {"self_attn.o_proj.bias",               "attn_output.bias"},
        /* fused QKV */
        {"self_attn.qkv_proj.weight",           "attn_qkv.weight"},
        {"self_attn.qkv_proj.bias",             "attn_qkv.bias"},
        /* kv_a / kv_b projections */
        {"self_attn.kv_a_proj_with_mqa.weight", "attn_kv_a_mqa.weight"},
        {"self_attn.kv_b_proj.weight",          "attn_kv_b.weight"},
        /* MLP */
        {"mlp.gate_proj.weight",                "ffn_gate.weight"},
        {"mlp.up_proj.weight",                  "ffn_up.weight"},
        {"mlp.down_proj.weight",                "ffn_down.weight"},
        {"mlp.gate_proj.bias",                  "ffn_gate.bias"},
        {"mlp.up_proj.bias",                    "ffn_up.bias"},
        {"mlp.down_proj.bias",                  "ffn_down.bias"},
        /* expert router and stacked expert tensors */
        {"mlp.gate.weight",                     "ffn_gate_inp.weight"},
        {"mlp.experts.gate_proj.weight",        "ffn_gate_exps.weight"},
        {"mlp.experts.up_proj.weight",          "ffn_up_exps.weight"},
        {"mlp.experts.down_proj.weight",        "ffn_down_exps.weight"},
        /* norms */
        {"input_layernorm.weight",              "attn_norm.weight"},
        {"post_attention_layernorm.weight",     "ffn_norm.weight"},
        {"input_layernorm.bias",                "attn_norm.bias"},
        {"post_attention_layernorm.bias",       "ffn_norm.bias"},
        {"pre_feedforward_layernorm.weight",    "ffn_norm.weight"},
        {"post_feedforward_layernorm.weight",   "ffn_post_norm.weight"},
        {"self_attn.q_norm.weight",             "attn_q_norm.weight"},
        {"self_attn.k_norm.weight",             "attn_k_norm.weight"},
        /* linear-attention tensors */
        {"linear_attn.in_proj_qkv.weight",      "ssm_in_qkv.weight"},
        {"linear_attn.in_proj_z.weight",        "ssm_in_z.weight"},
        {"linear_attn.in_proj_a.weight",        "ssm_in_a.weight"},
        {"linear_attn.in_proj_b.weight",        "ssm_in_b.weight"},
        {"linear_attn.out_proj.weight",         "ssm_out.weight"},
        {"linear_attn.conv1d.weight",           "ssm_conv1d.weight"},
        {"linear_attn.norm.weight",             "ssm_norm.weight"},
        {"linear_attn.A_log",                   "ssm_a"},
        {"linear_attn.dt_bias",                 "ssm_dt.bias"},
        {NULL, NULL}
    };

    static const NameMap expert_maps[] = {
        {"gate_proj.weight", "ffn_gate_exp.weight"},
        {"up_proj.weight",   "ffn_up_exp.weight"},
        {"down_proj.weight", "ffn_down_exp.weight"},
        {NULL, NULL}
    };

    static const char *const layer_roots[] = {
        "model.layers.",
        "model.language_model.layers.",
    };

    if (!hf_name || !gguf_name || buflen <= 0)
        return;

    /* Default: pass the name through (truncating, always terminated). */
    snprintf(gguf_name, (size_t)buflen, "%s", hf_name);

    /* 1. Global, non-layer tensors. */
    for (int m = 0; global_maps[m].from; m++) {
        if (strcmp(hf_name, global_maps[m].from) == 0) {
            snprintf(gguf_name, (size_t)buflen, "%s", global_maps[m].to);
            return;
        }
    }

    /* 2. Per-layer tensors: locate the text after the layer root prefix. */
    const char *layer_tail = NULL;
    for (size_t r = 0; r < sizeof(layer_roots) / sizeof(layer_roots[0]); r++) {
        const size_t root_len = strlen(layer_roots[r]);
        if (strncmp(hf_name, layer_roots[r], root_len) == 0) {
            layer_tail = hf_name + root_len;
            break;
        }
    }
    if (!layer_tail)
        return;

    int layer_idx;
    char rest[256];           /* sized to match the %255s width below */
    if (sscanf(layer_tail, "%d.%255s", &layer_idx, rest) != 2)
        return;

    /* 2a. Known per-layer tensor. */
    for (int m = 0; layer_maps[m].from; m++) {
        if (strcmp(rest, layer_maps[m].from) == 0) {
            snprintf(gguf_name, (size_t)buflen, "blk.%d.%s",
                     layer_idx, layer_maps[m].to);
            return;
        }
    }

    /* 2b. Per-expert tensor: "mlp.experts.E.<proj>". */
    int expert_idx;
    char expert_rest[256];    /* sized to match the %255s width below */
    if (sscanf(rest, "mlp.experts.%d.%255s", &expert_idx, expert_rest) == 2) {
        for (int m = 0; expert_maps[m].from; m++) {
            if (strcmp(expert_rest, expert_maps[m].from) == 0) {
                snprintf(gguf_name, (size_t)buflen, "blk.%d.%s.%d",
                         layer_idx, expert_maps[m].to, expert_idx);
                return;
            }
        }
    }

    /* 2c. Unknown per-layer tensor: keep the suffix under the block prefix. */
    snprintf(gguf_name, (size_t)buflen, "blk.%d.%s", layer_idx, rest);
}
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| static int should_quantize(const STTensorInfo *ti, const char *gguf_name) |
| { |
| |
| if (ti->n_dims < 2) return 0; |
|
|
| |
| if (strstr(gguf_name, "token_embd") != NULL) return 0; |
|
|
| |
| |
| if (strcmp(gguf_name, "output.weight") == 0) return 0; |
|
|
| |
| if (strstr(gguf_name, "norm") != NULL) return 0; |
|
|
| |
| if (strstr(gguf_name, ".bias") != NULL) return 0; |
|
|
| |
| if (strstr(gguf_name, "ffn_gate_inp") != NULL) return 0; |
|
|
| |
| if (strstr(gguf_name, "ssm_a") != NULL) return 0; |
| if (strstr(gguf_name, "ssm_dt") != NULL) return 0; |
| if (strstr(gguf_name, "ssm_conv1d") != NULL) return 0; |
|
|
| |
| return 1; |
| } |
|
|
| |
| |
| |
| |
/* Returns 1 when `gguf_name` contains one of the attention-class weight
 * names (GGUF attn_* names, selected ssm_* projections, or the unmapped
 * Hugging Face self_attn.*_proj names), otherwise 0. */
static int is_attention_tensor(const char *gguf_name)
{
    static const char *const attention_markers[] = {
        /* GGUF names */
        "attn_q.weight",
        "attn_k.weight",
        "attn_v.weight",
        "attn_output.weight",
        "attn_qkv.weight",
        /* linear-attention projections */
        "ssm_in_qkv.weight",
        "ssm_in_z.weight",
        "ssm_out.weight",
        /* names left in Hugging Face form */
        "self_attn.q_proj.weight",
        "self_attn.k_proj.weight",
        "self_attn.v_proj.weight",
        "self_attn.o_proj.weight",
    };
    const size_t n_markers = sizeof(attention_markers) / sizeof(attention_markers[0]);

    for (size_t k = 0; k < n_markers; k++) {
        if (strstr(gguf_name, attention_markers[k]))
            return 1;
    }
    return 0;
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
/* Six scale multipliers, from 0.60 to 1.40 with 1.00 as the fourth entry.
 * NOTE(review): not referenced by the code visible in this chunk. */
#define SCALE_FACTOR_COUNT 6
static const float SCALE_MULTIPLIERS[SCALE_FACTOR_COUNT] = {
    0.60f,
    0.75f,
    0.90f,
    1.00f,
    1.15f,
    1.40f
};
|
|
| |
| |
| |
| |
/* Each sampled weight block is represented by QUHITS_PER_BLOCK graph sites. */
#define QUHITS_PER_BLOCK 2
/* Candidate grid: N_CAND_D x N_CAND_M scale multipliers per block. */
#define N_CAND_D 16
#define N_CAND_M 16
#define TOTAL_SCALE_CANDIDATES (N_CAND_D * N_CAND_M)

/* Candidate multipliers, filled lazily by init_scale_table(). */
static float SCALE_TABLE[TOTAL_SCALE_CANDIDATES];
static int scale_table_initialized = 0;

/* Fills SCALE_TABLE with TOTAL_SCALE_CANDIDATES evenly spaced multipliers
 * starting at 0.50 with a step of 1 / (TOTAL_SCALE_CANDIDATES - 1), so the
 * last entry is ~1.50. Subsequent calls are no-ops. */
static void init_scale_table(void)
{
    if (!scale_table_initialized) {
        for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
            SCALE_TABLE[c] = 0.50f + (float)c * (1.00f / (float)(TOTAL_SCALE_CANDIDATES - 1));
        scale_table_initialized = 1;
    }
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| static void hpc_reset_for_subblock(HPCGraph *g, uint64_t n_sites) |
| { |
| |
| g->n_edges = 0; |
| g->cz_edges = 0; |
| g->phase_edges = 0; |
| g->syntheme_edges = 0; |
| g->n_log = 0; |
| g->min_fidelity = 1.0; |
| g->avg_fidelity = 1.0; |
| g->amp_evals = 0; |
| g->prob_evals = 0; |
| g->measurements = 0; |
|
|
| |
| for (uint64_t i = 0; i < n_sites; i++) { |
| g->adj[i].count = 0; |
| } |
|
|
| |
| for (uint64_t i = 0; i < n_sites; i++) |
| triality_init(&g->locals[i]); |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| static inline float fast_pow_2_4(float x) |
| { |
| |
| |
| |
| |
| |
| float x2 = x * x; |
| return x2 * sqrtf(cbrtf(x2)); |
| } |
|
|
| |
| |
/* Importance-weighted squared reconstruction error of one block when it is
 * quantized to 4 levels (0..3) with the block's min and range both multiplied
 * by `scale_mult`.
 *
 * weights    : `block_size` values of the block.
 * importance : optional per-element weights (NULL = all 1.0), indexed from
 *              `imp_offset`.
 * Returns 0 when the scaled range is below 1e-15. The minimum is clamped to
 * at most 0 before scaling. */
static float compute_block_error_q2k(const float *weights, int block_size,
                                     float scale_mult,
                                     const float *importance, int imp_offset)
{
    float lo = weights[0];
    float hi = weights[0];
    for (int k = 1; k < block_size; ++k) {
        const float v = weights[k];
        if (v < lo) lo = v;
        if (v > hi) hi = v;
    }
    if (lo > 0) lo = 0;

    const float span = (hi - lo) * scale_mult;
    if (span < 1e-15f)
        return 0.0f;
    const float levels_per_unit = 3.0f / span;

    float total = 0.0f;
    for (int k = 0; k < block_size; ++k) {
        const float v = weights[k];

        /* Round to the nearest of the four levels, then clamp. */
        int level = (int)((v - lo * scale_mult) * levels_per_unit + 0.5f);
        level = (level < 0) ? 0 : level;
        level = (level > 3) ? 3 : level;

        const float recon = lo * scale_mult + (float)level * span / 3.0f;
        const float delta = v - recon;
        const float weight = importance ? importance[imp_offset + k] : 1.0f;
        total += delta * delta * weight;
    }
    return total;
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| static HPCGraph *build_sensitivity_graph(const float *weights, |
| int64_t n_elements, |
| int block_size, |
| float temperature, |
| const float *importance) |
| { |
| int64_t n_blocks = n_elements / block_size; |
| if (n_blocks < 2) return NULL; |
|
|
| init_scale_table(); |
|
|
| int64_t graph_blocks = (n_blocks > 8192) ? 8192 : n_blocks; |
| int64_t stride = n_blocks / graph_blocks; |
| int64_t n_sites = graph_blocks * QUHITS_PER_BLOCK; |
|
|
| HPCGraph *graph = hpc_create(n_sites); |
| if (!graph) return NULL; |
|
|
| for (int64_t i = 0; i < n_sites; i++) |
| triality_dft(&graph->locals[i]); |
|
|
| |
| |
| for (int64_t i = 0; i < graph_blocks; i++) { |
| int64_t block_idx = i * stride; |
| const float *block_weights = weights + block_idx * block_size; |
|
|
| |
| float errors[TOTAL_SCALE_CANDIDATES]; |
| float min_err = 1e30f; |
| for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) { |
| errors[c] = compute_block_error_q2k(block_weights, block_size, |
| SCALE_TABLE[c], |
| importance, |
| (int)(block_idx * block_size)); |
| if (errors[c] < min_err) min_err = errors[c]; |
| } |
|
|
| |
| |
| double coarse_re[6], coarse_im[6]; |
| double coarse_norm = 0.0; |
| for (int v0 = 0; v0 < 6; v0++) { |
| coarse_re[v0] = 0.0; |
| coarse_im[v0] = 0.0; |
| for (int v1 = 0; v1 < 6; v1++) { |
| int idx = v0 * 6 + v1; |
| coarse_re[v0] += exp(-(double)(errors[idx] - min_err) / |
| (2.0 * (double)temperature)); |
| } |
| coarse_norm += coarse_re[v0] * coarse_re[v0]; |
| } |
| if (coarse_norm > 1e-30) { |
| double inv = 1.0 / sqrt(coarse_norm); |
| for (int v = 0; v < 6; v++) coarse_re[v] *= inv; |
| } |
|
|
| |
| |
| double fine_re[6], fine_im[6]; |
| double fine_norm = 0.0; |
| for (int v1 = 0; v1 < 6; v1++) { |
| fine_re[v1] = 0.0; |
| fine_im[v1] = 0.0; |
| for (int v0 = 0; v0 < 6; v0++) { |
| int idx = v0 * 6 + v1; |
| fine_re[v1] += exp(-(double)(errors[idx] - min_err) / |
| (2.0 * (double)temperature)); |
| } |
| fine_norm += fine_re[v1] * fine_re[v1]; |
| } |
| if (fine_norm > 1e-30) { |
| double inv = 1.0 / sqrt(fine_norm); |
| for (int v = 0; v < 6; v++) fine_re[v] *= inv; |
| } |
|
|
| |
| int64_t s_coarse = 2 * i; |
| for (int v = 0; v < 6; v++) { |
| graph->locals[s_coarse].edge_re[v] = coarse_re[v]; |
| graph->locals[s_coarse].edge_im[v] = 0.0; |
| } |
| graph->locals[s_coarse].primary = VIEW_EDGE; |
| graph->locals[s_coarse].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED; |
| graph->locals[s_coarse].delta_valid = 0; |
| triality_update_mask(&graph->locals[s_coarse]); |
|
|
| |
| int64_t s_fine = 2 * i + 1; |
| for (int v = 0; v < 6; v++) { |
| graph->locals[s_fine].edge_re[v] = fine_re[v]; |
| graph->locals[s_fine].edge_im[v] = 0.0; |
| } |
| graph->locals[s_fine].primary = VIEW_EDGE; |
| graph->locals[s_fine].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED; |
| graph->locals[s_fine].delta_valid = 0; |
| triality_update_mask(&graph->locals[s_fine]); |
| } |
|
|
| |
| for (int64_t i = 0; i < graph_blocks; i++) { |
| |
| hpc_cz(graph, 2 * i, 2 * i + 1); |
|
|
| |
| if (i + 1 < graph_blocks) { |
| hpc_cz(graph, 2 * i, 2 * (i + 1)); |
| hpc_cz(graph, 2 * i + 1, 2 * (i + 1) + 1); |
| } |
| } |
|
|
| return graph; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
/* Tunables for mse_grid_search_q2k_subblock(). */
typedef struct {
    float maxshrink;  /* fraction of the grid that is searched: steps = maxshrink * grid */
    int   grid;       /* shrink resolution: candidate k uses factor 1 - k / grid */
    int   patience;   /* stop after this many consecutive non-improving candidates */
    float norm;       /* exponent applied to the absolute error */
} MSEGridConfig;

/* Defaults: 40 shrink steps (0.20 * 200), patience 8, error norm 2.4. */
static const MSEGridConfig MSE_DEFAULT_CONFIG = {
    .maxshrink = 0.20f,
    .grid      = 200,
    .patience  = 8,
    .norm      = 2.4f
};
|
|
| |
| |
| |
| |
| static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax, |
| uint8_t *L, float *out_min, |
| const float *importance, |
| const MSEGridConfig *cfg) |
| { |
| float min_val = x[0], max_val = x[0]; |
| for (int i = 1; i < n; i++) { |
| if (x[i] < min_val) min_val = x[i]; |
| if (x[i] > max_val) max_val = x[i]; |
| } |
| if (max_val == min_val) { |
| for (int i = 0; i < n; i++) L[i] = 0; |
| *out_min = -min_val; |
| return 0.0f; |
| } |
| if (min_val > 0) min_val = 0; |
|
|
| float best_scale = 0.0f; |
| float best_min = -min_val; |
| float best_error = 1e30f; |
| int no_improve = 0; |
|
|
| int shrink_steps = (int)(cfg->maxshrink * cfg->grid); |
| if (shrink_steps < 1) shrink_steps = 1; |
|
|
| for (int step = 0; step <= shrink_steps; step++) { |
| float p = 1.0f - (float)step / (float)cfg->grid; |
|
|
| float cand_min = p * min_val; |
| float cand_max = p * max_val; |
|
|
| if (cand_max <= cand_min) continue; |
|
|
| float iscale = (float)nmax / (cand_max - cand_min); |
| float scale = 1.0f / iscale; |
|
|
| |
| float err = 0.0f; |
| uint8_t tmp_L[256]; |
| for (int i = 0; i < n; i++) { |
| int l = gguf_nearest_int(iscale * (x[i] - cand_min)); |
| if (l < 0) l = 0; |
| if (l > nmax) l = nmax; |
| tmp_L[i] = (uint8_t)l; |
|
|
| float deq = cand_min + scale * (float)l; |
| float diff = fabsf(x[i] - deq); |
| |
| float e = diff; |
| if (cfg->norm == 2.4f) { |
| e = fast_pow_2_4(diff); |
| } else if (cfg->norm != 1.0f) { |
| e = powf(diff, cfg->norm); |
| } |
| |
| if (importance) e *= importance[i]; |
| err += e; |
| } |
|
|
| if (err < best_error) { |
| best_error = err; |
| best_scale = scale; |
| best_min = -cand_min; |
| memcpy(L, tmp_L, n); |
| no_improve = 0; |
| } else { |
| no_improve++; |
| if (no_improve >= cfg->patience) break; |
| } |
| } |
|
|
| |
| float cur_min = -best_min; |
| float cur_scale = best_scale; |
| if (cur_scale > 1e-15f) { |
| float iscale = 1.0f / cur_scale; |
| for (int itry = 0; itry < 3; itry++) { |
| float sumlx = 0; |
| int suml2 = 0; |
| for (int i = 0; i < n; i++) { |
| int l = gguf_nearest_int(iscale * (x[i] - cur_min)); |
| if (l < 0) l = 0; |
| if (l > nmax) l = nmax; |
| L[i] = (uint8_t)l; |
| sumlx += (x[i] - cur_min) * l; |
| suml2 += l * l; |
| } |
| if (suml2 > 0) cur_scale = sumlx / suml2; |
| float sum = 0; |
| for (int i = 0; i < n; i++) |
| sum += x[i] - cur_scale * L[i]; |
| cur_min = 0.7f * cur_min + 0.3f * sum / n; |
| if (cur_min > 0) cur_min = 0; |
| if (cur_scale > 1e-15f) iscale = 1.0f / cur_scale; |
| } |
| } |
|
|
| *out_min = -cur_min; |
| return cur_scale; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
/* Weighted scale/min search for asymmetric quantization of `n` values.
 *
 * x, w    : values and their non-negative weights.
 * nmax    : largest level (levels are 0..nmax).
 * L       : receives the chosen levels.
 * the_min : receives the negated offset (value ~ scale*L - *the_min).
 * Laux    : caller-provided scratch of `n` bytes.
 * Returns the scale (0 when all values equal the clamped minimum).
 *
 * Starts from the plain min/max fit, then tries 16 slightly different inverse
 * scales; for each, solves the weighted least-squares (scale, min) for the
 * resulting levels and keeps it if the weighted absolute error improves. */
static float hpc_make_qkx2_quants(int n, int nmax, const float *x,
                                  const float *w, uint8_t *L,
                                  float *the_min, uint8_t *Laux)
{
    float lo = x[0];
    float hi = x[0];
    float w_total = w[0];
    float wx_total = w[0] * x[0];
    for (int k = 1; k < n; k++) {
        if (x[k] < lo) lo = x[k];
        if (x[k] > hi) hi = x[k];
        w_total += w[k];
        wx_total += w[k] * x[k];
    }
    if (lo > 0) lo = 0;

    if (hi == lo) {
        for (int k = 0; k < n; k++) L[k] = 0;
        *the_min = -lo;
        return 0.0f;
    }

    /* Baseline: straightforward min/max quantization. */
    const float base_iscale = (float)nmax / (hi - lo);
    float best_scale = 1.0f / base_iscale;
    float best_mad = 0;
    for (int k = 0; k < n; k++) {
        int level = gguf_nearest_int(base_iscale * (x[k] - lo));
        level = (level < 0) ? 0 : level;
        level = (level > nmax) ? nmax : level;
        L[k] = (uint8_t)level;
        const float resid = best_scale * (float)level + lo - x[k];
        best_mad += w[k] * fabsf(resid);
    }

    /* Perturbed inverse scales; `lo` is updated in place whenever a trial wins. */
    for (int trial = 0; trial <= 15; trial++) {
        const float trial_iscale =
            (-0.5f + 0.1f * (float)trial + (float)nmax) / (hi - lo);

        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
        for (int k = 0; k < n; k++) {
            int level = gguf_nearest_int(trial_iscale * (x[k] - lo));
            level = (level < 0) ? 0 : level;
            level = (level > nmax) ? nmax : level;
            Laux[k] = (uint8_t)level;
            sum_l  += w[k] * (float)level;
            sum_l2 += w[k] * (float)(level * level);
            sum_xl += w[k] * (float)level * x[k];
        }

        const float det = w_total * sum_l2 - sum_l * sum_l;
        if (!(det > 0))
            continue;

        float fit_scale = (w_total * sum_xl - wx_total * sum_l) / det;
        float fit_min   = (sum_l2 * wx_total - sum_l * sum_xl) / det;
        if (fit_min > 0) {
            /* Offset must stay <= 0: fall back to the zero-offset solution. */
            fit_min = 0;
            fit_scale = sum_xl / sum_l2;
        }

        float mad = 0;
        for (int k = 0; k < n; k++) {
            const float resid = fit_scale * (float)Laux[k] + fit_min - x[k];
            mad += w[k] * fabsf(resid);
        }

        if (mad < best_mad) {
            for (int k = 0; k < n; k++) L[k] = Laux[k];
            best_mad = mad;
            best_scale = fit_scale;
            lo = fit_min;
        }
    }

    *the_min = -lo;
    return best_scale;
}
|
|
| |
| |
/* Weighted quantization of `n` values to levels 0..nmax with a single scale
 * and no offset; only positive entries contribute to the initial maximum.
 *
 * x  : input values.
 * sw : per-element weights.
 * L  : receives the chosen levels.
 * Returns the weighted least-squares scale (0 when the maximum is < 1e-15).
 *
 * Three stages: (1) pick the best of nine inverse scales around nmax/max by
 * weighted squared error, (2) quantize with it, (3) up to five sweeps of
 * per-element level changes that are accepted only when they improve the fit. */
static float hpc_make_qp_quants(int n, int nmax, const float *x,
                                uint8_t *L, const float *sw)
{
    float peak = 0;
    for (int k = 0; k < n; k++) {
        if (x[k] > peak) peak = x[k];
    }
    if (peak < 1e-15f) {
        for (int k = 0; k < n; k++) L[k] = 0;
        return 0.0f;
    }

    /* Stage 1a: baseline error with iscale = nmax / peak. */
    float chosen_iscale = (float)nmax / peak;
    for (int k = 0; k < n; k++) {
        int level = gguf_nearest_int(chosen_iscale * x[k]);
        level = (level < 0) ? 0 : level;
        level = (level > nmax) ? nmax : level;
        L[k] = (uint8_t)level;
    }
    const float base_scale = 1.0f / chosen_iscale;
    float best_mse = 0;
    for (int k = 0; k < n; k++) {
        const float resid = x[k] - base_scale * (float)L[k];
        best_mse += sw[k] * resid * resid;
    }

    /* Stage 1b: eight neighbouring inverse scales (offset 0 is the baseline). */
    for (int shift = -4; shift <= 4; shift++) {
        if (shift == 0)
            continue;
        const float trial_iscale = (0.1f * (float)shift + (float)nmax) / peak;
        const float trial_scale = 1.0f / trial_iscale;
        float mse = 0;
        for (int k = 0; k < n; k++) {
            int level = gguf_nearest_int(trial_iscale * x[k]);
            level = (level < 0) ? 0 : level;
            level = (level > nmax) ? nmax : level;
            const float resid = x[k] - trial_scale * (float)level;
            mse += sw[k] * resid * resid;
        }
        if (mse < best_mse) {
            best_mse = mse;
            chosen_iscale = trial_iscale;
        }
    }

    /* Stage 2: quantize with the winner and accumulate the normal-equation sums. */
    float sumlx = 0, suml2 = 0;
    for (int k = 0; k < n; k++) {
        int level = gguf_nearest_int(chosen_iscale * x[k]);
        level = (level < 0) ? 0 : level;
        level = (level > nmax) ? nmax : level;
        L[k] = (uint8_t)level;
        sumlx += sw[k] * x[k] * (float)level;
        suml2 += sw[k] * (float)(level * level);
    }

    /* Stage 3: change one level at a time while the others stay fixed. */
    for (int sweep = 0; sweep < 5; sweep++) {
        int n_changed = 0;
        for (int k = 0; k < n; k++) {
            const float wk = sw[k];
            /* Sums with element k removed. */
            float slx = sumlx - wk * x[k] * (float)L[k];
            float sl2 = suml2 - wk * (float)(L[k] * L[k]);
            if (!(slx > 0 && sl2 > 0))
                continue;

            int candidate = gguf_nearest_int(x[k] * sl2 / slx);
            candidate = (candidate < 0) ? 0 : candidate;
            candidate = (candidate > nmax) ? nmax : candidate;
            if (candidate == L[k])
                continue;

            slx += wk * x[k] * (float)candidate;
            sl2 += wk * (float)(candidate * candidate);
            /* Accept only if slx^2 / sl2 grows, i.e. the fit improves. */
            if (slx * slx * suml2 > sumlx * sumlx * sl2) {
                L[k] = (uint8_t)candidate;
                sumlx = slx;
                suml2 = sl2;
                n_changed++;
            }
        }
        if (!n_changed)
            break;
    }

    return suml2 > 0 ? sumlx / suml2 : 0.0f;
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
/* Real and imaginary parts of the six 6th roots of unity,
 * W6[k] = exp(2*pi*i*k/6), for k = 0..5. */
static const double W6_RE[6] = {
    1.0, 0.5, -0.5, -1.0, -0.5, 0.5
};
static const double W6_IM[6] = {
    0.0,
    0.866025403784438647,
    0.866025403784438647,
    0.0,
    -0.866025403784438647,
    -0.866025403784438647
};
/* 1 / sqrt(6). */
static const double INV_SQRT6 = 0.40824829046386301637;
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| static void shor_collapse_site(HPCGraph *graph, int target_site, int outcome) |
| { |
| |
| for (int v = 0; v < 6; v++) { |
| graph->locals[target_site].edge_re[v] = (v == outcome) ? 1.0 : 0.0; |
| graph->locals[target_site].edge_im[v] = 0.0; |
| } |
| graph->locals[target_site].primary = VIEW_EDGE; |
| graph->locals[target_site].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED; |
| graph->locals[target_site].delta_valid = 0; |
|
|
| |
| |
| |
| |
| HPCAdjList *adj = &graph->adj[target_site]; |
| for (uint64_t ei = 0; ei < adj->count; ei++) { |
| uint64_t eid = adj->edge_ids[ei]; |
| HPCEdge *edge = &graph->edges[eid]; |
| uint64_t partner = (edge->site_a == (uint64_t)target_site) ? |
| edge->site_b : edge->site_a; |
|
|
| TrialityQuhit *pq = &graph->locals[partner]; |
| for (int d = 0; d < 6; d++) { |
| double w_re, w_im; |
| if (edge->type == HPC_EDGE_CZ) { |
| int pidx = (outcome * d) % 6; |
| w_re = HPC_W6_RE[pidx]; |
| w_im = HPC_W6_IM[pidx]; |
| } else { |
| |
| if (edge->site_a == (uint64_t)target_site) { |
| w_re = edge->w_re[outcome][d]; |
| w_im = edge->w_im[outcome][d]; |
| } else { |
| w_re = edge->w_re[d][outcome]; |
| w_im = edge->w_im[d][outcome]; |
| } |
| } |
| double old_re = pq->edge_re[d], old_im = pq->edge_im[d]; |
| pq->edge_re[d] = old_re * w_re - old_im * w_im; |
| pq->edge_im[d] = old_re * w_im + old_im * w_re; |
| } |
| pq->dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED; |
| pq->delta_valid = 0; |
| } |
|
|
| |
| |
| for (uint64_t ei = 0; ei < adj->count; ei++) { |
| uint64_t eid = adj->edge_ids[ei]; |
| HPCEdge *edge = &graph->edges[eid]; |
| uint64_t partner = (edge->site_a == (uint64_t)target_site) ? |
| edge->site_b : edge->site_a; |
|
|
| |
| HPCAdjList *padj = &graph->adj[partner]; |
| for (uint64_t pi = 0; pi < padj->count; pi++) { |
| if (padj->edge_ids[pi] == eid) { |
| padj->edge_ids[pi] = padj->edge_ids[--padj->count]; |
| break; |
| } |
| } |
| edge->fidelity = -1.0; |
| } |
| adj->count = 0; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| static void shor_measure_graph(HPCGraph *graph, int64_t n_sites, |
| double (*marg_out)[6], int *measured_out, |
| int deterministic) |
| { |
| |
| for (int64_t k = n_sites - 1; k >= 0; k--) { |
| int site_k = (int)k; |
|
|
| |
| |
| |
| |
| |
| double theta_k = 0.0; |
| { |
| double power = 36.0; |
| for (int64_t j = k + 1; j < n_sites; j++) { |
| theta_k += (double)measured_out[j] / power; |
| power *= 6.0; |
| } |
| } |
|
|
| |
| |
| |
| double ck_re[6], ck_im[6]; |
| for (int d = 0; d < 6; d++) { ck_re[d] = 1.0; ck_im[d] = 0.0; } |
|
|
| const HPCAdjList *adj = &graph->adj[site_k]; |
| for (uint64_t ei = 0; ei < adj->count; ei++) { |
| uint64_t eid = adj->edge_ids[ei]; |
| const HPCEdge *edge = &graph->edges[eid]; |
| if (edge->fidelity < 0.0) continue; |
| uint64_t partner = (edge->site_a == (uint64_t)site_k) ? |
| edge->site_b : edge->site_a; |
|
|
| const TrialityQuhit *pq = &graph->locals[partner]; |
| for (int d = 0; d < 6; d++) { |
| double sr = 0, si = 0; |
| for (int w = 0; w < 6; w++) { |
| double lr = pq->edge_re[w], li = pq->edge_im[w]; |
| double wr, wi; |
| if (edge->type == HPC_EDGE_CZ) { |
| int pidx = (d * w) % 6; |
| wr = HPC_W6_RE[pidx]; wi = HPC_W6_IM[pidx]; |
| } else if (edge->site_a == (uint64_t)site_k) { |
| wr = edge->w_re[d][w]; wi = edge->w_im[d][w]; |
| } else { |
| wr = edge->w_re[w][d]; wi = edge->w_im[w][d]; |
| } |
| sr += lr*wr - li*wi; |
| si += lr*wi + li*wr; |
| } |
| double nr = ck_re[d]*sr - ck_im[d]*si; |
| double ni = ck_re[d]*si + ck_im[d]*sr; |
| ck_re[d] = nr; ck_im[d] = ni; |
| } |
| } |
|
|
| |
| for (int d = 0; d < 6; d++) { |
| double re = graph->locals[site_k].edge_re[d]; |
| double im = graph->locals[site_k].edge_im[d]; |
| graph->locals[site_k].edge_re[d] = re*ck_re[d] - im*ck_im[d]; |
| graph->locals[site_k].edge_im[d] = re*ck_im[d] + im*ck_re[d]; |
| } |
|
|
| |
| for (int d = 0; d < 6; d++) { |
| double angle = -2.0 * 3.14159265358979323846 * d * theta_k; |
| double pr = cos(angle), pi2 = sin(angle); |
| double re = graph->locals[site_k].edge_re[d]; |
| double im = graph->locals[site_k].edge_im[d]; |
| graph->locals[site_k].edge_re[d] = re*pr - im*pi2; |
| graph->locals[site_k].edge_im[d] = re*pi2 + im*pr; |
| } |
|
|
| |
| |
| |
| |
| |
| { |
| double alpha_re[6], alpha_im[6]; |
| for (int d = 0; d < 6; d++) { |
| alpha_re[d] = graph->locals[site_k].edge_re[d]; |
| alpha_im[d] = graph->locals[site_k].edge_im[d]; |
| } |
| for (int v = 0; v < 6; v++) { |
| double sum_re = 0.0, sum_im = 0.0; |
| for (int d = 0; d < 6; d++) { |
| double angle = 2.0 * 3.14159265358979323846 * d * v / 6.0; |
| double er = cos(angle), ei = sin(angle); |
| sum_re += alpha_re[d]*er - alpha_im[d]*ei; |
| sum_im += alpha_re[d]*ei + alpha_im[d]*er; |
| } |
| graph->locals[site_k].edge_re[v] = sum_re * INV_SQRT6; |
| graph->locals[site_k].edge_im[v] = sum_im * INV_SQRT6; |
| } |
| } |
|
|
| |
| double probs[6]; |
| double total = 0.0; |
| for (int v = 0; v < 6; v++) { |
| probs[v] = graph->locals[site_k].edge_re[v] * graph->locals[site_k].edge_re[v] + |
| graph->locals[site_k].edge_im[v] * graph->locals[site_k].edge_im[v]; |
| total += probs[v]; |
| } |
| if (total > 1e-30) { |
| for (int v = 0; v < 6; v++) probs[v] /= total; |
| } else { |
| for (int v = 0; v < 6; v++) probs[v] = 1.0 / 6.0; |
| } |
|
|
| |
| for (int v = 0; v < 6; v++) |
| marg_out[k][v] = probs[v]; |
|
|
| |
| |
| |
| int outcome; |
| if (deterministic) { |
| outcome = 0; |
| double max_p = probs[0]; |
| for (int v = 1; v < 6; v++) { |
| if (probs[v] > max_p) { max_p = probs[v]; outcome = v; } |
| } |
| } else { |
| |
| static unsigned int shor_rng = 271828; |
| shor_rng = shor_rng * 1664525u + 1013904223u; |
| double r01 = (double)(shor_rng >> 8) / 16777216.0; |
| double cumul = 0.0; |
| outcome = 5; |
| for (int v = 0; v < 6; v++) { |
| cumul += probs[v]; |
| if (r01 <= cumul) { outcome = v; break; } |
| } |
| } |
|
|
| measured_out[k] = outcome; |
|
|
| |
| |
| shor_collapse_site(graph, site_k, outcome); |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
#define Q4_N_CAND  16   /* candidate scale multipliers evaluated per Q4_0 block */
#define Q4_N_BEAMS 24   /* beam width of the candidate beam search */

/*
 * Multipliers applied to a block's refined scale to form its candidate
 * scales, in ascending order from -10% to +10%.  Each row below holds the
 * candidates that share one of the six bins in Q4_CAND_TO_QUHIT.
 */
static const float Q4_NEIGHBOR_MULTS[Q4_N_CAND] = {
    0.900f, 0.915f, 0.930f,     /* bin 0 */
    0.945f, 0.955f, 0.965f,     /* bin 1 */
    0.975f, 0.985f,             /* bin 2 */
    0.995f, 1.005f, 1.015f,     /* bin 3 */
    1.025f, 1.035f, 1.050f,     /* bin 4 */
    1.070f, 1.100f              /* bin 5 */
};

/* Candidate index -> six-level bin (0..5); bins hold 3,3,2,3,3,2 candidates. */
static const int Q4_CAND_TO_QUHIT[Q4_N_CAND] = {
    0, 0, 0,
    1, 1, 1,
    2, 2,
    3, 3, 3,
    4, 4, 4,
    5, 5
};
|
|
| static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, |
| BlockQ4_0 *output, float *out_total_error, |
| const float *imat_importance, int verbose) |
| { |
| int64_t n_blocks = n_elements / QK4_0; |
| float total_err = 0.0f; |
|
|
| |
| float *greedy_d = (float *)calloc(n_blocks, sizeof(float)); |
|
|
| #pragma omp parallel for schedule(dynamic, 64) |
| for (int64_t blk = 0; blk < n_blocks; blk++) { |
| const float *bw = weights + blk * QK4_0; |
| float amax = 0.0f; |
| for (int j = 0; j < QK4_0; j++) { |
| float av = fabsf(bw[j]); |
| if (av > amax) amax = av; |
| } |
| greedy_d[blk] = amax / 7.0f; |
| } |
|
|
| |
| |
| |
| float (*cand_errors)[Q4_N_CAND] = (float (*)[Q4_N_CAND]) |
| calloc(n_blocks, sizeof(float[Q4_N_CAND])); |
| uint16_t (*cand_d16)[Q4_N_CAND] = (uint16_t (*)[Q4_N_CAND]) |
| calloc(n_blocks, sizeof(uint16_t[Q4_N_CAND])); |
|
|
| for (int64_t blk = 0; blk < n_blocks; blk++) { |
| const float *bw = weights + blk * QK4_0; |
|
|
| |
| float wls_d = greedy_d[blk]; |
| for (int ls_iter = 0; ls_iter < 3; ls_iter++) { |
| if (wls_d < 1e-15f) break; |
| float inv_d = 1.0f / wls_d; |
| float num = 0.0f, den = 0.0f; |
| for (int j = 0; j < QK4_0; j++) { |
| int q = (int)(bw[j] * inv_d + 8.5f); |
| if (q < 0) q = 0; if (q > 15) q = 15; |
| float qc = (float)q - 8.0f; |
| float w = (imat_importance) ? |
| imat_importance[blk * QK4_0 + j] : 1.0f; |
| num += w * bw[j] * qc; |
| den += w * qc * qc; |
| } |
| if (den > 1e-15f) { |
| float d_new = num / den; |
| if (fabsf(d_new) < 4.0f * (greedy_d[blk] + 1e-10f)) |
| wls_d = gguf_fp16_to_fp32(gguf_fp32_to_fp16(d_new)); |
| } |
| } |
|
|
| |
| for (int ci = 0; ci < Q4_N_CAND; ci++) { |
| float trial_d = wls_d * Q4_NEIGHBOR_MULTS[ci]; |
| uint16_t d16 = gguf_fp32_to_fp16(trial_d); |
| float actual_d = gguf_fp16_to_fp32(d16); |
| cand_d16[blk][ci] = d16; |
|
|
| float id = (actual_d > 1e-15f) ? 1.0f / actual_d : 0.0f; |
| float err = 0.0f; |
|
|
| for (int j = 0; j < QK4_0; j += 6) { |
| int g_len = (j + 6 <= QK4_0) ? 6 : (QK4_0 - j); |
| int half_g = g_len / 2; |
| float e_cur[6], w_cur[6]; |
| |
| for (int kk = 0; kk < g_len; kk++) { |
| int idx = j + kk; |
| float x = bw[idx]; |
| int q = (int)(x * id + 8.5f); |
| if (q < 0) q = 0; if (q > 15) q = 15; |
| float deq = ((float)q - 8.0f) * actual_d; |
| e_cur[kk] = x - deq; |
| w_cur[kk] = (imat_importance) ? imat_importance[blk * QK4_0 + idx] : 1.0f; |
| } |
| |
| |
| float vesica_err = 0.0f, wave_err = 0.0f; |
| for (int p = 0; p < half_g; p++) { |
| float v = e_cur[p] + e_cur[p + half_g]; |
| float w_wave = e_cur[p] - e_cur[p + half_g]; |
| float w_avg = (w_cur[p] + w_cur[p + half_g]) * 0.5f; |
| vesica_err += v * v * w_avg; |
| wave_err += w_wave * w_wave * w_avg; |
| } |
| |
| |
| err += 0.5f * (4.0f * vesica_err + 1.0f * wave_err); |
| } |
| cand_errors[blk][ci] = err; |
| } |
| } |
|
|
| |
| int *best_candidate = (int *)malloc(n_blocks * sizeof(int)); |
| for (int64_t i = 0; i < n_blocks; i++) |
| best_candidate[i] = 10; |
|
|
| if (n_blocks >= 2) { |
| float temperature = 0.5f; |
| int64_t graph_blocks = (n_blocks > 200) ? 200 : n_blocks; |
| int64_t stride = n_blocks / graph_blocks; |
| int64_t n_sites = graph_blocks; |
|
|
| HPCGraph *graph = hpc_create(n_sites); |
| if (graph) { |
| for (int64_t i = 0; i < n_sites; i++) |
| triality_dft(&graph->locals[i]); |
|
|
| |
| { |
| double err_accum = 0.0; |
| int err_count = 0; |
| for (int64_t gi = 0; gi < graph_blocks && gi < 100; gi++) { |
| int64_t blk = gi * stride; |
| float max_e = 0.0f; |
| for (int c = 0; c < Q4_N_CAND; c++) |
| if (cand_errors[blk][c] > max_e) |
| max_e = cand_errors[blk][c]; |
| err_accum += (double)max_e; |
| err_count++; |
| } |
| if (err_count > 0) { |
| temperature = (float)(err_accum / err_count) * 0.1f; |
| if (temperature < 1e-10f) temperature = 1e-10f; |
| } |
| } |
|
|
| |
| for (int64_t i = 0; i < graph_blocks; i++) { |
| |
| float agg_errors[Q4_N_CAND]; |
| for (int c = 0; c < Q4_N_CAND; c++) |
| agg_errors[c] = 0.0f; |
|
|
| int64_t blk_start = i * stride; |
| int64_t blk_end = blk_start + stride; |
| if (blk_end > n_blocks) blk_end = n_blocks; |
| int64_t group_size = blk_end - blk_start; |
|
|
| for (int64_t b = blk_start; b < blk_end; b++) { |
| for (int c = 0; c < Q4_N_CAND; c++) |
| agg_errors[c] += cand_errors[b][c]; |
| } |
| if (group_size > 1) { |
| float inv_gs = 1.0f / (float)group_size; |
| for (int c = 0; c < Q4_N_CAND; c++) |
| agg_errors[c] *= inv_gs; |
| } |
|
|
| float min_err = 1e30f; |
| for (int c = 0; c < Q4_N_CAND; c++) |
| if (agg_errors[c] < min_err) |
| min_err = agg_errors[c]; |
|
|
| double amp_re[6]; |
| double amp_norm = 0.0; |
| for (int qi = 0; qi < 6; qi++) amp_re[qi] = 0.0; |
| for (int ci = 0; ci < Q4_N_CAND; ci++) { |
| int qi = Q4_CAND_TO_QUHIT[ci]; |
| amp_re[qi] += exp(-(double)(agg_errors[ci] - min_err) / |
| (2.0 * (double)temperature)); |
| } |
| for (int qi = 0; qi < 6; qi++) |
| amp_norm += amp_re[qi] * amp_re[qi]; |
| if (amp_norm > 1e-30) { |
| double inv = 1.0 / sqrt(amp_norm); |
| for (int v = 0; v < 6; v++) amp_re[v] *= inv; |
| } |
|
|
| for (int v = 0; v < 6; v++) { |
| graph->locals[i].edge_re[v] = amp_re[v]; |
| graph->locals[i].edge_im[v] = 0.0; |
| } |
| graph->locals[i].primary = VIEW_EDGE; |
| graph->locals[i].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED; |
| graph->locals[i].delta_valid = 0; |
| triality_update_mask(&graph->locals[i]); |
| } |
|
|
| |
| for (int64_t i = 0; i < graph_blocks - 1; i++) |
| hpc_cz(graph, i, i + 1); |
|
|
| |
| |
| |
| |
| double (*marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6])); |
| int *shor_measured = (int *)calloc(graph_blocks, sizeof(int)); |
|
|
| shor_measure_graph(graph, graph_blocks, marg, shor_measured, 1); |
|
|
| free(shor_measured); |
|
|
| |
| typedef struct { double acc_error; int history_idx; } Q4Beam; |
| typedef struct { int cand_idx; int parent_idx; } Q4BeamHistory; |
|
|
| Q4Beam beams[Q4_N_BEAMS]; |
| int active_beams = 1; |
| Q4BeamHistory *history = (Q4BeamHistory *)malloc(n_blocks * Q4_N_BEAMS * sizeof(Q4BeamHistory)); |
|
|
| for (int b = 0; b < Q4_N_BEAMS; b++) { |
| beams[b].acc_error = 0.0; |
| beams[b].history_idx = -1; |
| } |
|
|
| for (int64_t i = 0; i < graph_blocks; i++) { |
| double m_total = 0.0; |
| for (int v = 0; v < 6; v++) m_total += marg[i][v]; |
|
|
| double cand_score[Q4_N_CAND]; |
| int64_t blk = i * stride; |
| |
| int q4_bin_count[6] = {0}; |
| for (int ci = 0; ci < Q4_N_CAND; ci++) |
| q4_bin_count[Q4_CAND_TO_QUHIT[ci]]++; |
| |
| |
| float blk_mean_err = 0.0f; |
| for (int ci = 0; ci < Q4_N_CAND; ci++) |
| blk_mean_err += cand_errors[blk][ci]; |
| blk_mean_err /= (float)Q4_N_CAND; |
| if (blk_mean_err < 1e-30f) blk_mean_err = 1e-30f; |
| for (int ci = 0; ci < Q4_N_CAND; ci++) { |
| int qi = Q4_CAND_TO_QUHIT[ci]; |
| double p = (m_total > 1e-30) ? marg[i][qi] / m_total : 1.0/6.0; |
| p /= (double)q4_bin_count[qi]; |
| cand_score[ci] = p / (cand_errors[blk][ci] / blk_mean_err + 1e-15); |
| } |
|
|
| typedef struct { double score; int beam_idx; int cand_idx; } Q4Ext; |
| Q4Ext extensions[Q4_N_BEAMS * Q4_N_CAND]; |
| int n_ext = 0; |
| for (int b = 0; b < active_beams; b++) { |
| for (int c = 0; c < Q4_N_CAND; c++) { |
| double ext_err = beams[b].acc_error + cand_errors[blk][c]; |
| extensions[n_ext].score = cand_score[c] / (ext_err + 1e-15); |
| extensions[n_ext].beam_idx = b; |
| extensions[n_ext].cand_idx = c; |
| n_ext++; |
| } |
| } |
|
|
| int top_k = (n_ext < Q4_N_BEAMS) ? n_ext : Q4_N_BEAMS; |
| int top_indices[Q4_N_BEAMS]; |
| for (int k = 0; k < top_k; k++) { |
| int best = -1; double best_s = -1e30; |
| for (int e = 0; e < n_ext; e++) { |
| if (extensions[e].score > best_s) { |
| best_s = extensions[e].score; best = e; |
| } |
| } |
| top_indices[k] = best; |
| extensions[best].score = -2e30; |
| } |
|
|
| Q4Beam new_beams[Q4_N_BEAMS]; |
| for (int k = 0; k < top_k; k++) { |
| int ei = top_indices[k]; |
| int sb = extensions[ei].beam_idx; |
| int cand = extensions[ei].cand_idx; |
|
|
| int hist_idx = i * Q4_N_BEAMS + k; |
| history[hist_idx].cand_idx = cand; |
| history[hist_idx].parent_idx = beams[sb].history_idx; |
|
|
| new_beams[k].history_idx = hist_idx; |
| new_beams[k].acc_error = beams[sb].acc_error + cand_errors[blk][cand]; |
| } |
| for (int k = 0; k < top_k; k++) beams[k] = new_beams[k]; |
| active_beams = top_k; |
| } |
|
|
| int curr_hist = beams[0].history_idx; |
| for (int64_t i = graph_blocks - 1; i >= 0; i--) { |
| int group_cidx; |
| if (curr_hist >= 0) { |
| group_cidx = history[curr_hist].cand_idx; |
| curr_hist = history[curr_hist].parent_idx; |
| } else { |
| group_cidx = 10; |
| } |
|
|
| if (stride <= 1) { |
| best_candidate[i] = group_cidx; |
| } else { |
| |
| |
| |
| int target_bin = Q4_CAND_TO_QUHIT[group_cidx]; |
|
|
| for (int64_t b = i * stride; b < (i+1) * stride && b < n_blocks; b++) { |
| float best_err = 1e30f; |
| int best_c = group_cidx; |
| for (int c = 0; c < Q4_N_CAND; c++) { |
| if (Q4_CAND_TO_QUHIT[c] != target_bin) continue; |
| if (cand_errors[b][c] < best_err) { |
| best_err = cand_errors[b][c]; |
| best_c = c; |
| } |
| } |
| |
| float global_best = 1e30f; |
| int global_best_c = group_cidx; |
| for (int c = 0; c < Q4_N_CAND; c++) { |
| if (cand_errors[b][c] < global_best) { |
| global_best = cand_errors[b][c]; |
| global_best_c = c; |
| } |
| } |
| if (global_best < best_err * 0.95f) |
| best_candidate[b] = global_best_c; |
| else |
| best_candidate[b] = best_c; |
| } |
| } |
| } |
| free(history); |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| { |
| #define Q4_BORN_SHOTS 64 |
|
|
| |
| float beam_total_err = 0.0f; |
| for (int64_t bi = 0; bi < n_blocks; bi++) |
| beam_total_err += cand_errors[bi][best_candidate[bi]]; |
|
|
| |
| unsigned int born_rng = 314159; |
|
|
| |
| float tail_err_q4 = 0.0f; |
| for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++) |
| tail_err_q4 += cand_errors[bi][best_candidate[bi]]; |
|
|
| |
| int *shot_sparse_q4 = (int *)malloc(graph_blocks * sizeof(int)); |
|
|
| for (int shot = 0; shot < Q4_BORN_SHOTS; shot++) { |
| float shot_err = tail_err_q4; |
|
|
| for (int64_t gi = 0; gi < graph_blocks; gi++) { |
| |
| double m_total = 0.0; |
| for (int v = 0; v < 6; v++) m_total += marg[gi][v]; |
|
|
| |
| born_rng = born_rng * 1664525u + 1013904223u; |
| double rnd = (double)(born_rng >> 8) / 16777216.0; |
| double target = rnd * m_total; |
| double cum = 0.0; |
| int sampled_qi = 5; |
| for (int v = 0; v < 6; v++) { |
| cum += marg[gi][v]; |
| if (cum > target) { sampled_qi = v; break; } |
| } |
|
|
| |
| int64_t blk = gi * stride; |
| float best_bin_err = 1e30f; |
| int best_bin_cand = 10; |
| for (int ci = 0; ci < Q4_N_CAND; ci++) { |
| if (Q4_CAND_TO_QUHIT[ci] == sampled_qi) { |
| if (cand_errors[blk][ci] < best_bin_err) { |
| best_bin_err = cand_errors[blk][ci]; |
| best_bin_cand = ci; |
| } |
| } |
| } |
|
|
| shot_sparse_q4[gi] = best_bin_cand; |
| shot_err += cand_errors[blk][best_bin_cand]; |
| } |
|
|
| |
| if (shot_err < beam_total_err) { |
| for (int64_t gi = 0; gi < graph_blocks; gi++) |
| best_candidate[gi * stride] = shot_sparse_q4[gi]; |
| beam_total_err = shot_err; |
| } |
| } |
|
|
| free(shot_sparse_q4); |
| } |
|
|
| free(marg); |
| hpc_destroy(graph); |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err) |
| for (int64_t blk = 0; blk < n_blocks; blk++) { |
| const float *bw = weights + blk * QK4_0; |
| int cidx = best_candidate[blk]; |
|
|
| |
| float d_current = gguf_fp16_to_fp32(cand_d16[blk][cidx]); |
|
|
| |
| |
| for (int ls_iter = 0; ls_iter < 5; ls_iter++) { |
| if (d_current < 1e-15f) break; |
| float id = 1.0f / d_current; |
|
|
| |
| int qs_tmp[QK4_0]; |
| for (int j = 0; j < QK4_0; j++) { |
| int q = (int)(bw[j] * id + 8.5f); |
| if (q < 0) q = 0; if (q > 15) q = 15; |
| qs_tmp[j] = q; |
| } |
|
|
| |
| |
| float num = 0.0f, den = 0.0f; |
| for (int j = 0; j < QK4_0; j++) { |
| float q_centered = (float)qs_tmp[j] - 8.0f; |
| float w = (imat_importance) ? |
| imat_importance[blk * QK4_0 + j] : 1.0f; |
| num += w * bw[j] * q_centered; |
| den += w * q_centered * q_centered; |
| } |
|
|
| if (den > 1e-15f) { |
| float d_new = num / den; |
| |
| float d_seed = gguf_fp16_to_fp32(cand_d16[blk][cidx]); |
| if (fabsf(d_new) < 4.0f * (fabsf(d_seed) + 1e-10f)) { |
| uint16_t d16 = gguf_fp32_to_fp16(d_new); |
| d_current = gguf_fp16_to_fp32(d16); |
| } |
| } |
| } |
|
|
| |
| |
| |
| |
| { |
| uint16_t base_d16 = gguf_fp32_to_fp16(d_current); |
| uint16_t best_d16 = base_d16; |
| float best_ulp_err = 1e30f; |
|
|
| |
| uint16_t ulp_candidates[17]; |
| int n_ulp = 0; |
| for (int delta = -4; delta <= 4; delta++) { |
| int cand16 = (int)base_d16 + delta; |
| if (cand16 >= 0 && cand16 <= 0x7BFF) |
| ulp_candidates[n_ulp++] = (uint16_t)cand16; |
| } |
| |
| { |
| float neg_d = -d_current; |
| uint16_t neg_d16 = gguf_fp32_to_fp16(neg_d); |
| ulp_candidates[n_ulp++] = neg_d16; |
| } |
|
|
| for (int ui = 0; ui < n_ulp; ui++) { |
| float trial_d = gguf_fp16_to_fp32(ulp_candidates[ui]); |
| float trial_id = (fabsf(trial_d) > 1e-15f) ? 1.0f / trial_d : 0.0f; |
| float err = 0.0f; |
| for (int j = 0; j < QK4_0; j++) { |
| int q = (int)(bw[j] * trial_id + 8.5f); |
| if (q < 0) q = 0; if (q > 15) q = 15; |
| float deq = ((float)q - 8.0f) * trial_d; |
| float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f; |
| err += (bw[j] - deq) * (bw[j] - deq) * w; |
| } |
| if (err < best_ulp_err) { |
| best_ulp_err = err; |
| best_d16 = ulp_candidates[ui]; |
| } |
| } |
| d_current = gguf_fp16_to_fp32(best_d16); |
| } |
|
|
| |
| output[blk].d = gguf_fp32_to_fp16(d_current); |
| float actual_d = d_current; |
| float id = (fabsf(actual_d) > 1e-15f) ? 1.0f / actual_d : 0.0f; |
|
|
| |
| |
| |
| |
| |
|
|
| |
| int q_base[QK4_0], q_shaped[QK4_0]; |
| float q_cont[QK4_0]; |
| for (int j = 0; j < QK4_0; j++) { |
| q_cont[j] = bw[j] * id + 8.0f; |
| q_base[j] = (int)(q_cont[j] + 0.5f); |
| if (q_base[j] < 0) q_base[j] = 0; |
| if (q_base[j] > 15) q_base[j] = 15; |
| } |
| memcpy(q_shaped, q_base, QK4_0 * sizeof(int)); |
|
|
| |
| for (int g = 0; g < 5; g++) { |
| int g_off = g * 6; |
|
|
| for (int pass = 0; pass < 6; pass++) { |
| int best_k = -1; |
| int best_q_alt = 0; |
| float best_delta = 0.0f; |
|
|
| |
| float e_cur[6]; |
| for (int kk = 0; kk < 6; kk++) { |
| float deq = ((float)q_shaped[g_off+kk] - 8.0f) * actual_d; |
| e_cur[kk] = bw[g_off+kk] - deq; |
| } |
|
|
| |
| float vesica_cur = 0.0f, dc_cur = 0.0f; |
| for (int p = 0; p < 3; p++) { |
| float v = e_cur[p] + e_cur[p+3]; |
| vesica_cur += v * v; |
| } |
| for (int kk = 0; kk < 6; kk++) dc_cur += e_cur[kk]; |
| float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur; |
|
|
| |
| for (int k = 0; k < 6; k++) { |
| int idx = g_off + k; |
| int q_cur = q_shaped[idx]; |
|
|
| int q_try; |
| if (q_cont[idx] - (float)q_cur >= 0) { |
| q_try = q_cur + 1; |
| } else { |
| q_try = q_cur - 1; |
| } |
| if (q_try < 0 || q_try > 15) continue; |
|
|
| |
| float e_alt[6]; |
| for (int kk = 0; kk < 6; kk++) e_alt[kk] = e_cur[kk]; |
| float deq_try = ((float)q_try - 8.0f) * actual_d; |
| e_alt[k] = bw[idx] - deq_try; |
|
|
| |
| float vesica_alt = 0.0f, dc_alt = 0.0f; |
| for (int p = 0; p < 3; p++) { |
| float v = e_alt[p] + e_alt[p+3]; |
| vesica_alt += v * v; |
| } |
| for (int kk = 0; kk < 6; kk++) dc_alt += e_alt[kk]; |
| float metric_alt = 4.0f * vesica_alt + dc_alt * dc_alt; |
|
|
| float delta = metric_cur - metric_alt; |
| if (delta > best_delta) { |
| best_delta = delta; |
| best_k = k; |
| best_q_alt = q_try; |
| } |
| } |
|
|
| if (best_k < 0) break; |
| q_shaped[g_off + best_k] = best_q_alt; |
| } |
| } |
|
|
| |
| float err_base = 0.0f, err_shaped = 0.0f; |
| for (int j = 0; j < QK4_0; j++) { |
| float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f; |
| float deq_b = ((float)q_base[j] - 8.0f) * actual_d; |
| float deq_s = ((float)q_shaped[j] - 8.0f) * actual_d; |
| err_base += (bw[j] - deq_b) * (bw[j] - deq_b) * w; |
| err_shaped += (bw[j] - deq_s) * (bw[j] - deq_s) * w; |
| } |
| int *q_final = (err_shaped <= err_base * 1.05f) ? q_shaped : q_base; |
|
|
| |
| for (int j = 0; j < QK4_0 / 2; j++) { |
| int q0 = q_final[j]; |
| int q1 = q_final[j + QK4_0/2]; |
| output[blk].qs[j] = (uint8_t)(q0 | (q1 << 4)); |
|
|
| float deq0 = ((float)q0 - 8.0f) * actual_d; |
| float deq1 = ((float)q1 - 8.0f) * actual_d; |
| total_err += (bw[j] - deq0) * (bw[j] - deq0) + (bw[j + QK4_0/2] - deq1) * (bw[j + QK4_0/2] - deq1); |
| } |
| } |
|
|
| *out_total_error = total_err; |
| free(greedy_d); |
| free(cand_errors); |
| free(cand_d16); |
| free(best_candidate); |
| } |
|
|
| static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, |
| BlockQ2K *output, float *out_total_error, |
| OptimizerMode opt_mode, |
| const float *imat_importance, |
| int verbose) |
| { |
| int64_t n_blocks = n_elements / QK_K; |
| float total_err = 0.0f; |
| const int N_SUB = QK_K / 16; |
|
|
| init_scale_table(); |
|
|
| |
| |
| |
|
|
| |
| typedef struct { |
| float dm, mm; |
| uint16_t d_fp16, dmin_fp16; |
| uint8_t Ls[16], Lm[16]; |
| float scales[16], mins[16], sw[16]; |
| } BlockSeed; |
|
|
| BlockSeed *seeds = (BlockSeed *)calloc(n_blocks, sizeof(BlockSeed)); |
|
|
| #pragma omp parallel for schedule(dynamic, 64) |
| for (int64_t blk = 0; blk < n_blocks; blk++) { |
| const float *block_x = weights + blk * QK_K; |
| uint8_t L[QK_K], Laux[16]; |
| float wt[16]; |
|
|
| float sumx2 = 0; |
| for (int i = 0; i < QK_K; i++) sumx2 += block_x[i] * block_x[i]; |
| float sigma2 = sumx2 / (float)QK_K; |
|
|
| for (int j = 0; j < N_SUB; j++) { |
| const float *sx = block_x + 16 * j; |
| seeds[blk].sw[j] = 0; |
| for (int l = 0; l < 16; l++) { |
| float imp = (imat_importance) ? imat_importance[blk * QK_K + 16 * j + l] : 1.0f; |
| wt[l] = imp * sqrtf(sigma2 + sx[l] * sx[l]); |
| seeds[blk].sw[j] += wt[l]; |
| } |
| seeds[blk].scales[j] = hpc_make_qkx2_quants(16, 3, sx, wt, |
| L + 16 * j, &seeds[blk].mins[j], Laux); |
| } |
|
|
| seeds[blk].dm = hpc_make_qp_quants(N_SUB, 15, seeds[blk].scales, |
| seeds[blk].Ls, seeds[blk].sw); |
| seeds[blk].mm = hpc_make_qp_quants(N_SUB, 15, seeds[blk].mins, |
| seeds[blk].Lm, seeds[blk].sw); |
| seeds[blk].d_fp16 = gguf_fp32_to_fp16(seeds[blk].dm); |
| seeds[blk].dmin_fp16 = gguf_fp32_to_fp16(seeds[blk].mm); |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| static const float NEIGHBOR_MULTS_D[N_CAND_D] = { |
| 0.800f, 0.850f, 0.890f, 0.920f, 0.945f, 0.965f, 0.980f, 0.990f, |
| 1.010f, 1.020f, 1.035f, 1.055f, 1.080f, 1.110f, 1.150f, 1.200f |
| }; |
| static const float NEIGHBOR_MULTS_M[N_CAND_M] = { |
| 0.800f, 0.850f, 0.890f, 0.920f, 0.945f, 0.965f, 0.980f, 0.990f, |
| 1.010f, 1.020f, 1.035f, 1.055f, 1.080f, 1.110f, 1.150f, 1.200f |
| }; |
| |
| static const int CAND_TO_QUHIT[16] = { |
| 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5 |
| }; |
|
|
| |
| float (*candidate_errors)[TOTAL_SCALE_CANDIDATES] = NULL; |
| uint16_t (*candidate_d)[TOTAL_SCALE_CANDIDATES] = NULL; |
| uint16_t (*candidate_dmin)[TOTAL_SCALE_CANDIDATES] = NULL; |
| |
| uint8_t (*candidate_Ls)[TOTAL_SCALE_CANDIDATES][16] = NULL; |
| uint8_t (*candidate_Lm)[TOTAL_SCALE_CANDIDATES][16] = NULL; |
|
|
| candidate_errors = (float (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks, |
| sizeof(float[TOTAL_SCALE_CANDIDATES])); |
| candidate_d = (uint16_t (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks, |
| sizeof(uint16_t[TOTAL_SCALE_CANDIDATES])); |
| candidate_dmin = (uint16_t (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks, |
| sizeof(uint16_t[TOTAL_SCALE_CANDIDATES])); |
| candidate_Ls = (uint8_t (*)[TOTAL_SCALE_CANDIDATES][16])calloc(n_blocks, |
| sizeof(uint8_t[TOTAL_SCALE_CANDIDATES][16])); |
| candidate_Lm = (uint8_t (*)[TOTAL_SCALE_CANDIDATES][16])calloc(n_blocks, |
| sizeof(uint8_t[TOTAL_SCALE_CANDIDATES][16])); |
|
|
| #pragma omp parallel for schedule(dynamic, 16) |
| for (int64_t blk = 0; blk < n_blocks; blk++) { |
| const float *block_x = weights + blk * QK_K; |
|
|
| |
| |
| |
| |
| float wls_dm = seeds[blk].dm; |
| float wls_mm = seeds[blk].mm; |
| uint8_t wls_Ls[16], wls_Lm[16]; |
| memcpy(wls_Ls, seeds[blk].Ls, 16); |
| memcpy(wls_Lm, seeds[blk].Lm, 16); |
|
|
| for (int ls_iter = 0; ls_iter < 5; ls_iter++) { |
| |
| uint8_t L_wls[QK_K]; |
| for (int j = 0; j < N_SUB; j++) { |
| float d_sub = wls_dm * (float)wls_Ls[j]; |
| float m_sub = wls_mm * (float)wls_Lm[j]; |
| if (d_sub < 1e-15f) { |
| for (int k = 0; k < 16; k++) L_wls[16*j+k] = 0; |
| continue; |
| } |
| for (int k = 0; k < 16; k++) { |
| int q = gguf_nearest_int((block_x[16*j+k] + m_sub) / d_sub); |
| if (q < 0) q = 0; if (q > 3) q = 3; |
| L_wls[16*j+k] = (uint8_t)q; |
| } |
| } |
|
|
| |
| double Saa = 0, Sab = 0, Sbb = 0, Sxa = 0, Sxb = 0; |
| for (int j = 0; j < N_SUB; j++) { |
| float ls_f = (float)wls_Ls[j]; |
| float lm_f = (float)wls_Lm[j]; |
| for (int k = 0; k < 16; k++) { |
| float x = block_x[16*j+k]; |
| float w = (imat_importance) ? |
| imat_importance[blk * QK_K + 16*j+k] : 1.0f; |
| float a = ls_f * (float)L_wls[16*j+k]; |
| float b = lm_f; |
| Saa += w * a * a; |
| Sab += w * a * b; |
| Sbb += w * b * b; |
| Sxa += w * x * a; |
| Sxb += w * x * b; |
| } |
| } |
|
|
| |
| double det = Saa * Sbb - Sab * Sab; |
| if (fabs(det) > 1e-30) { |
| double d_new = (Sbb * Sxa - Sab * Sxb) / det; |
| double dm_new = (Sab * Sxa - Saa * Sxb) / det; |
| |
| if (d_new > 0.0 && d_new < 4.0 * (seeds[blk].dm + 1e-10)) |
| wls_dm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_new)); |
| if (dm_new > 0.0 && dm_new < 4.0 * (seeds[blk].mm + 1e-10)) |
| wls_mm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)dm_new)); |
| } |
|
|
| |
| for (int j = 0; j < N_SUB; j++) { |
| if (wls_dm > 1e-15f) { |
| int ls = gguf_nearest_int(seeds[blk].scales[j] / wls_dm); |
| if (ls < 0) ls = 0; if (ls > 15) ls = 15; |
| wls_Ls[j] = (uint8_t)ls; |
| } else { wls_Ls[j] = 0; } |
| if (wls_mm > 1e-15f) { |
| int lm = gguf_nearest_int(seeds[blk].mins[j] / wls_mm); |
| if (lm < 0) lm = 0; if (lm > 15) lm = 15; |
| wls_Lm[j] = (uint8_t)lm; |
| } else { wls_Lm[j] = 0; } |
| } |
| } |
|
|
| |
| |
| |
| for (int di = 0; di < N_CAND_D; di++) { |
| float trial_dm = wls_dm * NEIGHBOR_MULTS_D[di]; |
| uint16_t trial_d16 = gguf_fp32_to_fp16(trial_dm); |
| float actual_dm = gguf_fp16_to_fp32(trial_d16); |
|
|
| for (int mi = 0; mi < N_CAND_M; mi++) { |
| int cidx = di * N_CAND_M + mi; |
| float trial_mm = wls_mm * NEIGHBOR_MULTS_M[mi]; |
| uint16_t trial_dmin16 = gguf_fp32_to_fp16(trial_mm); |
| float actual_mm = gguf_fp16_to_fp32(trial_dmin16); |
|
|
| candidate_d[blk][cidx] = trial_d16; |
| candidate_dmin[blk][cidx] = trial_dmin16; |
|
|
| |
| uint8_t trial_Ls[16], trial_Lm[16]; |
| for (int j = 0; j < N_SUB; j++) { |
| if (actual_dm > 1e-15f) { |
| int ls = gguf_nearest_int(seeds[blk].scales[j] / actual_dm); |
| if (ls < 0) ls = 0; if (ls > 15) ls = 15; |
| trial_Ls[j] = (uint8_t)ls; |
| } else { |
| trial_Ls[j] = 0; |
| } |
| if (actual_mm > 1e-15f) { |
| int lm = gguf_nearest_int(seeds[blk].mins[j] / actual_mm); |
| if (lm < 0) lm = 0; if (lm > 15) lm = 15; |
| trial_Lm[j] = (uint8_t)lm; |
| } else { |
| trial_Lm[j] = 0; |
| } |
| } |
| memcpy(candidate_Ls[blk][cidx], trial_Ls, 16); |
| memcpy(candidate_Lm[blk][cidx], trial_Lm, 16); |
|
|
| |
| float err = 0.0f; |
| for (int j = 0; j < N_SUB; j++) { |
| float d = actual_dm * (float)trial_Ls[j]; |
| float m = actual_mm * (float)trial_Lm[j]; |
| if (d < 1e-15f) { |
| for (int k = 0; k < 16; k++) { |
| float x = block_x[16 * j + k]; |
| float w = (imat_importance) ? |
| imat_importance[blk * QK_K + 16 * j + k] : 1.0f; |
| err += x * x * w; |
| } |
| continue; |
| } |
| for (int k = 0; k < 16; k += 6) { |
| int g_len = (k + 6 <= 16) ? 6 : (16 - k); |
| int half_g = g_len / 2; |
| float e_cur[6], w_cur[6]; |
| |
| for (int kk = 0; kk < g_len; kk++) { |
| int idx = 16 * j + k + kk; |
| float x = block_x[idx]; |
| int q = gguf_nearest_int((x + m) / d); |
| if (q < 0) q = 0; if (q > 3) q = 3; |
| float deq = d * (float)q - m; |
| e_cur[kk] = x - deq; |
| w_cur[kk] = (imat_importance) ? imat_importance[blk * QK_K + idx] : 1.0f; |
| } |
| |
| |
| float vesica_err = 0.0f, wave_err = 0.0f; |
| for (int p = 0; p < half_g; p++) { |
| float v = e_cur[p] + e_cur[p + half_g]; |
| float w_wave = e_cur[p] - e_cur[p + half_g]; |
| float w_avg = (w_cur[p] + w_cur[p + half_g]) * 0.5f; |
| vesica_err += v * v * w_avg; |
| wave_err += w_wave * w_wave * w_avg; |
| } |
| |
| err += 0.5f * (4.0f * vesica_err + 1.0f * wave_err); |
| } |
| } |
| candidate_errors[blk][cidx] = err; |
| } |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| int *best_candidate = (int *)malloc(n_blocks * sizeof(int)); |
| for (int64_t i = 0; i < n_blocks; i++) |
| best_candidate[i] = 10 * N_CAND_M + 10; |
|
|
| if (opt_mode != OPT_MSE && n_blocks >= 2) { |
| int64_t graph_blocks = (n_blocks > 2000) ? 2000 : n_blocks; |
| int64_t stride = n_blocks / graph_blocks; |
| float temperature = 0.5f; |
| int64_t n_sites = graph_blocks * QUHITS_PER_BLOCK; |
|
|
| HPCGraph *graph = hpc_create(n_sites); |
| if (graph) { |
| for (int64_t i = 0; i < n_sites; i++) |
| triality_dft(&graph->locals[i]); |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| { |
| double err_accum = 0.0; |
| int err_count = 0; |
| for (int64_t gi = 0; gi < graph_blocks && gi < 100; gi++) { |
| int64_t blk = gi * stride; |
| float max_e = 0.0f; |
| for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) |
| if (candidate_errors[blk][c] > max_e) |
| max_e = candidate_errors[blk][c]; |
| err_accum += (double)max_e; |
| err_count++; |
| } |
| if (err_count > 0) { |
| float median_err = (float)(err_accum / err_count); |
| |
| |
| temperature = median_err * 0.1f; |
| if (temperature < 1e-10f) temperature = 1e-10f; |
| } |
| } |
|
|
| for (int64_t i = 0; i < graph_blocks; i++) { |
| |
| float agg_errors[TOTAL_SCALE_CANDIDATES]; |
| for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) |
| agg_errors[c] = 0.0f; |
|
|
| int64_t blk_start = i * stride; |
| int64_t blk_end = blk_start + stride; |
| if (blk_end > n_blocks) blk_end = n_blocks; |
| int64_t group_size = blk_end - blk_start; |
|
|
| for (int64_t b = blk_start; b < blk_end; b++) { |
| for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) |
| agg_errors[c] += candidate_errors[b][c]; |
| } |
| |
| if (group_size > 1) { |
| float inv_gs = 1.0f / (float)group_size; |
| for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) |
| agg_errors[c] *= inv_gs; |
| } |
|
|
| float min_err = 1e30f; |
| for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) |
| if (agg_errors[c] < min_err) |
| min_err = agg_errors[c]; |
|
|
| |
| double coarse_re[6]; |
| double coarse_norm = 0.0; |
| for (int qi = 0; qi < 6; qi++) coarse_re[qi] = 0.0; |
| for (int di = 0; di < N_CAND_D; di++) { |
| int qi = CAND_TO_QUHIT[di]; |
| for (int mi = 0; mi < N_CAND_M; mi++) { |
| int cidx = di * N_CAND_M + mi; |
| coarse_re[qi] += exp(-(double)(agg_errors[cidx] - min_err) / |
| (2.0 * (double)temperature)); |
| } |
| } |
| for (int qi = 0; qi < 6; qi++) |
| coarse_norm += coarse_re[qi] * coarse_re[qi]; |
| if (coarse_norm > 1e-30) { |
| double inv = 1.0 / sqrt(coarse_norm); |
| for (int v = 0; v < 6; v++) coarse_re[v] *= inv; |
| } |
|
|
| |
| double fine_re[6]; |
| double fine_norm = 0.0; |
| for (int qi = 0; qi < 6; qi++) fine_re[qi] = 0.0; |
| for (int mi = 0; mi < N_CAND_M; mi++) { |
| int qi = CAND_TO_QUHIT[mi]; |
| for (int di = 0; di < N_CAND_D; di++) { |
| int cidx = di * N_CAND_M + mi; |
| fine_re[qi] += exp(-(double)(agg_errors[cidx] - min_err) / |
| (2.0 * (double)temperature)); |
| } |
| } |
| for (int qi = 0; qi < 6; qi++) |
| fine_norm += fine_re[qi] * fine_re[qi]; |
| if (fine_norm > 1e-30) { |
| double inv = 1.0 / sqrt(fine_norm); |
| for (int v = 0; v < 6; v++) fine_re[v] *= inv; |
| } |
|
|
| |
| int64_t s0 = 2 * i, s1 = 2 * i + 1; |
| for (int v = 0; v < 6; v++) { |
| graph->locals[s0].edge_re[v] = coarse_re[v]; |
| graph->locals[s0].edge_im[v] = 0.0; |
| graph->locals[s1].edge_re[v] = fine_re[v]; |
| graph->locals[s1].edge_im[v] = 0.0; |
| } |
| graph->locals[s0].primary = VIEW_EDGE; |
| graph->locals[s0].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED; |
| graph->locals[s0].delta_valid = 0; |
| triality_update_mask(&graph->locals[s0]); |
| graph->locals[s1].primary = VIEW_EDGE; |
| graph->locals[s1].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED; |
| graph->locals[s1].delta_valid = 0; |
| triality_update_mask(&graph->locals[s1]); |
| } |
|
|
| |
| for (int64_t i = 0; i < graph_blocks; i++) { |
| hpc_cz(graph, 2 * i, 2 * i + 1); |
| if (i + 1 < graph_blocks) { |
| hpc_cz(graph, 2 * i, 2 * (i + 1)); |
| hpc_cz(graph, 2 * i + 1, 2 * (i + 1) + 1); |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| double (*shor_marg)[6] = (double (*)[6])calloc(n_sites, sizeof(double[6])); |
| int *shor_measured = (int *)calloc(n_sites, sizeof(int)); |
|
|
| shor_measure_graph(graph, n_sites, shor_marg, shor_measured, 1); |
|
|
| |
| double (*coarse_marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6])); |
| double (*fine_marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6])); |
|
|
| for (int64_t i = 0; i < graph_blocks; i++) { |
| for (int v = 0; v < 6; v++) { |
| coarse_marg[i][v] = shor_marg[2 * i][v]; |
| fine_marg[i][v] = shor_marg[2 * i + 1][v]; |
| } |
| } |
|
|
| free(shor_marg); |
| free(shor_measured); |
|
|
| |
| |
| |
| |
| |
|
|
| #define N_BEAMS 24 |
|
|
| typedef struct { |
| double acc_error; |
| int history_idx; |
| } QuantBeam; |
|
|
| typedef struct { |
| int cand_idx; |
| int parent_idx; |
| } BeamHistory; |
|
|
| QuantBeam beams[N_BEAMS]; |
| int active_beams = 1; |
|
|
| |
| BeamHistory *history = (BeamHistory *)malloc(n_blocks * N_BEAMS * sizeof(BeamHistory)); |
|
|
| for (int b = 0; b < N_BEAMS; b++) { |
| beams[b].acc_error = 0.0; |
| beams[b].history_idx = -1; |
| } |
|
|
| |
| for (int64_t i = 0; i < graph_blocks; i++) { |
| double c_total = 0.0, f_total = 0.0; |
| for (int v = 0; v < 6; v++) { |
| c_total += coarse_marg[i][v]; |
| f_total += fine_marg[i][v]; |
| } |
|
|
| |
| double cand_score[TOTAL_SCALE_CANDIDATES]; |
| int64_t blk = i * stride; |
| int d_bin_count[6] = {0}, m_bin_count[6] = {0}; |
| for (int k = 0; k < N_CAND_D; k++) d_bin_count[CAND_TO_QUHIT[k]]++; |
| for (int k = 0; k < N_CAND_M; k++) m_bin_count[CAND_TO_QUHIT[k]]++; |
| |
| |
| float blk_mean_err = 0.0f; |
| for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) |
| blk_mean_err += candidate_errors[blk][c]; |
| blk_mean_err /= (float)TOTAL_SCALE_CANDIDATES; |
| if (blk_mean_err < 1e-30f) blk_mean_err = 1e-30f; |
| for (int di = 0; di < N_CAND_D; di++) { |
| int qi_d = CAND_TO_QUHIT[di]; |
| double p_d = (c_total > 1e-30) ? coarse_marg[i][qi_d] / c_total : 1.0/6.0; |
| p_d /= (double)d_bin_count[qi_d]; |
| for (int mi = 0; mi < N_CAND_M; mi++) { |
| int qi_m = CAND_TO_QUHIT[mi]; |
| double p_m = (f_total > 1e-30) ? fine_marg[i][qi_m] / f_total : 1.0/6.0; |
| p_m /= (double)m_bin_count[qi_m]; |
| int cidx = di * N_CAND_M + mi; |
| cand_score[cidx] = p_d * p_m / (candidate_errors[blk][cidx] / blk_mean_err + 1e-15); |
| } |
| } |
|
|
| |
| typedef struct { double score; int beam_idx; int cand_idx; } BeamExt; |
| BeamExt extensions[N_BEAMS * TOTAL_SCALE_CANDIDATES]; |
| int n_ext = 0; |
|
|
| for (int b = 0; b < active_beams; b++) { |
| for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) { |
| |
| double ext_err = beams[b].acc_error + candidate_errors[blk][c]; |
| double ext_score = cand_score[c] / (ext_err + 1e-15); |
| extensions[n_ext].score = ext_score; |
| extensions[n_ext].beam_idx = b; |
| extensions[n_ext].cand_idx = c; |
| n_ext++; |
| } |
| } |
|
|
| |
| int top_k = (n_ext < N_BEAMS) ? n_ext : N_BEAMS; |
| int top_indices[N_BEAMS]; |
| for (int k = 0; k < top_k; k++) { |
| int best = -1; |
| double best_s = -1e30; |
| for (int e = 0; e < n_ext; e++) { |
| if (extensions[e].score > best_s) { |
| best_s = extensions[e].score; |
| best = e; |
| } |
| } |
| top_indices[k] = best; |
| extensions[best].score = -2e30; |
| } |
|
|
| |
| QuantBeam new_beams[N_BEAMS]; |
| for (int k = 0; k < top_k; k++) { |
| int ext_idx = top_indices[k]; |
| int src_beam = extensions[ext_idx].beam_idx; |
| int cand = extensions[ext_idx].cand_idx; |
|
|
| int hist_idx = i * N_BEAMS + k; |
| history[hist_idx].cand_idx = cand; |
| history[hist_idx].parent_idx = beams[src_beam].history_idx; |
|
|
| new_beams[k].history_idx = hist_idx; |
| new_beams[k].acc_error = beams[src_beam].acc_error |
| + candidate_errors[blk][cand]; |
| } |
|
|
| for (int k = 0; k < top_k; k++) |
| beams[k] = new_beams[k]; |
| active_beams = top_k; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| int curr_hist = beams[0].history_idx; |
| for (int64_t i = graph_blocks - 1; i >= 0; i--) { |
| int group_cidx; |
| if (curr_hist >= 0) { |
| group_cidx = history[curr_hist].cand_idx; |
| curr_hist = history[curr_hist].parent_idx; |
| } else { |
| group_cidx = 10 * N_CAND_M + 10; |
| } |
|
|
| if (stride <= 1) { |
| |
| best_candidate[i] = group_cidx; |
| } else { |
| |
| |
| |
| |
| |
| int group_di = group_cidx / N_CAND_M; |
| int group_mi = group_cidx % N_CAND_M; |
| int target_d_bin = CAND_TO_QUHIT[group_di]; |
| int target_m_bin = CAND_TO_QUHIT[group_mi]; |
|
|
| for (int64_t b = i * stride; b < (i+1) * stride && b < n_blocks; b++) { |
| |
| float best_err = 1e30f; |
| int best_c = group_cidx; |
|
|
| for (int di = 0; di < N_CAND_D; di++) { |
| if (CAND_TO_QUHIT[di] != target_d_bin) continue; |
| for (int mi = 0; mi < N_CAND_M; mi++) { |
| if (CAND_TO_QUHIT[mi] != target_m_bin) continue; |
| int cidx = di * N_CAND_M + mi; |
| if (candidate_errors[b][cidx] < best_err) { |
| best_err = candidate_errors[b][cidx]; |
| best_c = cidx; |
| } |
| } |
| } |
|
|
| |
| |
| float global_best = 1e30f; |
| int global_best_c = group_cidx; |
| for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) { |
| if (candidate_errors[b][c] < global_best) { |
| global_best = candidate_errors[b][c]; |
| global_best_c = c; |
| } |
| } |
|
|
| |
| |
| |
| if (global_best < best_err * 0.95f) |
| best_candidate[b] = global_best_c; |
| else |
| best_candidate[b] = best_c; |
| } |
| } |
| } |
|
|
| free(history); |
|
|
| |
| |
| |
| |
| |
| |
| |
| { |
| #define Q2K_BORN_SHOTS 64 |
|
|
| float beam_total_err = 0.0f; |
| for (int64_t bi = 0; bi < n_blocks; bi++) |
| beam_total_err += candidate_errors[bi][best_candidate[bi]]; |
|
|
| unsigned int born_rng_q2 = 271828; |
| |
| float tail_err = 0.0f; |
| for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++) |
| tail_err += candidate_errors[bi][best_candidate[bi]]; |
|
|
| |
| int *shot_sparse = (int *)malloc(graph_blocks * sizeof(int)); |
|
|
| for (int shot = 0; shot < Q2K_BORN_SHOTS; shot++) { |
| float shot_err = tail_err; |
|
|
| for (int64_t gi = 0; gi < graph_blocks; gi++) { |
| |
| double c_total = 0.0; |
| for (int v = 0; v < 6; v++) c_total += coarse_marg[gi][v]; |
| born_rng_q2 = born_rng_q2 * 1664525u + 1013904223u; |
| double rnd_c = (double)(born_rng_q2 >> 8) / 16777216.0; |
| double target_c = rnd_c * c_total; |
| double cum_c = 0.0; |
| int qi_d = 5; |
| for (int v = 0; v < 6; v++) { |
| cum_c += coarse_marg[gi][v]; |
| if (cum_c > target_c) { qi_d = v; break; } |
| } |
|
|
| |
| double f_total = 0.0; |
| for (int v = 0; v < 6; v++) f_total += fine_marg[gi][v]; |
| born_rng_q2 = born_rng_q2 * 1664525u + 1013904223u; |
| double rnd_f = (double)(born_rng_q2 >> 8) / 16777216.0; |
| double target_f = rnd_f * f_total; |
| double cum_f = 0.0; |
| int qi_m = 5; |
| for (int v = 0; v < 6; v++) { |
| cum_f += fine_marg[gi][v]; |
| if (cum_f > target_f) { qi_m = v; break; } |
| } |
|
|
| |
| int64_t blk = gi * stride; |
| float best_bin_err = 1e30f; |
| int best_bin_cand = 10 * N_CAND_M + 10; |
| for (int di = 0; di < N_CAND_D; di++) { |
| if (CAND_TO_QUHIT[di] != qi_d) continue; |
| for (int mi = 0; mi < N_CAND_M; mi++) { |
| if (CAND_TO_QUHIT[mi] != qi_m) continue; |
| int cidx = di * N_CAND_M + mi; |
| if (candidate_errors[blk][cidx] < best_bin_err) { |
| best_bin_err = candidate_errors[blk][cidx]; |
| best_bin_cand = cidx; |
| } |
| } |
| } |
|
|
| shot_sparse[gi] = best_bin_cand; |
| shot_err += candidate_errors[blk][best_bin_cand]; |
| } |
|
|
| if (shot_err < beam_total_err) { |
| |
| for (int64_t gi = 0; gi < graph_blocks; gi++) |
| best_candidate[gi * stride] = shot_sparse[gi]; |
| beam_total_err = shot_err; |
| } |
| } |
|
|
| free(shot_sparse); |
| } |
|
|
| free(coarse_marg); |
| free(fine_marg); |
| hpc_destroy(graph); |
| } |
| } else { |
| |
| for (int64_t blk = 0; blk < n_blocks; blk++) { |
| float best_err = candidate_errors[blk][0]; |
| int best_idx = 0; |
| for (int c = 1; c < TOTAL_SCALE_CANDIDATES; c++) { |
| if (candidate_errors[blk][c] < best_err) { |
| best_err = candidate_errors[blk][c]; |
| best_idx = c; |
| } |
| } |
| best_candidate[blk] = best_idx; |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| int _n_omp_threads = 1; |
| #ifdef _OPENMP |
| _n_omp_threads = omp_get_max_threads(); |
| #endif |
| HPCGraph **_tl_graphs = (HPCGraph **)calloc(_n_omp_threads, sizeof(HPCGraph *)); |
| for (int _ti = 0; _ti < _n_omp_threads; _ti++) |
| _tl_graphs[_ti] = hpc_create(N_SUB); |
|
|
| #pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err) |
| for (int64_t blk = 0; blk < n_blocks; blk++) { |
| const float *block_x = weights + blk * QK_K; |
| int cidx = best_candidate[blk]; |
| uint8_t Ls_blk[16], Lm_blk[16]; |
|
|
| |
| memcpy(Ls_blk, candidate_Ls[blk][cidx], 16); |
| memcpy(Lm_blk, candidate_Lm[blk][cidx], 16); |
|
|
| float dm = gguf_fp16_to_fp32(candidate_d[blk][cidx]); |
| float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]); |
|
|
| |
| |
| |
| |
| |
| |
| |
| for (int ls_iter = 0; ls_iter < 3; ls_iter++) { |
|
|
| |
| |
| |
| |
| uint8_t state_ls[N_SUB][6]; |
| uint8_t state_lm[N_SUB][6]; |
| float state_err[N_SUB][6]; |
|
|
| for (int j = 0; j < N_SUB; j++) { |
| const float *sx = block_x + 16 * j; |
| for (int v = 0; v < 6; v++) state_err[j][v] = 1e30f; |
|
|
| for (int try_ls = 0; try_ls <= 15; try_ls++) { |
| float d_sub = dm * (float)try_ls; |
| for (int try_lm = 0; try_lm <= 15; try_lm++) { |
| float m_sub = mm * (float)try_lm; |
| float sub_err = 0.0f; |
|
|
| for (int k = 0; k < 16; k++) { |
| float x = sx[k]; |
| float w = (imat_importance) ? |
| imat_importance[blk * QK_K + 16*j + k] : 1.0f; |
| int q = 0; |
| if (d_sub >= 1e-15f) { |
| q = gguf_nearest_int((x + m_sub) / d_sub); |
| if (q < 0) q = 0; if (q > 3) q = 3; |
| } |
| float deq = d_sub * (float)q - m_sub; |
| float diff = x - deq; |
| sub_err += diff * diff * w; |
| } |
|
|
| |
| for (int v = 0; v < 6; v++) { |
| if (sub_err < state_err[j][v]) { |
| for (int u = 5; u > v; u--) { |
| state_err[j][u] = state_err[j][u-1]; |
| state_ls[j][u] = state_ls[j][u-1]; |
| state_lm[j][u] = state_lm[j][u-1]; |
| } |
| state_err[j][v] = sub_err; |
| state_ls[j][v] = (uint8_t)try_ls; |
| state_lm[j][v] = (uint8_t)try_lm; |
| break; |
| } |
| } |
| } |
| } |
| } |
|
|
| |
| int _tid = 0; |
| #ifdef _OPENMP |
| _tid = omp_get_thread_num(); |
| #endif |
| HPCGraph *sg = _tl_graphs[_tid]; |
| hpc_reset_for_subblock(sg, N_SUB); |
| { |
| float min_sub_err[N_SUB]; |
| for (int j = 0; j < N_SUB; j++) min_sub_err[j] = state_err[j][0]; |
|
|
| |
| for (int j = 0; j < N_SUB; j++) { |
| triality_dft(&sg->locals[j]); |
| double amp_re[6]; |
| double amp_norm = 0.0; |
| for (int v = 0; v < 6; v++) { |
| |
| |
| |
| float err_spread = state_err[j][5] - state_err[j][0]; |
| float sub_temp = (err_spread > 1e-15f) ? err_spread * 0.3f : 0.1f; |
| if (sub_temp < 1e-12f) sub_temp = 1e-12f; |
| amp_re[v] = exp(-(double)(state_err[j][v] - min_sub_err[j]) / (double)sub_temp); |
| amp_norm += amp_re[v] * amp_re[v]; |
| } |
| if (amp_norm > 1e-30) { |
| double inv = 1.0 / sqrt(amp_norm); |
| for (int v = 0; v < 6; v++) amp_re[v] *= inv; |
| } |
| for (int v = 0; v < 6; v++) { |
| sg->locals[j].edge_re[v] = amp_re[v]; |
| sg->locals[j].edge_im[v] = 0.0; |
| } |
| sg->locals[j].primary = VIEW_EDGE; |
| sg->locals[j].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED; |
| sg->locals[j].delta_valid = 0; |
| triality_update_mask(&sg->locals[j]); |
| } |
|
|
| |
| for (int j = 0; j < N_SUB - 1; j++) |
| hpc_cz(sg, j, j + 1); |
|
|
| |
| |
| double sub_marg[N_SUB][6]; |
| int sub_measured[N_SUB]; |
| memset(sub_marg, 0, sizeof(sub_marg)); |
| memset(sub_measured, 0, sizeof(sub_measured)); |
|
|
| shor_measure_graph(sg, N_SUB, sub_marg, sub_measured, 1); |
|
|
| |
| for (int j = 0; j < N_SUB; j++) { |
| double best_prob = -1.0; |
| int best_v = 0; |
| for (int v = 0; v < 6; v++) { |
| if (sub_marg[j][v] > best_prob) { |
| best_prob = sub_marg[j][v]; |
| best_v = v; |
| } |
| } |
| Ls_blk[j] = state_ls[j][best_v]; |
| Lm_blk[j] = state_lm[j][best_v]; |
| } |
| } |
|
|
| |
| uint8_t L[QK_K]; |
| for (int j = 0; j < N_SUB; j++) { |
| float d_sub = dm * (float)Ls_blk[j]; |
| float m_sub = mm * (float)Lm_blk[j]; |
| if (d_sub < 1e-15f) { |
| for (int k = 0; k < 16; k++) L[16*j+k] = 0; |
| continue; |
| } |
| for (int k = 0; k < 16; k++) { |
| int q = gguf_nearest_int((block_x[16*j+k] + m_sub) / d_sub); |
| if (q < 0) q = 0; if (q > 3) q = 3; |
| L[16*j+k] = (uint8_t)q; |
| } |
| } |
|
|
| |
| |
| |
| |
| double Saa = 0, Sab = 0, Sbb = 0, Sxa = 0, Sxb = 0; |
| for (int j = 0; j < N_SUB; j++) { |
| float ls_f = (float)Ls_blk[j]; |
| float lm_f = (float)Lm_blk[j]; |
| for (int k = 0; k < 16; k++) { |
| float x = block_x[16*j+k]; |
| float w = (imat_importance) ? |
| imat_importance[blk * QK_K + 16*j+k] : 1.0f; |
| float a = ls_f * (float)L[16*j+k]; |
| float b = lm_f; |
| Saa += w * a * a; |
| Sab += w * a * b; |
| Sbb += w * b * b; |
| Sxa += w * x * a; |
| Sxb += w * x * b; |
| } |
| } |
|
|
| double det = Saa * Sbb - Sab * Sab; |
| if (fabs(det) > 1e-30) { |
| double d_new = (Sbb * Sxa - Sab * Sxb) / det; |
| double dm_new = (Sab * Sxa - Saa * Sxb) / det; |
| |
| float d_seed = gguf_fp16_to_fp32(candidate_d[blk][cidx]); |
| float m_seed = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]); |
| if (d_new > 0.0 && d_new < 4.0 * (d_seed + 1e-10)) |
| dm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_new)); |
| if (dm_new > 0.0 && dm_new < 4.0 * (m_seed + 1e-10)) |
| mm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)dm_new)); |
| } |
| if (isnan(dm) || isnan(mm)) { |
| printf("NaN detected before ULP: dm=%f mm=%f det=%f\n", dm, mm, det); |
| exit(1); |
| } |
| } |
|
|
| |
| |
| |
| |
| { |
| uint16_t base_d16 = gguf_fp32_to_fp16(dm); |
| uint16_t base_m16 = gguf_fp32_to_fp16(mm); |
| uint16_t best_d16 = base_d16, best_m16 = base_m16; |
| float best_ulp_err = 1e30f; |
|
|
| for (int dd = -2; dd <= 2; dd++) { |
| int cd16 = (int)base_d16 + dd; |
| if (cd16 < 0 || cd16 > 0x7BFF) continue; |
| float trial_dm = gguf_fp16_to_fp32((uint16_t)cd16); |
|
|
| for (int dm_delta = -2; dm_delta <= 2; dm_delta++) { |
| int cm16 = (int)base_m16 + dm_delta; |
| if (cm16 < 0 || cm16 > 0x7BFF) continue; |
| float trial_mm = gguf_fp16_to_fp32((uint16_t)cm16); |
|
|
| float err = 0.0f; |
| for (int j = 0; j < N_SUB; j++) { |
| float d_sub = trial_dm * (float)Ls_blk[j]; |
| float m_sub = trial_mm * (float)Lm_blk[j]; |
| for (int k = 0; k < 16; k++) { |
| float x = block_x[16*j+k]; |
| float w = (imat_importance) ? |
| imat_importance[blk * QK_K + 16*j+k] : 1.0f; |
| int q; |
| if (d_sub < 1e-15f) { q = 0; } |
| else { |
| q = gguf_nearest_int((x + m_sub) / d_sub); |
| if (q < 0) q = 0; if (q > 3) q = 3; |
| } |
| float deq = d_sub * (float)q - m_sub; |
| float diff = x - deq; |
| err += diff * diff * w; |
| } |
| } |
| if (err < best_ulp_err) { |
| best_ulp_err = err; |
| best_d16 = (uint16_t)cd16; |
| best_m16 = (uint16_t)cm16; |
| } |
| } |
| } |
| dm = gguf_fp16_to_fp32(best_d16); |
| mm = gguf_fp16_to_fp32(best_m16); |
| } |
|
|
| |
| |
| |
| |
| for (int j = 0; j < N_SUB; j++) { |
| const float *sx = block_x + 16 * j; |
| float best_sub_err = 1e30f; |
| uint8_t best_ls = Ls_blk[j], best_lm = Lm_blk[j]; |
| int ls_lo = (Ls_blk[j] > 2) ? Ls_blk[j] - 2 : 0; |
| int ls_hi = (Ls_blk[j] < 13) ? Ls_blk[j] + 2 : 15; |
| int lm_lo = (Lm_blk[j] > 2) ? Lm_blk[j] - 2 : 0; |
| int lm_hi = (Lm_blk[j] < 13) ? Lm_blk[j] + 2 : 15; |
| for (int try_ls = ls_lo; try_ls <= ls_hi; try_ls++) { |
| float d_sub = dm * (float)try_ls; |
| for (int try_lm = lm_lo; try_lm <= lm_hi; try_lm++) { |
| float m_sub = mm * (float)try_lm; |
| float sub_err = 0.0f; |
| for (int k = 0; k < 16; k++) { |
| float x = sx[k]; |
| float w = (imat_importance) ? |
| imat_importance[blk * QK_K + 16*j + k] : 1.0f; |
| int q; |
| if (d_sub < 1e-15f) { q = 0; } |
| else { |
| q = gguf_nearest_int((x + m_sub) / d_sub); |
| if (q < 0) q = 0; if (q > 3) q = 3; |
| } |
| float deq = d_sub * (float)q - m_sub; |
| float diff = x - deq; |
| sub_err += diff * diff * w; |
| } |
| if (sub_err < best_sub_err) { |
| best_sub_err = sub_err; |
| best_ls = (uint8_t)try_ls; |
| best_lm = (uint8_t)try_lm; |
| } |
| } |
| } |
| Ls_blk[j] = best_ls; |
| Lm_blk[j] = best_lm; |
| } |
|
|
| |
| output[blk].d = gguf_fp32_to_fp16(dm); |
| output[blk].dmin = gguf_fp32_to_fp16(mm); |
|
|
| for (int j = 0; j < N_SUB; j++) |
| output[blk].scales[j] = Ls_blk[j] | (Lm_blk[j] << 4); |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| uint8_t L[QK_K]; |
| for (int j = 0; j < N_SUB; j++) { |
| float d = dm * (float)(output[blk].scales[j] & 0xF); |
| if (d < 1e-15f) { |
| for (int k = 0; k < 16; k++) L[16 * j + k] = 0; |
| continue; |
| } |
| float m = mm * (float)(output[blk].scales[j] >> 4); |
| float id = 1.0f / d; |
|
|
| |
| int q_base[16]; |
| float q_cont[16]; |
| for (int k = 0; k < 16; k++) { |
| q_cont[k] = (block_x[16*j+k] + m) * id; |
| q_base[k] = gguf_nearest_int(q_cont[k]); |
| if (q_base[k] < 0) q_base[k] = 0; |
| if (q_base[k] > 3) q_base[k] = 3; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| int q_shaped[16]; |
| memcpy(q_shaped, q_base, 16 * sizeof(int)); |
|
|
| |
| for (int g = 0; g < 2; g++) { |
| int g_off = g * 6; |
| if (g_off + 5 >= 16) break; |
|
|
| |
| for (int pass = 0; pass < 6; pass++) { |
| int best_k = -1; |
| int best_q_alt = 0; |
| float best_delta = 0.0f; |
|
|
| |
| float e_cur[6]; |
| for (int kk = 0; kk < 6; kk++) { |
| int ii = g_off + kk; |
| float deq = d * (float)q_shaped[ii] - m; |
| e_cur[kk] = block_x[16*j+ii] - deq; |
| } |
|
|
| |
| float vesica_cur = 0.0f, dc_cur = 0.0f; |
| for (int p = 0; p < 3; p++) { |
| float v = e_cur[p] + e_cur[p+3]; |
| vesica_cur += v * v; |
| } |
| for (int kk = 0; kk < 6; kk++) dc_cur += e_cur[kk]; |
| float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur; |
|
|
| |
| for (int k = 0; k < 6; k++) { |
| int idx = g_off + k; |
| int q_cur = q_shaped[idx]; |
|
|
| |
| int q_try; |
| if (q_cont[idx] - (float)q_cur >= 0) { |
| q_try = q_cur + 1; |
| } else { |
| q_try = q_cur - 1; |
| } |
| if (q_try < 0 || q_try > 3) continue; |
|
|
| |
| float e_alt[6]; |
| for (int kk = 0; kk < 6; kk++) e_alt[kk] = e_cur[kk]; |
| float deq_try = d * (float)q_try - m; |
| e_alt[k] = block_x[16*j+idx] - deq_try; |
|
|
| |
| float vesica_alt = 0.0f, dc_alt = 0.0f; |
| for (int p = 0; p < 3; p++) { |
| float v = e_alt[p] + e_alt[p+3]; |
| vesica_alt += v * v; |
| } |
| for (int kk = 0; kk < 6; kk++) dc_alt += e_alt[kk]; |
| float metric_alt = 4.0f * vesica_alt + dc_alt * dc_alt; |
|
|
| float delta = metric_cur - metric_alt; |
| if (delta > best_delta) { |
| best_delta = delta; |
| best_k = k; |
| best_q_alt = q_try; |
| } |
| } |
|
|
| if (best_k < 0) break; |
| q_shaped[g_off + best_k] = best_q_alt; |
| } |
| } |
|
|
| |
| |
| |
| float err_base = 0.0f, err_shaped = 0.0f; |
| for (int k = 0; k < 16; k++) { |
| float x = block_x[16*j+k]; |
| float w = (imat_importance) ? |
| imat_importance[blk * QK_K + 16*j + k] : 1.0f; |
| float deq_b = d * (float)q_base[k] - m; |
| float deq_s = d * (float)q_shaped[k] - m; |
| err_base += (x - deq_b) * (x - deq_b) * w; |
| err_shaped += (x - deq_s) * (x - deq_s) * w; |
| } |
|
|
| int *q_final = (err_shaped <= err_base * 1.05f) ? q_shaped : q_base; |
| for (int k = 0; k < 16; k++) |
| L[16 * j + k] = (uint8_t)q_final[k]; |
| } |
|
|
| for (int j = 0; j < QK_K; j += 128) { |
| for (int l = 0; l < 32; l++) { |
| output[blk].qs[j / 4 + l] = L[j + l] |
| | (L[j + l + 32] << 2) |
| | (L[j + l + 64] << 4) |
| | (L[j + l + 96] << 6); |
| } |
| } |
|
|
| float berr = gguf_q2_k_block_error(block_x, &output[blk]); |
| if (isnan(berr)) { |
| printf("NaN block error at blk %ld! dm=%f mm=%f\n", (long)blk, dm, mm); |
| for (int j=0; j<16; j++) printf("Ls[%d]=%d Lm[%d]=%d\n", j, Ls_blk[j], j, Lm_blk[j]); |
| exit(1); |
| } |
| total_err += berr; |
| } |
|
|
| |
| for (int _ti = 0; _ti < _n_omp_threads; _ti++) |
| hpc_destroy(_tl_graphs[_ti]); |
| free(_tl_graphs); |
|
|
| free(seeds); |
| free(candidate_errors); |
| free(candidate_d); |
| free(candidate_dmin); |
| free(candidate_Ls); |
| free(candidate_Lm); |
| free(best_candidate); |
| if (out_total_error) *out_total_error = total_err; |
| |
| if (verbose) { |
| float rmse = sqrtf(total_err / (float)n_elements); |
|
|
| |
| double w_sum2 = 0.0; |
| for (int64_t i = 0; i < n_elements; i++) |
| w_sum2 += (double)weights[i] * (double)weights[i]; |
| float w_sigma = (float)sqrt(w_sum2 / (double)n_elements); |
| float rmse_over_sigma = (w_sigma > 1e-15f) ? rmse / w_sigma : 0.0f; |
|
|
| |
| const char *fidelity_class; |
| const char *fidelity_icon; |
| if (rmse <= 1.0e-04f) { |
| fidelity_class = "ULTRA (β€1e-04)"; |
| fidelity_icon = "β
β
β
β
"; |
| } else if (rmse <= 3.0e-04f) { |
| fidelity_class = "HIGH (β€3e-04)"; |
| fidelity_icon = "β
β
β
β"; |
| } else if (rmse <= 1.0e-03f) { |
| fidelity_class = "GOOD (β€1e-03)"; |
| fidelity_icon = "β
β
ββ"; |
| } else { |
| fidelity_class = "STANDARD"; |
| fidelity_icon = "β
βββ"; |
| } |
|
|
| printf("\n βββββ Shor Measurement Q2_K Report βββββββββββββββββββββββββββββββββ\n"); |
| printf(" β Elements: %-12lld Blocks: %-12lld β\n", |
| (long long)n_elements, (long long)(n_elements / QK_K)); |
| printf(" β Weight Ο: %-12.4e Range: [%.4e, %.4e] β\n", |
| w_sigma, w_sigma * -4.0f, w_sigma * 4.0f); |
| printf(" β Total MSE: %-12.6f β\n", total_err); |
| printf(" β RMSE: %-12.4e RMSE/Ο: %-8.4f β\n", |
| rmse, rmse_over_sigma); |
| printf(" β Fidelity: %s %-14s β\n", |
| fidelity_icon, fidelity_class); |
| printf(" β Engine: Shor Griffiths-Niu (IDFT6 + feed-forward) β\n"); |
| printf(" βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n"); |
| } |
| } |
|
|
|
|
| |
| |
| |
|
|
/*
 * Draw a single-line textual progress bar on stdout.
 *
 * current    - number of items finished so far
 * total      - number of items overall; nothing is printed when total <= 0
 * label      - text appended after the timing fields; NULL is printed as ""
 * start_time - clock() value captured when the work started
 *
 * The line begins with '\r' so that successive calls overwrite one another,
 * and a trailing newline is written once current == total.  Elapsed time and
 * ETA are derived from clock(), i.e. they are based on processor time as
 * reported by the C library rather than on a wall-clock source.
 *
 * The completed fraction is clamped to [0, 1]; the raw current/total pair is
 * still shown verbatim in the "(current/total)" field.
 */
static void print_progress_bar(int current, int total, const char *label,
                               clock_t start_time)
{
    enum { BAR_WIDTH = 40 };

    if (total <= 0) {
        return;
    }

    /* Clamp so an out-of-range `current` cannot yield a percentage above
     * 100 / below 0 or a negative ETA. */
    float pct = (float)current / (float)total;
    if (pct < 0.0f) {
        pct = 0.0f;
    } else if (pct > 1.0f) {
        pct = 1.0f;
    }

    const int filled = (int)(pct * BAR_WIDTH);

    const double elapsed = (double)(clock() - start_time) / CLOCKS_PER_SEC;
    /* Below 1% progress the linear extrapolation is too noisy: report 0. */
    const double eta = (pct > 0.01f) ? elapsed / pct * (1.0 - pct) : 0.0;

    /* Passing a null pointer to %s is undefined, so substitute "". */
    const char *shown_label = (label != NULL) ? label : "";

    /* NOTE(review): the three glyph literals below are identical in this copy
     * of the file, which looks like an encoding mix-up -- confirm the intended
     * filled / head / empty characters against the upstream source. */
    printf("\r [");
    for (int i = 0; i < BAR_WIDTH; i++) {
        if (i < filled) {
            printf("β");
        } else if (i == filled) {
            printf("β");
        } else {
            printf("β");
        }
    }
    printf("] %3d%% (%d/%d) %.0fs ETA:%.0fs %s",
           (int)(pct * 100), current, total, elapsed, eta, shown_label);
    fflush(stdout);

    /* Finish the line once the final item has been reported. */
    if (current == total) {
        printf("\n");
    }
}
|
|
| |
| |
| |
|
|
| static int write_gguf(const char *output_path, const STMultiFile *mf, |
| const ModelArchitecture *arch, |
| const TokenizerData *tokenizer, |
| OptimizerMode opt_mode, |
| const IMatrixData *imatrix, |
| int verbose) |
| { |
| FILE *fp = fopen(output_path, "wb"); |
| if (!fp) { |
| fprintf(stderr, " ERROR: Cannot open '%s' for writing\n", output_path); |
| return -1; |
| } |
|
|
| printf("\n ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n"); |
| printf(" β WRITING GGUF FILE β\n"); |
| printf(" ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n\n"); |
|
|
| |
| int *include_list = (int *)calloc(mf->n_tensors, sizeof(int)); |
| int n_include = 0; |
| for (int i = 0; i < mf->n_tensors; i++) { |
| if (!should_skip_tensor(mf->tensor_map[i].name)) { |
| include_list[n_include++] = i; |
| } else { |
| if (verbose) printf(" SKIP: %s (not needed in GGUF)\n", mf->tensor_map[i].name); |
| } |
| } |
|
|
| |
| int n_kv = 0; |
| n_kv++; |
| n_kv++; |
| n_kv++; |
| n_kv++; |
| n_kv++; |
| n_kv++; |
| n_kv++; |
| n_kv++; |
| n_kv++; |
| n_kv++; |
| n_kv++; |
| n_kv++; |
| n_kv++; |
|
|
| |
| int has_tokenizer = (tokenizer != NULL && tokenizer->vocab_size > 0); |
| if (has_tokenizer) { |
| n_kv++; |
| n_kv++; |
| n_kv++; |
| n_kv++; |
| n_kv++; |
| n_kv++; |
| n_kv++; |
| if (tokenizer->n_merges > 0) |
| n_kv++; |
| } |
|
|
| |
| |
| |
| |
| int has_lm_head = (st_multi_find_tensor(mf, "lm_head.weight") >= 0); |
| int total_tensors = n_include; |
|
|
| if (arch->tie_word_embeddings && !has_lm_head) { |
| printf(" Weight-tied embeddings detected β llama.cpp handles internally\n\n"); |
| } |
|
|
| |
| char (*gguf_names)[ST_MAX_NAME_LEN] = calloc(total_tensors, ST_MAX_NAME_LEN); |
| GGMLType *tensor_types = calloc(total_tensors, sizeof(GGMLType)); |
| int64_t *tensor_sizes = calloc(total_tensors, sizeof(int64_t)); |
| uint64_t data_offset = 0; |
| uint64_t *tensor_offsets = calloc(total_tensors, sizeof(uint64_t)); |
| int *tensor_src_idx = calloc(total_tensors, sizeof(int)); |
| char (*tensor_hf_names)[ST_MAX_NAME_LEN] = calloc(total_tensors, ST_MAX_NAME_LEN); |
|
|
| GGMLType quant_type = GGML_TYPE_Q2_K; |
|
|
| for (int i = 0; i < n_include; i++) { |
| int src = include_list[i]; |
| const STTensorInfo *ti = st_multi_tensor_info(mf, src); |
| map_tensor_name(mf->tensor_map[src].name, gguf_names[i], ST_MAX_NAME_LEN); |
| strncpy(tensor_hf_names[i], mf->tensor_map[src].name, ST_MAX_NAME_LEN - 1); |
| tensor_src_idx[i] = src; |
|
|
| if (should_quantize(ti, gguf_names[i])) { |
| if (is_attention_tensor(gguf_names[i])) { |
| |
| |
| tensor_types[i] = GGML_TYPE_Q4_0; |
| int64_t n_blocks_q4 = (ti->n_elements + QK4_0 - 1) / QK4_0; |
| tensor_sizes[i] = n_blocks_q4 * sizeof(BlockQ4_0); |
| if (verbose) |
| printf(" [ATTNβQ4_0] %s (%ld elements)\n", |
| gguf_names[i], (long)ti->n_elements); |
| } else { |
| tensor_types[i] = quant_type; |
| tensor_sizes[i] = ggml_type_size(quant_type, ti->n_elements); |
| } |
| } else if (ti->n_dims >= 2) { |
| |
| tensor_types[i] = GGML_TYPE_F16; |
| tensor_sizes[i] = ti->n_elements * sizeof(uint16_t); |
| } else { |
| |
| tensor_types[i] = GGML_TYPE_F32; |
| tensor_sizes[i] = ti->n_elements * sizeof(float); |
| } |
|
|
| tensor_offsets[i] = data_offset; |
|
|
| |
| data_offset += tensor_sizes[i]; |
| data_offset = (data_offset + GGUF_DEFAULT_ALIGNMENT - 1) & |
| ~(uint64_t)(GGUF_DEFAULT_ALIGNMENT - 1); |
| } |
|
|
| |
| gguf_write_header(fp, total_tensors, n_kv); |
|
|
| |
| gguf_write_kv_string(fp, "general.architecture", arch->architecture); |
| gguf_write_kv_string(fp, "general.name", arch->name); |
| gguf_write_kv_uint32(fp, "general.quantization_version", 2); |
| gguf_write_kv_uint32(fp, "general.file_type", 10); |
|
|
| char kbuf[128]; |
| snprintf(kbuf, sizeof(kbuf), "%s.context_length", arch->architecture); |
| gguf_write_kv_uint32(fp, kbuf, arch->context_length); |
|
|
| snprintf(kbuf, sizeof(kbuf), "%s.embedding_length", arch->architecture); |
| gguf_write_kv_uint32(fp, kbuf, arch->embedding_length); |
|
|
| snprintf(kbuf, sizeof(kbuf), "%s.block_count", arch->architecture); |
| gguf_write_kv_uint32(fp, kbuf, arch->block_count); |
|
|
| snprintf(kbuf, sizeof(kbuf), "%s.feed_forward_length", arch->architecture); |
| gguf_write_kv_uint32(fp, kbuf, arch->feed_forward_length); |
|
|
| snprintf(kbuf, sizeof(kbuf), "%s.attention.head_count", arch->architecture); |
| gguf_write_kv_uint32(fp, kbuf, arch->head_count); |
|
|
| snprintf(kbuf, sizeof(kbuf), "%s.attention.head_count_kv", arch->architecture); |
| gguf_write_kv_uint32(fp, kbuf, arch->head_count_kv); |
|
|
| snprintf(kbuf, sizeof(kbuf), "%s.attention.layer_norm_rms_epsilon", arch->architecture); |
| gguf_write_kv_float32(fp, kbuf, arch->rms_norm_eps); |
|
|
| snprintf(kbuf, sizeof(kbuf), "%s.rope.freq_base", arch->architecture); |
| gguf_write_kv_float32(fp, kbuf, arch->rope_freq_base); |
|
|
| snprintf(kbuf, sizeof(kbuf), "%s.vocab_size", arch->architecture); |
| gguf_write_kv_uint32(fp, kbuf, arch->vocab_size); |
|
|
| |
| if (has_tokenizer) { |
| gguf_write_kv_string(fp, "tokenizer.ggml.model", tokenizer->model_type); |
| gguf_write_kv_string_array(fp, "tokenizer.ggml.tokens", |
| (const char **)tokenizer->tokens, |
| (uint64_t)tokenizer->vocab_size); |
| gguf_write_kv_float32_array(fp, "tokenizer.ggml.scores", |
| tokenizer->scores, |
| (uint64_t)tokenizer->vocab_size); |
| gguf_write_kv_int32_array(fp, "tokenizer.ggml.token_type", |
| tokenizer->token_types, |
| (uint64_t)tokenizer->vocab_size); |
| gguf_write_kv_uint32(fp, "tokenizer.ggml.bos_token_id", |
| (uint32_t)tokenizer->bos_id); |
| gguf_write_kv_uint32(fp, "tokenizer.ggml.eos_token_id", |
| (uint32_t)tokenizer->eos_id); |
| gguf_write_kv_uint32(fp, "tokenizer.ggml.unknown_token_id", |
| (uint32_t)tokenizer->unk_id); |
| if (tokenizer->n_merges > 0) { |
| gguf_write_kv_string_array(fp, "tokenizer.ggml.merges", |
| (const char **)tokenizer->merges, |
| (uint64_t)tokenizer->n_merges); |
| } |
| printf(" Tokenizer metadata written (%d tokens, %d merges)\n\n", |
| tokenizer->vocab_size, tokenizer->n_merges); |
| } |
|
|
| |
| for (int i = 0; i < total_tensors; i++) { |
| int src = tensor_src_idx[i]; |
| const STTensorInfo *ti = st_multi_tensor_info(mf, src); |
| uint64_t dims[ST_MAX_DIMS]; |
| |
| int nd = ti->n_dims; |
| for (int d = 0; d < nd; d++) { |
| dims[d] = (uint64_t)ti->shape[nd - 1 - d]; |
| } |
| gguf_write_tensor_info(fp, gguf_names[i], |
| ti->n_dims, dims, |
| tensor_types[i], tensor_offsets[i]); |
| } |
|
|
| |
| gguf_write_padding(fp, GGUF_DEFAULT_ALIGNMENT); |
|
|
| |
| printf(" Quantizing and writing %d tensors...\n\n", total_tensors); |
|
|
| float total_error_sum = 0.0f; |
| int quant_count = 0; |
| int64_t total_elements_quantized = 0; |
| int64_t total_bytes_quantized = 0; |
| int64_t total_bytes_unquantized = 0; |
| clock_t quant_start = clock(); |
|
|
| for (int i = 0; i < total_tensors; i++) { |
| int src = tensor_src_idx[i]; |
| const STTensorInfo *ti = st_multi_tensor_info(mf, src); |
|
|
| print_progress_bar(i, total_tensors, gguf_names[i], quant_start); |
|
|
| if (tensor_types[i] == GGML_TYPE_Q2_K) { |
| |
| float *f32_data = st_multi_tensor_to_f32(mf, src); |
| if (!f32_data) { |
| fprintf(stderr, "\n ERROR: Failed to convert tensor '%s' to F32\n", |
| ti->name); |
| continue; |
| } |
|
|
| int64_t n_elements = ti->n_elements; |
| float tensor_error = 0.0f; |
|
|
| |
| int64_t padded = (n_elements + QK_K - 1) / QK_K * QK_K; |
| if (padded > n_elements) { |
| f32_data = realloc(f32_data, padded * sizeof(float)); |
| for (int64_t j = n_elements; j < padded; j++) |
| f32_data[j] = 0.0f; |
| n_elements = padded; |
| } |
|
|
| int64_t n_blocks = n_elements / QK_K; |
| BlockQ2K *quant_data = calloc(n_blocks, sizeof(BlockQ2K)); |
|
|
| |
| const float *imp = NULL; |
| if (imatrix) { |
| const IMatrixEntry *ime = imatrix_find_any(imatrix, |
| gguf_names[i], tensor_hf_names[i]); |
| if (ime && ime->n_values > 0) { |
| imp = ime->normalized; |
| if (verbose) |
| printf("\n imatrix: using %d importance weights for %s\n", |
| ime->n_values, gguf_names[i]); |
| } |
| } |
|
|
| quantize_tensor_q2k_hpc(f32_data, n_elements, |
| quant_data, &tensor_error, |
| opt_mode, imp, verbose); |
|
|
| fwrite(quant_data, sizeof(BlockQ2K), n_blocks, fp); |
|
|
| float rmse = sqrtf(tensor_error / (float)ti->n_elements); |
|
|
| |
| double wss = 0.0; |
| for (int64_t j = 0; j < ti->n_elements; j++) |
| wss += (double)f32_data[j] * (double)f32_data[j]; |
| float w_sig = (float)sqrt(wss / (double)ti->n_elements); |
|
|
| |
| const char *fid; |
| if (rmse <= 1.0e-04f) fid = "β
β
β
β
ULTRA"; |
| else if (rmse <= 3.0e-04f) fid = "β
β
β
β HIGH"; |
| else if (rmse <= 1.0e-03f) fid = "β
β
ββ GOOD"; |
| else fid = "β
βββ STD"; |
|
|
| if (verbose) { |
| printf("\n [Q2_KΒ·Shor] %-47s\n", gguf_names[i]); |
| printf(" %10ld elements β %ld bytes Ο=%.2e RMSE=%.4e %s\n", |
| (long)ti->n_elements, |
| (long)(n_blocks * sizeof(BlockQ2K)), |
| w_sig, rmse, fid); |
| } |
|
|
| total_error_sum += tensor_error; |
| total_elements_quantized += ti->n_elements; |
| total_bytes_quantized += n_blocks * sizeof(BlockQ2K); |
| quant_count++; |
|
|
| free(quant_data); |
| free(f32_data); |
| } else if (tensor_types[i] == GGML_TYPE_Q4_0) { |
| |
| float *f32_data = st_multi_tensor_to_f32(mf, src); |
| if (!f32_data) { |
| fprintf(stderr, "\n ERROR: Failed to convert tensor '%s' to F32\n", |
| ti->name); |
| continue; |
| } |
|
|
| int64_t n_elements = ti->n_elements; |
|
|
| |
| int64_t padded = (n_elements + QK4_0 - 1) / QK4_0 * QK4_0; |
| if (padded > n_elements) { |
| f32_data = realloc(f32_data, padded * sizeof(float)); |
| for (int64_t j = n_elements; j < padded; j++) |
| f32_data[j] = 0.0f; |
| n_elements = padded; |
| } |
|
|
| int64_t n_blocks_q4 = n_elements / QK4_0; |
| BlockQ4_0 *q4_data = calloc(n_blocks_q4, sizeof(BlockQ4_0)); |
| float tensor_error = 0.0f; |
|
|
| |
| const float *imp = NULL; |
| if (imatrix) { |
| const IMatrixEntry *ime = imatrix_find_any(imatrix, |
| gguf_names[i], tensor_hf_names[i]); |
| if (ime && ime->n_values > 0) { |
| imp = ime->normalized; |
| if (verbose) |
| printf("\n imatrix: using %d importance weights for %s\n", |
| ime->n_values, gguf_names[i]); |
| } |
| } |
|
|
| quantize_tensor_q4_0_hpc(f32_data, n_elements, |
| q4_data, &tensor_error, |
| imp, verbose); |
|
|
| fwrite(q4_data, sizeof(BlockQ4_0), n_blocks_q4, fp); |
|
|
| float rmse = sqrtf(tensor_error / (float)ti->n_elements); |
|
|
| |
| double wss4 = 0.0; |
| for (int64_t j = 0; j < ti->n_elements; j++) |
| wss4 += (double)f32_data[j] * (double)f32_data[j]; |
| float w_sig4 = (float)sqrt(wss4 / (double)ti->n_elements); |
|
|
| const char *fid4; |
| if (rmse <= 1.0e-04f) fid4 = "β
β
β
β
ULTRA"; |
| else if (rmse <= 3.0e-04f) fid4 = "β
β
β
β HIGH"; |
| else if (rmse <= 1.0e-03f) fid4 = "β
β
ββ GOOD"; |
| else fid4 = "β
βββ STD"; |
|
|
| if (verbose) { |
| printf("\n [Q4_0Β·Shor] %-47s\n", gguf_names[i]); |
| printf(" %10ld elements β %ld bytes Ο=%.2e RMSE=%.4e %s\n", |
| (long)ti->n_elements, |
| (long)(n_blocks_q4 * sizeof(BlockQ4_0)), |
| w_sig4, rmse, fid4); |
| } |
|
|
| total_error_sum += tensor_error; |
| total_elements_quantized += ti->n_elements; |
| total_bytes_quantized += n_blocks_q4 * sizeof(BlockQ4_0); |
| quant_count++; |
|
|
| free(q4_data); |
| free(f32_data); |
| } else if (tensor_types[i] == GGML_TYPE_F16) { |
| |
| float *f32_data = st_multi_tensor_to_f32(mf, src); |
| if (!f32_data) { |
| fprintf(stderr, "\n ERROR: Failed to convert tensor '%s'\n", |
| ti->name); |
| continue; |
| } |
|
|
| |
| uint16_t *f16_data = (uint16_t *)malloc(ti->n_elements * sizeof(uint16_t)); |
| for (int64_t j = 0; j < ti->n_elements; j++) |
| f16_data[j] = gguf_fp32_to_fp16(f32_data[j]); |
|
|
| fwrite(f16_data, sizeof(uint16_t), ti->n_elements, fp); |
|
|
| total_bytes_unquantized += ti->n_elements * sizeof(uint16_t); |
|
|
| if (verbose) { |
| printf("\n [F16 ] %-50s %10ld elements β %ld bytes\n", |
| gguf_names[i], (long)ti->n_elements, |
| (long)(ti->n_elements * sizeof(uint16_t))); |
| } |
|
|
| free(f16_data); |
| free(f32_data); |
| } else { |
| |
| float *f32_data = st_multi_tensor_to_f32(mf, src); |
| if (!f32_data) { |
| fprintf(stderr, "\n ERROR: Failed to convert tensor '%s'\n", |
| ti->name); |
| continue; |
| } |
|
|
| fwrite(f32_data, sizeof(float), ti->n_elements, fp); |
|
|
| total_bytes_unquantized += ti->n_elements * sizeof(float); |
|
|
| if (verbose) { |
| printf("\n [F32 ] %-50s %10ld elements β %ld bytes\n", |
| gguf_names[i], (long)ti->n_elements, |
| (long)(ti->n_elements * sizeof(float))); |
| } |
|
|
| free(f32_data); |
| } |
|
|
| |
| gguf_write_padding(fp, GGUF_DEFAULT_ALIGNMENT); |
| } |
|
|
| print_progress_bar(total_tensors, total_tensors, "done", quant_start); |
|
|
| long final_size = ftell(fp); |
| fclose(fp); |
|
|
| |
| |
| int64_t original_f32_size = 0; |
| for (int i = 0; i < total_tensors; i++) { |
| const STTensorInfo *ti = st_multi_tensor_info(mf, tensor_src_idx[i]); |
| original_f32_size += ti->n_elements * sizeof(float); |
| } |
| float compression_ratio = (original_f32_size > 0) ? |
| (float)original_f32_size / (float)final_size : 0.0f; |
| float effective_bpw = (total_elements_quantized > 0) ? |
| 8.0f * (float)total_bytes_quantized / (float)total_elements_quantized : |
| 0.0f; |
| float total_rmse = (total_elements_quantized > 0) ? |
| sqrtf(total_error_sum / (float)total_elements_quantized) : 0.0f; |
| float mean_mse_per_tensor = (quant_count > 0) ? |
| total_error_sum / (float)quant_count : 0.0f; |
|
|
| |
| const char *overall_fid, *overall_icon; |
| if (total_rmse <= 1.0e-04f) { overall_fid = "ULTRA (β€1e-04)"; overall_icon = "β
β
β
β
"; } |
| else if (total_rmse <= 3.0e-04f) { overall_fid = "HIGH (β€3e-04)"; overall_icon = "β
β
β
β"; } |
| else if (total_rmse <= 1.0e-03f) { overall_fid = "GOOD (β€1e-03)"; overall_icon = "β
β
ββ"; } |
| else { overall_fid = "STANDARD"; overall_icon = "β
βββ"; } |
|
|
| printf("\n ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n"); |
| printf(" β SHOR-OPTIMIZED QUANTIZATION SUMMARY β\n"); |
| printf(" β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£\n"); |
| printf(" β β\n"); |
| printf(" β Engine: Griffiths-Niu Sequential Measurement β\n"); |
| printf(" β Protocol: IDFT6 β feed-forward β Born β collapse β\n"); |
| printf(" β Origin: tesseract_factor.c (Shor's algorithm) β\n"); |
| printf(" β β\n"); |
| printf(" β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£\n"); |
| printf(" β Tensors quantized: %-33d β\n", quant_count); |
| printf(" β Elements quantized: %15ld β\n", |
| (long)total_elements_quantized); |
| printf(" β Quantized data: %12ld bytes (%6.1f MB) β\n", |
| (long)total_bytes_quantized, |
| (double)total_bytes_quantized / (1024.0 * 1024.0)); |
| printf(" β Unquantized data: %12ld bytes (%6.1f MB) β\n", |
| (long)total_bytes_unquantized, |
| (double)total_bytes_unquantized / (1024.0 * 1024.0)); |
| printf(" β Effective bits/weight: %15.2f β\n", |
| effective_bpw); |
| printf(" β Compression ratio: %15.1fx β\n", |
| compression_ratio); |
| printf(" β β\n"); |
| printf(" β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£\n"); |
| printf(" β FIDELITY METRICS (target: 1e-04) β\n"); |
| printf(" β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£\n"); |
| printf(" β β\n"); |
| printf(" β Total MSE: %15.6e β\n", |
| total_error_sum); |
| printf(" β Per-element RMSE: %15.4e β\n", |
| total_rmse); |
| printf(" β Mean MSE/tensor: %15.6e β\n", |
| mean_mse_per_tensor); |
| printf(" β β\n"); |
| printf(" β Fidelity class: %s %-14s β\n", |
| overall_icon, overall_fid); |
| if (total_rmse <= 1.0e-04f) |
| printf(" β β RMSE β€ 1e-04: TARGET MET β maximum fidelity achieved β\n"); |
| else if (total_rmse <= 3.0e-04f) |
| printf(" β β RMSE β€ 3e-04: near target β high fidelity achieved β\n"); |
| else |
| printf(" β β RMSE > 3e-04: below target β weight Ο may be large β\n"); |
| printf(" β β\n"); |
| printf(" β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£\n"); |
| printf(" β Output file: %ld bytes (%.1f MB)%*sβ\n", |
| final_size, (double)final_size / (1024.0 * 1024.0), |
| (int)(27 - snprintf(NULL, 0, "%ld bytes (%.1f MB)", |
| final_size, (double)final_size / (1024.0 * 1024.0))), ""); |
| printf(" ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n\n"); |
|
|
| free(include_list); |
| free(gguf_names); |
| free(tensor_types); |
| free(tensor_sizes); |
| free(tensor_offsets); |
| free(tensor_src_idx); |
| free(tensor_hf_names); |
|
|
| return 0; |
| } |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
/*
 * One-time setup for callers that use this file as a library.
 *
 * The first call seeds rand() with the fixed value 42 and then runs
 * triality_exotic_init(), s6_exotic_init() and triality_stats_reset(),
 * in that order.  Every later call returns immediately.
 *
 * The "already done" state is a plain function-local static flag; this
 * function performs no locking of its own.
 */
void hexstate_init(void)
{
    static int already_set_up = 0;

    if (already_set_up)
        return;

    srand(42);
    triality_exotic_init();
    s6_exotic_init();
    triality_stats_reset();

    /* Flag is raised only after every init step above has run. */
    already_set_up = 1;
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| void hexstate_quantize_tensor_q2k(const float *weights, int64_t n_elements, |
| void *output, float *out_error, |
| int opt_mode, int verbose) |
| { |
| hexstate_init(); |
| quantize_tensor_q2k_hpc(weights, n_elements, |
| (BlockQ2K *)output, out_error, |
| (OptimizerMode)opt_mode, NULL, verbose); |
| } |
|
|
| |
| void hexstate_quantize_tensor_q2k_imat(const float *weights, int64_t n_elements, |
| void *output, float *out_error, |
| int opt_mode, |
| const float *imat_importance, |
| int verbose) |
| { |
| hexstate_init(); |
| quantize_tensor_q2k_hpc(weights, n_elements, |
| (BlockQ2K *)output, out_error, |
| (OptimizerMode)opt_mode, imat_importance, verbose); |
| } |
|
|
| |
| int hexstate_q2k_block_bytes(void) { return sizeof(BlockQ2K); } |
| int hexstate_q2k_block_elements(void) { return QK_K; } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| void hexstate_quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, |
| void *output, float *out_error, |
| const float *imat_importance, |
| int verbose) |
| { |
| hexstate_init(); |
| float err = 0.0f; |
| quantize_tensor_q4_0_hpc(weights, n_elements, |
| (BlockQ4_0 *)output, &err, |
| imat_importance, verbose); |
| if (out_error) *out_error = err; |
| } |
|
|
| #ifndef HEXSTATE_LIBRARY |
| |
| |
| |
|
|
| int main(int argc, char **argv) |
| { |
| srand(time(NULL)); |
|
|
| |
| triality_exotic_init(); |
| s6_exotic_init(); |
| triality_stats_reset(); |
|
|
| printf("\n"); |
| printf(" ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n"); |
| printf(" β β\n"); |
| printf(" β HExState GGUF QUANTIZER v3.0 β Shor-Optimized β\n"); |
| printf(" β β\n"); |
| printf(" β Architecture: HPCGraph Sensitivity Propagation β\n"); |
| printf(" β Optimization: Shor's Griffiths-Niu Measurement + iMatrix β\n"); |
| printf(" β Output: GGUF v3 (Q2_K, 2.625 bpw) β\n"); |
| printf(" β β\n"); |
| printf(" β \"The weight and the quantized are opposite faces.\" β\n"); |
| printf(" β β\n"); |
| printf(" ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n\n"); |
|
|
| if (argc < 3) { |
| printf(" Usage: %s <input> <output.gguf> [options]\n\n", argv[0]); |
| printf(" Input:\n"); |
| printf(" Single .safetensors file, or\n"); |
| printf(" Model directory with sharded .safetensors files\n\n"); |
| printf(" Options:\n"); |
| printf(" --optimizer hpc|mse|hybrid Scale optimization (default: hybrid)\n"); |
| printf(" --imatrix <file> Importance matrix for Q2_K quality\n"); |
| printf(" --config <file> Explicit config.json for arch detection\n"); |
| printf(" --qwen Force Qwen 3.5/3.6 architecture\n"); |
| printf(" --verbose Per-block diagnostics\n\n"); |
| return 1; |
| } |
|
|
| const char *input_path = argv[1]; |
| const char *output_path = argv[2]; |
| OptimizerMode opt_mode = OPT_HYBRID; |
| const char *imatrix_path = NULL; |
| const char *config_override = NULL; |
| int verbose = 0; |
| int force_qwen = 0; |
|
|
| |
| for (int i = 3; i < argc; i++) { |
| if (strcmp(argv[i], "--optimizer") == 0 && i + 1 < argc) { |
| i++; |
| if (strcmp(argv[i], "hpc") == 0) opt_mode = OPT_HPC; |
| else if (strcmp(argv[i], "mse") == 0) opt_mode = OPT_MSE; |
| else if (strcmp(argv[i], "hybrid") == 0) opt_mode = OPT_HYBRID; |
| else { |
| fprintf(stderr, " ERROR: Unknown optimizer '%s'. Use hpc, mse, or hybrid.\n", argv[i]); |
| return 1; |
| } |
| } else if (strcmp(argv[i], "--imatrix") == 0 && i + 1 < argc) { |
| imatrix_path = argv[++i]; |
| } else if (strcmp(argv[i], "--config") == 0 && i + 1 < argc) { |
| config_override = argv[++i]; |
| } else if (strcmp(argv[i], "--qwen") == 0) { |
| force_qwen = 1; |
| } else if (strcmp(argv[i], "--verbose") == 0) { |
| verbose = 1; |
| } else { |
| fprintf(stderr, " ERROR: Unknown option '%s'\n", argv[i]); |
| return 1; |
| } |
| } |
|
|
| const char *opt_names[] = {"HPC (BP only)", "MSE (grid search)", "Hybrid (HPC+MSE)"}; |
| printf(" Input: %s\n", input_path); |
| printf(" Output: %s\n", output_path); |
| printf(" Quant type: Q2_K (2.625 bpw)\n"); |
| printf(" Optimizer: %s\n", opt_names[opt_mode]); |
| if (imatrix_path) printf(" iMatrix: %s\n", imatrix_path); |
| if (config_override) printf(" Config: %s\n", config_override); |
| if (force_qwen) printf(" Model: Qwen 3.5/3.6 (forced via --qwen)\n"); |
| printf("\n"); |
|
|
| |
| printf(" Phase 1: Loading model...\n"); |
| clock_t t_start = clock(); |
|
|
| |
| struct stat st; |
| if (stat(input_path, &st) != 0) { |
| fprintf(stderr, " ERROR: Cannot access '%s'\n", input_path); |
| return 1; |
| } |
|
|
| STMultiFile *mf = NULL; |
| char input_dir[512] = ""; |
|
|
| if (S_ISDIR(st.st_mode)) { |
| |
| mf = st_open_dir(input_path); |
| strncpy(input_dir, input_path, sizeof(input_dir) - 2); |
| int dlen = strlen(input_dir); |
| if (dlen > 0 && input_dir[dlen - 1] != '/') { |
| input_dir[dlen] = '/'; |
| input_dir[dlen + 1] = '\0'; |
| } |
| } else { |
| |
| STFile *sf = st_open(input_path); |
| if (!sf) { |
| fprintf(stderr, " ERROR: Failed to open '%s'\n", input_path); |
| return 1; |
| } |
| mf = (STMultiFile *)calloc(1, sizeof(STMultiFile)); |
| mf->shards[0] = sf; |
| mf->n_shards = 1; |
| for (int i = 0; i < sf->n_tensors && mf->n_tensors < ST_MAX_TENSORS; i++) { |
| strncpy(mf->tensor_map[mf->n_tensors].name, |
| sf->tensors[i].name, ST_MAX_NAME_LEN - 1); |
| mf->tensor_map[mf->n_tensors].shard_idx = 0; |
| mf->tensor_map[mf->n_tensors].tensor_idx = i; |
| mf->n_tensors++; |
| } |
|
|
| |
| strncpy(input_dir, input_path, sizeof(input_dir) - 1); |
| char *last_slash = strrchr(input_dir, '/'); |
| if (last_slash) { |
| *(last_slash + 1) = '\0'; |
| } else { |
| strcpy(input_dir, "./"); |
| } |
| } |
|
|
| if (!mf) { |
| fprintf(stderr, " ERROR: Failed to load model from '%s'\n", input_path); |
| return 1; |
| } |
|
|
| st_multi_print_summary(mf); |
|
|
| clock_t t_load = clock(); |
| printf(" Loaded in %.3f seconds\n\n", |
| (double)(t_load - t_start) / CLOCKS_PER_SEC); |
|
|
| |
| printf(" Phase 2: Detecting model architecture...\n"); |
|
|
| |
| char config_path[1024]; |
| snprintf(config_path, sizeof(config_path), "%sconfig.json", input_dir); |
| const char *config_ptr = NULL; |
| { |
| FILE *check = fopen(config_path, "rb"); |
| if (check) { |
| fclose(check); |
| config_ptr = config_path; |
| printf(" Found config.json: %s\n", config_path); |
| } |
| } |
|
|
| ModelArchitecture arch; |
| detect_architecture(mf, &arch, config_ptr); |
|
|
| |
| if (force_qwen) { |
| strcpy(arch.architecture, "qwen2"); |
| strcpy(arch.name, "Qwen3.6-HExState-Q2K"); |
| printf(" [--qwen] Forcing qwen2-compatible architecture\n"); |
| } |
|
|
| printf(" βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n"); |
| printf(" β Model Architecture β\n"); |
| printf(" β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£\n"); |
| printf(" β Architecture: %-40s β\n", arch.architecture); |
| printf(" β Layers: %-40u β\n", arch.block_count); |
| printf(" β Hidden size: %-40u β\n", arch.embedding_length); |
| printf(" β Attention heads: %-40u β\n", arch.head_count); |
| printf(" β KV heads: %-40u β\n", arch.head_count_kv); |
| printf(" β Vocab size: %-40u β\n", arch.vocab_size); |
| printf(" β FFN size: %-40u β\n", arch.feed_forward_length); |
| printf(" β Context length: %-40u β\n", arch.context_length); |
| printf(" β Has bias: %-40s β\n", arch.has_bias ? "yes" : "no"); |
| printf(" β Tied embeddings: %-40s β\n", arch.tie_word_embeddings ? "yes" : "no"); |
| printf(" βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n\n"); |
|
|
| |
| printf(" Phase 2b: Loading tokenizer...\n"); |
| TokenizerData *tokenizer = NULL; |
| { |
| char tok_json[512], tok_config[512]; |
| snprintf(tok_json, sizeof(tok_json), "%stokenizer.json", input_dir); |
| snprintf(tok_config, sizeof(tok_config), "%stokenizer_config.json", input_dir); |
|
|
| tokenizer = tok_load(tok_json, tok_config); |
| if (tokenizer) { |
| tok_print_summary(tokenizer); |
| } else { |
| printf(" No tokenizer found in '%s'\n", input_dir); |
| printf(" (Output GGUF will lack tokenizer data β not inference-ready)\n\n"); |
| } |
| } |
|
|
| |
| IMatrixData *imatrix = NULL; |
| if (imatrix_path) { |
| printf(" Phase 2c: Loading importance matrix...\n"); |
| imatrix = imatrix_load(imatrix_path); |
| if (imatrix) { |
| imatrix_print_summary(imatrix); |
| } else { |
| printf(" WARNING: Failed to load imatrix from '%s'\n", imatrix_path); |
| printf(" Proceeding without importance weighting.\n\n"); |
| } |
| } |
|
|
| |
| printf(" Phase 3: HPC-Optimized Q2_K Quantization + GGUF Output...\n"); |
| clock_t t_quant_start = clock(); |
|
|
| int result = write_gguf(output_path, mf, &arch, tokenizer, |
| opt_mode, imatrix, verbose); |
|
|
| clock_t t_end = clock(); |
| printf(" Total time: %.3f seconds\n\n", |
| (double)(t_end - t_start) / CLOCKS_PER_SEC); |
|
|
| if (imatrix) imatrix_free(imatrix); |
| if (tokenizer) tok_free(tokenizer); |
| st_multi_close(mf); |
| return result; |
| } |
| #endif |
|
|