yujiepan commited on
Commit
453391c
·
verified ·
1 Parent(s): 47b9f0a

Upload folder using huggingface_hub

Browse files
.meta.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "torch": "2.11.0+cu126",
3
+ "transformers": "5.7.0.dev0"
4
+ }
README.md ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ base_model:
4
+ - tencent/Hy3-preview
5
+ ---
6
+
7
+ This tiny model is intended for debugging. It is randomly initialized using the configuration adapted from [tencent/Hy3-preview](https://huggingface.co/tencent/Hy3-preview).
8
+
9
+ | File path | Size |
10
+ |------|------|
11
+ | model.safetensors | 5.4MB |
12
+
13
+
14
+ ### Example usage:
15
+
16
+ - vLLM
17
+
18
+ ```bash
19
+ # Multi-token prediction is supported
20
+ model_id=tiny-random/hy3
21
+ vllm serve $model_id \
22
+ --tensor-parallel-size 2 \
23
+ --speculative-config.method mtp \
24
+ --speculative-config.num_speculative_tokens 1 \
25
+ --tool-call-parser hy_v3 \
26
+ --reasoning-parser hy_v3 \
27
+ --enable-auto-tool-choice
28
+ ```
29
+
30
+ - SGLang
31
+
32
+ ```bash
33
+ # Multi-token prediction is supported
34
+ model_id=tiny-random/hy3
35
+ python3 -m sglang.launch_server \
36
+ --model $model_id \
37
+ --tp 2 \
38
+ --tool-call-parser hunyuan \
39
+ --reasoning-parser hunyuan \
40
+ --speculative-num-steps 1 \
41
+ --speculative-eagle-topk 1 \
42
+ --speculative-num-draft-tokens 2 \
43
+ --speculative-algorithm EAGLE
44
+ ```
45
+
46
+ - Transformers
47
+
48
+ ```python
49
+ from transformers import AutoModelForCausalLM, AutoTokenizer
50
+
51
+ model_id = "tiny-random/hy3"
52
+
53
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
54
+ model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", trust_remote_code=True)
55
+ messages = [
56
+ {"role": "user", "content": "Write a short poem about AI."},
57
+ ]
58
+ inputs = tokenizer.apply_chat_template(
59
+ messages,
60
+ tokenize=True,
61
+ return_tensors="pt",
62
+ add_generation_prompt=True,
63
+ reasoning_effort='high',
64
+ )
65
+ print(inputs)
66
+ outputs = model.generate(**inputs.to(model.device), max_new_tokens=32)
67
+ output_text = tokenizer.decode(outputs[0])
68
+ print(output_text)
69
+ ```
70
+
71
+ ### Code to create this repo:
72
+
73
+ <details>
74
+ <summary>Click to expand</summary>
75
+
76
+ ```python
77
+ import json
78
+ from copy import deepcopy
79
+ from pathlib import Path
80
+
81
+ import torch
82
+ import torch.nn as nn
83
+
84
+ from huggingface_hub import file_exists, hf_hub_download
85
+ from transformers import (
86
+ AutoConfig,
87
+ AutoModelForCausalLM,
88
+ AutoTokenizer,
89
+ GenerationConfig,
90
+ set_seed,
91
+ )
92
+
93
+ source_model_id = "tencent/Hy3-preview"
94
+ save_folder = "/tmp/tiny-random/hy3"
95
+
96
+ processor = AutoTokenizer.from_pretrained(source_model_id, trust_remote_code=True)
97
+ processor.save_pretrained(save_folder)
98
+
99
+ with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
100
+ config_json = json.load(f)
101
+ config_json.update({
102
+ 'expert_hidden_dim': 32,
103
+ 'moe_intermediate_size': 32,
104
+ 'head_dim': 32,
105
+ 'hidden_size': 8,
106
+ 'intermediate_size': 32,
107
+ 'num_attention_heads': 8,
108
+ 'num_hidden_layers': 4,
109
+ 'num_key_value_heads': 4,
110
+ })
111
+ with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
112
+ json.dump(config_json, f, indent=2)
113
+
114
+ config = AutoConfig.from_pretrained(
115
+ save_folder,
116
+ trust_remote_code=True,
117
+ )
118
+ print(config)
119
+ torch.set_default_dtype(torch.bfloat16)
120
+ set_seed(42)
121
+ model = AutoModelForCausalLM.from_config(config, trust_remote_code=True).eval().cpu()
122
+ if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
123
+ model.generation_config = GenerationConfig.from_pretrained(
124
+ source_model_id, trust_remote_code=True,
125
+ )
126
+ model.generation_config.top_k = 40  # original value in source model is -1, which is invalid
127
+
128
+ # mtp
129
+ mtp = deepcopy(model.model.layers[-1])
130
+ mtp.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False)
131
+ mtp.enorm = nn.RMSNorm(config.hidden_size)
132
+ mtp.hnorm = nn.RMSNorm(config.hidden_size)
133
+ mtp.final_layernorm = nn.RMSNorm(config.hidden_size)
134
+ model.model.layers.append(mtp)
135
+
136
+ # init weights
137
+ set_seed(42)
138
+ model = model.cpu().eval()
139
+ n_params = sum(p.numel() for p in model.parameters())
140
+ with torch.no_grad():
141
+ for name, p in sorted(model.named_parameters()):
142
+ torch.nn.init.normal_(p, 0, 0.2)
143
+ print(name, p.shape, p.dtype, f'{p.numel() / n_params * 100: .2f}%')
144
+
145
+ # expert bias is in float32
146
+ for i in range(config.first_k_dense_replace, config.num_hidden_layers + 1, 1):
147
+ model.model.layers[i].mlp.e_score_correction_bias = nn.Parameter(torch.randn_like(
148
+ model.model.layers[i].mlp.e_score_correction_bias
149
+ ).float() * 0.002)
150
+
151
+ model.save_pretrained(save_folder)
152
+ print(model)
153
+ torch.set_default_dtype(torch.float32)
154
+ ```
155
+
156
+ </details>
157
+
158
+ ### Printing the model:
159
+
160
+ <details><summary>Click to expand</summary>
161
+
162
+ ```text
163
+ HYV3ForCausalLM(
164
+ (model): HYV3Model(
165
+ (embed_tokens): Embedding(120832, 8, padding_idx=120002)
166
+ (layers): ModuleList(
167
+ (0): HYV3DecoderLayer(
168
+ (self_attn): HYV3Attention(
169
+ (q_proj): Linear(in_features=8, out_features=256, bias=False)
170
+ (k_proj): Linear(in_features=8, out_features=128, bias=False)
171
+ (v_proj): Linear(in_features=8, out_features=128, bias=False)
172
+ (o_proj): Linear(in_features=256, out_features=8, bias=False)
173
+ (q_norm): HYV3RMSNorm((32,), eps=1e-05)
174
+ (k_norm): HYV3RMSNorm((32,), eps=1e-05)
175
+ )
176
+ (mlp): HYV3MLP(
177
+ (gate_proj): Linear(in_features=8, out_features=32, bias=False)
178
+ (up_proj): Linear(in_features=8, out_features=32, bias=False)
179
+ (down_proj): Linear(in_features=32, out_features=8, bias=False)
180
+ (act_fn): SiLUActivation()
181
+ )
182
+ (input_layernorm): HYV3RMSNorm((8,), eps=1e-05)
183
+ (post_attention_layernorm): HYV3RMSNorm((8,), eps=1e-05)
184
+ )
185
+ (1-3): 3 x HYV3DecoderLayer(
186
+ (self_attn): HYV3Attention(
187
+ (q_proj): Linear(in_features=8, out_features=256, bias=False)
188
+ (k_proj): Linear(in_features=8, out_features=128, bias=False)
189
+ (v_proj): Linear(in_features=8, out_features=128, bias=False)
190
+ (o_proj): Linear(in_features=256, out_features=8, bias=False)
191
+ (q_norm): HYV3RMSNorm((32,), eps=1e-05)
192
+ (k_norm): HYV3RMSNorm((32,), eps=1e-05)
193
+ )
194
+ (mlp): HYV3MoE(
195
+ (gate): HYV3TopKRouter()
196
+ (experts): HYV3Experts(
197
+ (act_fn): SiLUActivation()
198
+ )
199
+ (shared_experts): HYV3MLP(
200
+ (gate_proj): Linear(in_features=8, out_features=32, bias=False)
201
+ (up_proj): Linear(in_features=8, out_features=32, bias=False)
202
+ (down_proj): Linear(in_features=32, out_features=8, bias=False)
203
+ (act_fn): SiLUActivation()
204
+ )
205
+ )
206
+ (input_layernorm): HYV3RMSNorm((8,), eps=1e-05)
207
+ (post_attention_layernorm): HYV3RMSNorm((8,), eps=1e-05)
208
+ )
209
+ (4): HYV3DecoderLayer(
210
+ (self_attn): HYV3Attention(
211
+ (q_proj): Linear(in_features=8, out_features=256, bias=False)
212
+ (k_proj): Linear(in_features=8, out_features=128, bias=False)
213
+ (v_proj): Linear(in_features=8, out_features=128, bias=False)
214
+ (o_proj): Linear(in_features=256, out_features=8, bias=False)
215
+ (q_norm): HYV3RMSNorm((32,), eps=1e-05)
216
+ (k_norm): HYV3RMSNorm((32,), eps=1e-05)
217
+ )
218
+ (mlp): HYV3MoE(
219
+ (gate): HYV3TopKRouter()
220
+ (experts): HYV3Experts(
221
+ (act_fn): SiLUActivation()
222
+ )
223
+ (shared_experts): HYV3MLP(
224
+ (gate_proj): Linear(in_features=8, out_features=32, bias=False)
225
+ (up_proj): Linear(in_features=8, out_features=32, bias=False)
226
+ (down_proj): Linear(in_features=32, out_features=8, bias=False)
227
+ (act_fn): SiLUActivation()
228
+ )
229
+ )
230
+ (input_layernorm): HYV3RMSNorm((8,), eps=1e-05)
231
+ (post_attention_layernorm): HYV3RMSNorm((8,), eps=1e-05)
232
+ (eh_proj): Linear(in_features=16, out_features=8, bias=False)
233
+ (enorm): RMSNorm((8,), eps=None, elementwise_affine=True)
234
+ (hnorm): RMSNorm((8,), eps=None, elementwise_affine=True)
235
+ (final_layernorm): RMSNorm((8,), eps=None, elementwise_affine=True)
236
+ )
237
+ )
238
+ (norm): HYV3RMSNorm((8,), eps=1e-05)
239
+ (rotary_emb): HYV3RotaryEmbedding()
240
+ )
241
+ (lm_head): Linear(in_features=8, out_features=120832, bias=False)
242
+ )
243
+ ```
244
+
245
+ </details>
246
+
247
+ ### Test environment:
248
+
249
+ - torch: 2.11.0+cu126
250
+ - transformers: 5.7.0.dev0
chat_template.jinja ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {#- ---------- special token variables ---------- -#}
2
+ {%- set bos_token = '<|hy_begin▁of▁sentence|>' %}
3
+ {%- set pad_token = '<|hy_▁pad▁|>' %}
4
+ {%- set user_token = '<|hy_User|>' %}
5
+ {%- set assistant_token = '<|hy_Assistant|>' %}
6
+ {%- set eos_token = '<|hy_eos|>' %}
7
+ {%- set think_begin_token = '<think>' %}
8
+ {%- set think_end_token = '</think>' %}
9
+ {%- set toolcalls_begin_token = '<tool_calls>' %}
10
+ {%- set toolcalls_end_token = '</tool_calls>' %}
11
+ {%- set toolcall_begin_token = '<tool_call>' %}
12
+ {%- set toolcall_end_token = '</tool_call>' %}
13
+ {%- set toolsep_token = '<tool_sep>' %}
14
+ {%- set argkey_begin_token = '<arg_key>' %}
15
+ {%- set argkey_end_token = '</arg_key>' %}
16
+ {%- set argvalue_begin_token = '<arg_value>' %}
17
+ {%- set argvalue_end_token = '</arg_value>' %}
18
+ {%- set toolresponses_begin_token = '<tool_responses>' %}
19
+ {%- set toolresponses_end_token = '</tool_responses>' %}
20
+ {%- set toolresponse_begin_token = '<tool_response>' %}
21
+ {%- set toolresponse_end_token = '</tool_response>' %}
22
+ {%- set reasoning_mode_token = '<|reasoning_mode|>' %}
23
+ {#- ---------- hyperparameters variables ---------- -#}
24
+ {%- if not add_generation_prompt is defined %}
25
+ {%- set add_generation_prompt = false %}
26
+ {%- endif %}
27
+ {%- if not interleaved_thinking is defined %}
28
+ {%- set interleaved_thinking = false %}
29
+ {%- endif %}
30
+ {%- if not tools %}
31
+ {%- set interleaved_thinking = false %}
32
+ {%- endif %}
33
+ {%- if not is_training is defined %}
34
+ {%- set is_training = false %}
35
+ {%- endif %}
36
+ {%- if not reasoning_effort is defined or reasoning_effort not in ['high', 'low', 'no_think'] %}
37
+ {%- set reasoning_effort = 'no_think' %}
38
+ {%- endif %}
39
+
40
+ {%- macro visible_text(content) -%}
41
+ {%- if content is string -%}
42
+ {{- content }}
43
+ {%- elif content is iterable and content is not mapping -%}
44
+ {%- for item in content -%}
45
+ {%- if item is mapping and item.type == 'text' -%}
46
+ {{- item.text }}
47
+ {%- elif item is string -%}
48
+ {{- item }}
49
+ {%- endif -%}
50
+ {%- endfor -%}
51
+ {%- elif content is none -%}
52
+ {{- '' }}
53
+ {%- else -%}
54
+ {{- content }}
55
+ {%- endif -%}
56
+ {%- endmacro -%}
57
+
58
+ {%- set ns = namespace(last_user_index=-1) %}
59
+ {%- set sp_ns = namespace(system_prompt='', is_first_sp=true) %}
60
+ {%- for message in messages %}
61
+ {%- if message['role'] == 'system' %}
62
+ {%- set sp_ns.system_prompt = sp_ns.system_prompt + visible_text(message['content']) %}
63
+ {%- endif %}
64
+ {%- if message['role'] == 'user' %}
65
+ {%- set ns.last_user_index = loop.index0 %}
66
+ {%- endif %}
67
+ {%- endfor %}
68
+ {%- if reasoning_effort is defined and reasoning_effort is string and reasoning_effort != '' and not tools %}
69
+ {%- set sp_ns.system_prompt = sp_ns.system_prompt + reasoning_mode_token + 'reasoning_effort:' + reasoning_effort %}
70
+ {%- endif %}
71
+ {{- bos_token }}
72
+ {{- sp_ns.system_prompt }}
73
+ {%- if tools %}
74
+ {%- if sp_ns.system_prompt != '' %}
75
+ {{- '\n\n# Tools\n\nYou may call one or more functions to assist with the user query.' }}
76
+ {%- else %}
77
+ {{- '# Tools\n\nYou may call one or more functions to assist with the user query.' }}
78
+ {%- endif %}
79
+ {{- '\n\nYou are provided with function signatures within <tools></tools> XML tags:' }}
80
+ {{- '\n<tools>\n' }}
81
+ {%- for tool in tools %}
82
+ {%- if loop.index0 > 0 %}
83
+ {{- '\n' }}
84
+ {%- endif %}
85
+ {{- tool | tojson }}
86
+ {%- endfor %}
87
+ {{- '\n</tools>\n\n' }}
88
+ {{- 'For function call returns, you should first print ' + toolcalls_begin_token + '\n' }}
89
+ {{- 'For each function call, you should return object like:\n' }}
90
+ {{- toolcall_begin_token + '{function-name}' + toolsep_token + '\n' }}
91
+ {{- argkey_begin_token + '{arg-key-1}' + argkey_end_token + '\n' }}
92
+ {{- argvalue_begin_token + '{arg-value-1}' + argvalue_end_token + '\n' }}
93
+ {{- argkey_begin_token + '{arg-key-2}' + argkey_end_token + '\n' }}
94
+ {{- argvalue_begin_token + '{arg-value-2}' + argvalue_end_token + '\n' }}
95
+ {{- '...\n' }}
96
+ {{- toolcall_end_token + '\n' }}
97
+ {%- if reasoning_effort is defined and reasoning_effort is string and reasoning_effort != '' %}
98
+ {{- 'At the end of function call returns, you should print ' + toolcalls_end_token + reasoning_mode_token + 'reasoning_effort:' + reasoning_effort }}
99
+ {%- else %}
100
+ {{- 'At the end of function call returns, you should print ' + toolcalls_end_token }}
101
+ {%- endif %}
102
+ {%- endif %}
103
+
104
+ {%- set prev_ns = namespace(is_tool=false, is_tool_first=true) %}
105
+ {%- set last_ns = namespace(last_is_assistant=false) %}
106
+ {%- for message in messages %}
107
+ {%- if message['role'] == 'user' %}
108
+ {%- if prev_ns.is_tool %}
109
+ {{- toolresponses_end_token }}
110
+ {%- endif %}
111
+ {{- user_token + visible_text(message['content']) }}
112
+ {%- set prev_ns.is_tool = false %}
113
+ {%- endif %}
114
+ {%- if message['role'] == 'assistant' %}
115
+ {%- if 'reasoning_content' in message and message['reasoning_content'] is string %}
116
+ {%- set rc = message['reasoning_content'] %}
117
+ {%- elif 'reasoning' in message and message['reasoning'] is string %}
118
+ {%- set rc = message['reasoning'] %}
119
+ {%- else %}
120
+ {%- set rc = none %}
121
+ {%- endif %}
122
+ {%- if is_training %}
123
+ {%- if rc is not none %}
124
+ {%- set content = think_begin_token + rc + think_end_token + visible_text(message['content']) %}
125
+ {%- else %}
126
+ {%- set content = think_begin_token + think_end_token + visible_text(message['content']) %}
127
+ {%- endif %}
128
+ {%- else %}
129
+ {%- if interleaved_thinking %}
130
+ {%- if loop.index0 > ns.last_user_index and rc is not none %}
131
+ {%- set content = think_begin_token + rc + think_end_token + visible_text(message['content']) %}
132
+ {%- else %}
133
+ {%- set content = think_begin_token + think_end_token + visible_text(message['content']) %}
134
+ {%- endif %}
135
+ {%- else %}
136
+ {%- set content = think_begin_token + think_end_token + visible_text(message['content']) %}
137
+ {%- endif %}
138
+ {%- endif %}
139
+ {%- if prev_ns.is_tool %}
140
+ {{- toolresponses_end_token }}
141
+ {%- endif %}
142
+ {{- assistant_token }}
143
+ {%- if message['tool_calls'] is defined and message['tool_calls'] %}
144
+ {%- set prev_ns.is_tool_first = true %}
145
+ {{- content }}
146
+ {{- toolcalls_begin_token + '\n' }}
147
+ {%- for tool in message['tool_calls'] %}
148
+ {%- set arguments = tool['function']['arguments'] %}
149
+ {{- toolcall_begin_token + tool['function']['name'] + toolsep_token + '\n' }}
150
+ {%- for key, value in arguments.items() %}
151
+ {{- argkey_begin_token + key + argkey_end_token + '\n' }}
152
+ {%- if value is not string %}
153
+ {%- set value = value | tojson(ensure_ascii=False) %}
154
+ {%- endif %}
155
+ {{- argvalue_begin_token + value + argvalue_end_token + '\n' }}
156
+ {%- endfor %}
157
+ {{- toolcall_end_token + '\n' }}
158
+ {%- endfor %}
159
+ {{- toolcalls_end_token + eos_token }}
160
+ {%- else %}
161
+ {%- if not loop.last or is_training %}
162
+ {{- content + eos_token }}
163
+ {%- else %}
164
+ {{- content }}
165
+ {%- endif %}
166
+ {%- endif %}
167
+ {%- set prev_ns.is_tool = false %}
168
+ {%- endif %}
169
+ {%- if message['role'] == 'tool' %}
170
+ {%- set prev_ns.is_tool = true %}
171
+ {%- if prev_ns.is_tool_first %}
172
+ {{- toolresponses_begin_token + '\n' }}
173
+ {%- set prev_ns.is_tool_first = false %}
174
+ {%- endif %}
175
+ {{- toolresponse_begin_token + '\n' + visible_text(message['content']) + '\n' + toolresponse_end_token + '\n' }}
176
+ {%- endif %}
177
+ {%- if loop.last and message['role'] == 'assistant' %}
178
+ {%- set last_ns.last_is_assistant = true %}
179
+ {%- endif %}
180
+
181
+ {%- endfor %}
182
+ {%- if prev_ns.is_tool %}
183
+ {{- toolresponses_end_token }}
184
+ {%- endif %}
185
+ {%- if add_generation_prompt %}
186
+ {%- if not last_ns.last_is_assistant %}
187
+ {%- if reasoning_effort is defined and reasoning_effort in ['low', 'high'] %}
188
+ {{- assistant_token + think_begin_token }}
189
+ {%- elif reasoning_effort is defined and reasoning_effort == 'no_think' %}
190
+ {{- assistant_token + think_begin_token + think_end_token }}
191
+ {%- else %}
192
+ {{- assistant_token }}
193
+ {%- endif %}
194
+ {%- endif %}
195
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "HYV3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 120000,
8
+ "dtype": "bfloat16",
9
+ "enable_attention_fp32_softmax": false,
10
+ "enable_lm_head_fp32": true,
11
+ "enable_moe_fp32_combine": false,
12
+ "eod_token_id": 120026,
13
+ "eos_token_id": 120025,
14
+ "expert_hidden_dim": 32,
15
+ "first_k_dense_replace": 1,
16
+ "head_dim": 32,
17
+ "hidden_act": "silu",
18
+ "hidden_size": 8,
19
+ "initializer_range": 0.006,
20
+ "intermediate_size": 32,
21
+ "max_position_embeddings": 262144,
22
+ "mlp_bias": false,
23
+ "mlp_layer_types": [
24
+ "dense",
25
+ "sparse",
26
+ "sparse",
27
+ "sparse"
28
+ ],
29
+ "model_type": "hy_v3",
30
+ "moe_intermediate_size": 32,
31
+ "moe_router_enable_expert_bias": true,
32
+ "moe_router_use_sigmoid": true,
33
+ "num_attention_heads": 8,
34
+ "num_experts": 192,
35
+ "num_experts_per_tok": 8,
36
+ "num_hidden_layers": 4,
37
+ "num_key_value_heads": 4,
38
+ "num_nextn_predict_layers": 1,
39
+ "num_shared_experts": 1,
40
+ "output_router_logits": true,
41
+ "pad_token_id": 120002,
42
+ "qk_norm": true,
43
+ "rms_norm_eps": 1e-05,
44
+ "rope_parameters": {
45
+ "rope_theta": 11158840.0,
46
+ "rope_type": "default"
47
+ },
48
+ "route_norm": true,
49
+ "router_scaling_factor": 2.826,
50
+ "sep_token_id": 120007,
51
+ "tie_word_embeddings": false,
52
+ "transformers_version": "5.7.0.dev0",
53
+ "use_cache": true,
54
+ "use_grouped_mm": false,
55
+ "vocab_size": 120832
56
+ }
generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 120000,
3
+ "do_sample": true,
4
+ "eos_token_id": 120025,
5
+ "pad_token_id": 120002,
6
+ "temperature": 0.9,
7
+ "top_k": 40,
8
+ "top_p": 1,
9
+ "transformers_version": "5.7.0.dev0",
10
+ "trust_remote_code": true
11
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d14912ca0a2c0a4487ff365b54caf64ef7885fc06059c225bc0668e7013055f8
3
+ size 5401352
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|hy_begin▁of▁sentence|>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<|hy_eos|>",
6
+ "is_local": false,
7
+ "local_files_only": false,
8
+ "model_max_length": 1000000000000000019884624838656,
9
+ "pad_token": "<|hy_▁pad▁|>",
10
+ "tokenizer_class": "TokenizersBackend"
11
+ }