yujiepan committed on
Commit
5a1b2b2
·
verified ·
1 Parent(s): 75fa6cf

Upload folder using huggingface_hub

Browse files
.meta.json CHANGED
@@ -1,4 +1,4 @@
1
  {
2
- "torch": "2.11.0",
3
- "transformers": "5.5.0"
4
  }
 
1
  {
2
+ "torch": "2.12.0",
3
+ "transformers": "5.9.0"
4
  }
README.md CHANGED
@@ -193,16 +193,62 @@ model.save_pretrained(save_folder)
193
  ```text
194
  Gemma4ForConditionalGeneration(
195
  (model): Gemma4Model(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  (language_model): Gemma4TextModel(
197
  (embed_tokens): Gemma4TextScaledWordEmbedding(262144, 8, padding_idx=0)
198
  (layers): ModuleList(
199
  (0): Gemma4TextDecoderLayer(
200
  (self_attn): Gemma4TextAttention(
 
201
  (q_norm): Gemma4RMSNorm()
202
  (k_norm): Gemma4RMSNorm()
203
  (v_norm): Gemma4RMSNorm()
204
  (k_proj): Linear(in_features=8, out_features=128, bias=False)
205
- (q_proj): Linear(in_features=8, out_features=256, bias=False)
206
  (v_proj): Linear(in_features=8, out_features=128, bias=False)
207
  (o_proj): Linear(in_features=256, out_features=8, bias=False)
208
  )
@@ -223,11 +269,11 @@ Gemma4ForConditionalGeneration(
223
  )
224
  (1): Gemma4TextDecoderLayer(
225
  (self_attn): Gemma4TextAttention(
 
226
  (q_norm): Gemma4RMSNorm()
227
  (k_norm): Gemma4RMSNorm()
228
  (v_norm): Gemma4RMSNorm()
229
  (k_proj): Linear(in_features=8, out_features=256, bias=False)
230
- (q_proj): Linear(in_features=8, out_features=512, bias=False)
231
  (v_proj): Linear(in_features=8, out_features=256, bias=False)
232
  (o_proj): Linear(in_features=512, out_features=8, bias=False)
233
  )
@@ -248,12 +294,8 @@ Gemma4ForConditionalGeneration(
248
  )
249
  (2): Gemma4TextDecoderLayer(
250
  (self_attn): Gemma4TextAttention(
251
- (q_norm): Gemma4RMSNorm()
252
- (k_norm): Gemma4RMSNorm()
253
- (v_norm): Gemma4RMSNorm()
254
- (k_proj): Linear(in_features=8, out_features=128, bias=False)
255
  (q_proj): Linear(in_features=8, out_features=256, bias=False)
256
- (v_proj): Linear(in_features=8, out_features=128, bias=False)
257
  (o_proj): Linear(in_features=256, out_features=8, bias=False)
258
  )
259
  (mlp): Gemma4TextMLP(
@@ -273,12 +315,8 @@ Gemma4ForConditionalGeneration(
273
  )
274
  (3): Gemma4TextDecoderLayer(
275
  (self_attn): Gemma4TextAttention(
276
- (q_norm): Gemma4RMSNorm()
277
- (k_norm): Gemma4RMSNorm()
278
- (v_norm): Gemma4RMSNorm()
279
- (k_proj): Linear(in_features=8, out_features=256, bias=False)
280
  (q_proj): Linear(in_features=8, out_features=512, bias=False)
281
- (v_proj): Linear(in_features=8, out_features=256, bias=False)
282
  (o_proj): Linear(in_features=512, out_features=8, bias=False)
283
  )
284
  (mlp): Gemma4TextMLP(
@@ -303,66 +341,16 @@ Gemma4ForConditionalGeneration(
303
  (per_layer_model_projection): Linear(in_features=8, out_features=8, bias=False)
304
  (per_layer_projection_norm): Gemma4RMSNorm()
305
  )
306
- (vision_tower): Gemma4VisionModel(
307
- (patch_embedder): Gemma4VisionPatchEmbedder(
308
- (input_proj): Linear(in_features=768, out_features=8, bias=False)
309
- )
310
- (encoder): Gemma4VisionEncoder(
311
- (rotary_emb): Gemma4VisionRotaryEmbedding()
312
- (layers): ModuleList(
313
- (0-1): 2 x Gemma4VisionEncoderLayer(
314
- (self_attn): Gemma4VisionAttention(
315
- (q_proj): Gemma4ClippableLinear(
316
- (linear): Linear(in_features=8, out_features=128, bias=False)
317
- )
318
- (k_proj): Gemma4ClippableLinear(
319
- (linear): Linear(in_features=8, out_features=128, bias=False)
320
- )
321
- (v_proj): Gemma4ClippableLinear(
322
- (linear): Linear(in_features=8, out_features=128, bias=False)
323
- )
324
- (o_proj): Gemma4ClippableLinear(
325
- (linear): Linear(in_features=128, out_features=8, bias=False)
326
- )
327
- (q_norm): Gemma4RMSNorm()
328
- (k_norm): Gemma4RMSNorm()
329
- (v_norm): Gemma4RMSNorm()
330
- )
331
- (mlp): Gemma4VisionMLP(
332
- (gate_proj): Gemma4ClippableLinear(
333
- (linear): Linear(in_features=8, out_features=64, bias=False)
334
- )
335
- (up_proj): Gemma4ClippableLinear(
336
- (linear): Linear(in_features=8, out_features=64, bias=False)
337
- )
338
- (down_proj): Gemma4ClippableLinear(
339
- (linear): Linear(in_features=64, out_features=8, bias=False)
340
- )
341
- (act_fn): GELUTanh()
342
- )
343
- (input_layernorm): Gemma4RMSNorm()
344
- (post_attention_layernorm): Gemma4RMSNorm()
345
- (pre_feedforward_layernorm): Gemma4RMSNorm()
346
- (post_feedforward_layernorm): Gemma4RMSNorm()
347
- )
348
- )
349
- )
350
- (pooler): Gemma4VisionPooler()
351
- )
352
- (embed_vision): Gemma4MultimodalEmbedder(
353
- (embedding_projection): Linear(in_features=8, out_features=8, bias=False)
354
- (embedding_pre_projection_norm): Gemma4RMSNorm()
355
- )
356
  (audio_tower): Gemma4AudioModel(
357
  (subsample_conv_projection): Gemma4AudioSubSampleConvProjection(
358
  (layer0): Gemma4AudioSubSampleConvProjectionLayer(
359
  (conv): Conv2d(1, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
360
- (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
361
  (act): ReLU()
362
  )
363
  (layer1): Gemma4AudioSubSampleConvProjectionLayer(
364
  (conv): Conv2d(128, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
365
- (norm): LayerNorm((32,), eps=1e-06, elementwise_affine=True)
366
  (act): ReLU()
367
  )
368
  (input_proj_linear): Linear(in_features=1024, out_features=64, bias=False)
@@ -426,6 +414,10 @@ Gemma4ForConditionalGeneration(
426
  )
427
  (output_proj): Linear(in_features=64, out_features=32, bias=True)
428
  )
 
 
 
 
429
  (embed_audio): Gemma4MultimodalEmbedder(
430
  (embedding_projection): Linear(in_features=32, out_features=8, bias=False)
431
  (embedding_pre_projection_norm): Gemma4RMSNorm()
@@ -439,5 +431,5 @@ Gemma4ForConditionalGeneration(
439
 
440
  ### Test environment:
441
 
442
- - torch: 2.11.0
443
- - transformers: 5.5.0
 
193
  ```text
194
  Gemma4ForConditionalGeneration(
195
  (model): Gemma4Model(
196
+ (vision_tower): Gemma4VisionModel(
197
+ (patch_embedder): Gemma4VisionPatchEmbedder(
198
+ (input_proj): Linear(in_features=768, out_features=8, bias=False)
199
+ )
200
+ (encoder): Gemma4VisionEncoder(
201
+ (rotary_emb): Gemma4VisionRotaryEmbedding()
202
+ (layers): ModuleList(
203
+ (0-1): 2 x Gemma4VisionEncoderLayer(
204
+ (self_attn): Gemma4VisionAttention(
205
+ (q_proj): Gemma4ClippableLinear(
206
+ (linear): Linear(in_features=8, out_features=128, bias=False)
207
+ )
208
+ (k_proj): Gemma4ClippableLinear(
209
+ (linear): Linear(in_features=8, out_features=128, bias=False)
210
+ )
211
+ (v_proj): Gemma4ClippableLinear(
212
+ (linear): Linear(in_features=8, out_features=128, bias=False)
213
+ )
214
+ (o_proj): Gemma4ClippableLinear(
215
+ (linear): Linear(in_features=128, out_features=8, bias=False)
216
+ )
217
+ (q_norm): Gemma4RMSNorm()
218
+ (k_norm): Gemma4RMSNorm()
219
+ (v_norm): Gemma4RMSNorm()
220
+ )
221
+ (mlp): Gemma4VisionMLP(
222
+ (gate_proj): Gemma4ClippableLinear(
223
+ (linear): Linear(in_features=8, out_features=64, bias=False)
224
+ )
225
+ (up_proj): Gemma4ClippableLinear(
226
+ (linear): Linear(in_features=8, out_features=64, bias=False)
227
+ )
228
+ (down_proj): Gemma4ClippableLinear(
229
+ (linear): Linear(in_features=64, out_features=8, bias=False)
230
+ )
231
+ (act_fn): GELUTanh()
232
+ )
233
+ (input_layernorm): Gemma4RMSNorm()
234
+ (post_attention_layernorm): Gemma4RMSNorm()
235
+ (pre_feedforward_layernorm): Gemma4RMSNorm()
236
+ (post_feedforward_layernorm): Gemma4RMSNorm()
237
+ )
238
+ )
239
+ )
240
+ (pooler): Gemma4VisionPooler()
241
+ )
242
  (language_model): Gemma4TextModel(
243
  (embed_tokens): Gemma4TextScaledWordEmbedding(262144, 8, padding_idx=0)
244
  (layers): ModuleList(
245
  (0): Gemma4TextDecoderLayer(
246
  (self_attn): Gemma4TextAttention(
247
+ (q_proj): Linear(in_features=8, out_features=256, bias=False)
248
  (q_norm): Gemma4RMSNorm()
249
  (k_norm): Gemma4RMSNorm()
250
  (v_norm): Gemma4RMSNorm()
251
  (k_proj): Linear(in_features=8, out_features=128, bias=False)
 
252
  (v_proj): Linear(in_features=8, out_features=128, bias=False)
253
  (o_proj): Linear(in_features=256, out_features=8, bias=False)
254
  )
 
269
  )
270
  (1): Gemma4TextDecoderLayer(
271
  (self_attn): Gemma4TextAttention(
272
+ (q_proj): Linear(in_features=8, out_features=512, bias=False)
273
  (q_norm): Gemma4RMSNorm()
274
  (k_norm): Gemma4RMSNorm()
275
  (v_norm): Gemma4RMSNorm()
276
  (k_proj): Linear(in_features=8, out_features=256, bias=False)
 
277
  (v_proj): Linear(in_features=8, out_features=256, bias=False)
278
  (o_proj): Linear(in_features=512, out_features=8, bias=False)
279
  )
 
294
  )
295
  (2): Gemma4TextDecoderLayer(
296
  (self_attn): Gemma4TextAttention(
 
 
 
 
297
  (q_proj): Linear(in_features=8, out_features=256, bias=False)
298
+ (q_norm): Gemma4RMSNorm()
299
  (o_proj): Linear(in_features=256, out_features=8, bias=False)
300
  )
301
  (mlp): Gemma4TextMLP(
 
315
  )
316
  (3): Gemma4TextDecoderLayer(
317
  (self_attn): Gemma4TextAttention(
 
 
 
 
318
  (q_proj): Linear(in_features=8, out_features=512, bias=False)
319
+ (q_norm): Gemma4RMSNorm()
320
  (o_proj): Linear(in_features=512, out_features=8, bias=False)
321
  )
322
  (mlp): Gemma4TextMLP(
 
341
  (per_layer_model_projection): Linear(in_features=8, out_features=8, bias=False)
342
  (per_layer_projection_norm): Gemma4RMSNorm()
343
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
  (audio_tower): Gemma4AudioModel(
345
  (subsample_conv_projection): Gemma4AudioSubSampleConvProjection(
346
  (layer0): Gemma4AudioSubSampleConvProjectionLayer(
347
  (conv): Conv2d(1, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
348
+ (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True, bias=False)
349
  (act): ReLU()
350
  )
351
  (layer1): Gemma4AudioSubSampleConvProjectionLayer(
352
  (conv): Conv2d(128, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
353
+ (norm): LayerNorm((32,), eps=1e-06, elementwise_affine=True, bias=False)
354
  (act): ReLU()
355
  )
356
  (input_proj_linear): Linear(in_features=1024, out_features=64, bias=False)
 
414
  )
415
  (output_proj): Linear(in_features=64, out_features=32, bias=True)
416
  )
417
+ (embed_vision): Gemma4MultimodalEmbedder(
418
+ (embedding_projection): Linear(in_features=8, out_features=8, bias=False)
419
+ (embedding_pre_projection_norm): Gemma4RMSNorm()
420
+ )
421
  (embed_audio): Gemma4MultimodalEmbedder(
422
  (embedding_projection): Linear(in_features=32, out_features=8, bias=False)
423
  (embedding_pre_projection_norm): Gemma4RMSNorm()
 
431
 
432
  ### Test environment:
433
 
434
+ - torch: 2.12.0
435
+ - transformers: 5.9.0
chat_template.jinja CHANGED
@@ -1,9 +1,9 @@
1
- {%- macro format_parameters(properties, required) -%}
2
  {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
3
  {%- set ns = namespace(found_first=false) -%}
4
  {%- for key, value in properties | dictsort -%}
5
  {%- set add_comma = false -%}
6
- {%- if key not in standard_keys -%}
7
  {%- if ns.found_first %},{% endif -%}
8
  {%- set ns.found_first = true -%}
9
  {{ key }}:{
@@ -65,7 +65,7 @@
65
  {%- elif value is mapping -%}
66
  {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
67
  properties:{
68
- {{- format_parameters(value, value['required'] | default([])) -}}
69
  }
70
  {%- endif -%}
71
  {%- if value['required'] -%}
@@ -178,18 +178,21 @@
178
  {#- Handle System/Tool Definitions Block -#}
179
  {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
180
  {{- '<|turn>system\n' -}}
181
-
182
  {#- Inject Thinking token at the very top of the FIRST system turn -#}
183
  {%- if enable_thinking is defined and enable_thinking -%}
184
  {{- '<|think|>\n' -}}
185
  {%- set ns.prev_message_type = 'think' -%}
186
  {%- endif -%}
187
-
188
  {%- if messages[0]['role'] in ['system', 'developer'] -%}
189
- {{- messages[0]['content'] | trim -}}
 
 
 
 
 
 
190
  {%- set loop_messages = messages[1:] -%}
191
  {%- endif -%}
192
-
193
  {%- if tools -%}
194
  {%- for tool in tools %}
195
  {{- '<|tool>' -}}
@@ -198,7 +201,6 @@
198
  {%- endfor %}
199
  {%- set ns.prev_message_type = 'tool' -%}
200
  {%- endif -%}
201
-
202
  {{- '<turn|>\n' -}}
203
  {%- endif %}
204
 
@@ -293,6 +295,15 @@
293
  {%- endif -%}
294
  {%- endfor -%}
295
  {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
 
 
 
 
 
 
 
 
 
296
  {%- else -%}
297
  {{- format_tool_response_block(ns_tname.name, tool_body) -}}
298
  {%- endif -%}
@@ -302,6 +313,7 @@
302
  {%- endfor -%}
303
  {%- endif -%}
304
 
 
305
  {%- if message['content'] is string -%}
306
  {%- if role == 'model' -%}
307
  {{- strip_thinking(message['content']) -}}
@@ -328,10 +340,14 @@
328
  {%- endif -%}
329
  {%- endfor -%}
330
  {%- endif -%}
 
 
 
 
331
 
332
  {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
333
  {{- '<|tool_response>' -}}
334
- {%- elif not (ns_tr_out.flag and not message.get('content')) -%}
335
  {{- '<turn|>\n' -}}
336
  {%- endif -%}
337
  {%- endif -%}
 
1
+ {%- macro format_parameters(properties, required, filter_keys=false) -%}
2
  {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
3
  {%- set ns = namespace(found_first=false) -%}
4
  {%- for key, value in properties | dictsort -%}
5
  {%- set add_comma = false -%}
6
+ {%- if not filter_keys or key not in standard_keys -%}
7
  {%- if ns.found_first %},{% endif -%}
8
  {%- set ns.found_first = true -%}
9
  {{ key }}:{
 
65
  {%- elif value is mapping -%}
66
  {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
67
  properties:{
68
+ {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
69
  }
70
  {%- endif -%}
71
  {%- if value['required'] -%}
 
178
  {#- Handle System/Tool Definitions Block -#}
179
  {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
180
  {{- '<|turn>system\n' -}}
 
181
  {#- Inject Thinking token at the very top of the FIRST system turn -#}
182
  {%- if enable_thinking is defined and enable_thinking -%}
183
  {{- '<|think|>\n' -}}
184
  {%- set ns.prev_message_type = 'think' -%}
185
  {%- endif -%}
 
186
  {%- if messages[0]['role'] in ['system', 'developer'] -%}
187
+ {%- if messages[0]['content'] is string -%}
188
+ {{- messages[0]['content'] | trim -}}
189
+ {%- elif messages[0]['content'] is sequence -%}
190
+ {%- for item in messages[0]['content'] -%}
191
+ {{- item['text'] | trim + ' '-}}
192
+ {%- endfor -%}
193
+ {%- endif -%}
194
  {%- set loop_messages = messages[1:] -%}
195
  {%- endif -%}
 
196
  {%- if tools -%}
197
  {%- for tool in tools %}
198
  {{- '<|tool>' -}}
 
201
  {%- endfor %}
202
  {%- set ns.prev_message_type = 'tool' -%}
203
  {%- endif -%}
 
204
  {{- '<turn|>\n' -}}
205
  {%- endif %}
206
 
 
295
  {%- endif -%}
296
  {%- endfor -%}
297
  {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
298
+ {%- for part in tool_body -%}
299
+ {%- if part.get('type') == 'image' -%}
300
+ {{- '<|image|>' -}}
301
+ {%- elif part.get('type') == 'audio' -%}
302
+ {{- '<|audio|>' -}}
303
+ {%- elif part.get('type') == 'video' -%}
304
+ {{- '<|video|>' -}}
305
+ {%- endif -%}
306
+ {%- endfor -%}
307
  {%- else -%}
308
  {{- format_tool_response_block(ns_tname.name, tool_body) -}}
309
  {%- endif -%}
 
313
  {%- endfor -%}
314
  {%- endif -%}
315
 
316
+ {%- set captured_content -%}
317
  {%- if message['content'] is string -%}
318
  {%- if role == 'model' -%}
319
  {{- strip_thinking(message['content']) -}}
 
340
  {%- endif -%}
341
  {%- endfor -%}
342
  {%- endif -%}
343
+ {%- endset -%}
344
+
345
+ {{- captured_content -}}
346
+ {%- set has_content = captured_content | trim | length > 0 -%}
347
 
348
  {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
349
  {{- '<|tool_response>' -}}
350
+ {%- elif not (ns_tr_out.flag and not has_content) -%}
351
  {{- '<turn|>\n' -}}
352
  {%- endif -%}
353
  {%- endif -%}
config.json CHANGED
@@ -111,7 +111,7 @@
111
  "vocab_size_per_layer_input": 262144
112
  },
113
  "tie_word_embeddings": true,
114
- "transformers_version": "5.5.0",
115
  "video_token_id": 258884,
116
  "vision_config": {
117
  "_name_or_path": "",
 
111
  "vocab_size_per_layer_input": 262144
112
  },
113
  "tie_word_embeddings": true,
114
+ "transformers_version": "5.9.0",
115
  "video_token_id": 258884,
116
  "vision_config": {
117
  "_name_or_path": "",
generation_config.json CHANGED
@@ -10,6 +10,6 @@
10
  "temperature": 1.0,
11
  "top_k": 64,
12
  "top_p": 0.95,
13
- "transformers_version": "5.5.0",
14
  "trust_remote_code": true
15
  }
 
10
  "temperature": 1.0,
11
  "top_k": 64,
12
  "top_p": 0.95,
13
+ "transformers_version": "5.9.0",
14
  "trust_remote_code": true
15
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c2045c91d5ee53e379d6f3c4947b914c36ab4308f708e25858c2eb5bd694bc3
3
- size 9483948
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8582ee51fce3dac0a80d9a90008785011e2e24d9d26234822df7361d2092877b
3
+ size 9470740
tokenizer_config.json CHANGED
@@ -18,6 +18,7 @@
18
  ],
19
  "image_token": "<|image|>",
20
  "is_local": false,
 
21
  "mask_token": "<mask>",
22
  "model_max_length": 1000000000000000019884624838656,
23
  "model_specific_special_tokens": {
 
18
  ],
19
  "image_token": "<|image|>",
20
  "is_local": false,
21
+ "local_files_only": false,
22
  "mask_token": "<mask>",
23
  "model_max_length": 1000000000000000019884624838656,
24
  "model_specific_special_tokens": {