Zero-Shot Image Classification
Transformers
ONNX
Chinese
English
m2_encoder
feature-extraction
multimodal
image-text-retrieval
bilingual
chinese
english
vision-language
custom-code
custom_code
Eval Results (legacy)
Instructions to use malusama/M2-Encoder-1B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use malusama/M2-Encoder-1B with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("zero-shot-image-classification", model="malusama/M2-Encoder-1B", trust_remote_code=True) pipe( "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/parrots.png", candidate_labels=["animals", "humans", "landscape"], )# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("malusama/M2-Encoder-1B", trust_remote_code=True, dtype="auto") - Notebooks
- Google Colab
- Kaggle
| import json | |
| import os | |
| from typing import Any, Dict | |
| from transformers import PretrainedConfig | |
| class M2EncoderConfig(PretrainedConfig): | |
| model_type = "m2_encoder" | |
| def __init__( | |
| self, | |
| loss_names=None, | |
| beit_version="large", | |
| encoder_embed_dim=1024, | |
| out_embed_dim=1024, | |
| encoder_layers=21, | |
| beit3_vl_layers=3, | |
| image_size=224, | |
| visual_mask_size=14, | |
| tokenizer_type="GLMChineseTokenizer", | |
| tokenizer=".", | |
| vocab_size=115244, | |
| whole_word_masking=False, | |
| precision=32, | |
| test_only=True, | |
| flash_attn=False, | |
| model_file="m2_encoder_1B.safetensors", | |
| architectures=None, | |
| auto_map=None, | |
| **kwargs, | |
| ): | |
| super().__init__(**kwargs) | |
| self.loss_names = loss_names or {"itc": 1} | |
| self.beit_version = beit_version | |
| self.encoder_embed_dim = encoder_embed_dim | |
| self.out_embed_dim = out_embed_dim | |
| self.encoder_layers = encoder_layers | |
| self.beit3_vl_layers = beit3_vl_layers | |
| self.image_size = image_size | |
| self.visual_mask_size = visual_mask_size | |
| self.tokenizer_type = tokenizer_type | |
| self.tokenizer = tokenizer | |
| self.vocab_size = vocab_size | |
| self.whole_word_masking = whole_word_masking | |
| self.precision = precision | |
| self.test_only = test_only | |
| self.flash_attn = flash_attn | |
| self.model_file = model_file | |
| self.architectures = architectures or ["M2EncoderModel"] | |
| self.auto_map = auto_map or { | |
| "AutoConfig": "configuration_m2_encoder.M2EncoderConfig", | |
| "AutoModel": "modeling_m2_encoder.M2EncoderModel", | |
| "AutoProcessor": "processing_m2_encoder.M2EncoderProcessor", | |
| } | |
| def from_encoder_json(cls, config_path: str, **kwargs) -> "M2EncoderConfig": | |
| with open(config_path, "r", encoding="utf-8") as f: | |
| data = json.load(f) | |
| data.update(kwargs) | |
| return cls(**data) | |
| def to_vlmo_overrides(self, model_dir: str) -> Dict[str, Any]: | |
| return { | |
| "loss_names": self.loss_names, | |
| "beit_version": self.beit_version, | |
| "encoder_embed_dim": self.encoder_embed_dim, | |
| "out_embed_dim": self.out_embed_dim, | |
| "encoder_layers": self.encoder_layers, | |
| "beit3_vl_layers": self.beit3_vl_layers, | |
| "image_size": self.image_size, | |
| "visual_mask_size": self.visual_mask_size, | |
| "tokenizer_type": self.tokenizer_type, | |
| "tokenizer": self._resolve_tokenizer_dir(model_dir), | |
| "vocab_size": self.vocab_size, | |
| "whole_word_masking": self.whole_word_masking, | |
| "precision": self.precision, | |
| "test_only": self.test_only, | |
| "flash_attn": self.flash_attn, | |
| "load_path": os.path.join(model_dir, self.model_file), | |
| } | |
| def _resolve_tokenizer_dir(self, model_dir: str) -> str: | |
| if os.path.isabs(self.tokenizer): | |
| return self.tokenizer | |
| if self.tokenizer in (".", "./", ""): | |
| return model_dir | |
| return os.path.join(model_dir, self.tokenizer) | |