Feature Extraction
Transformers
Safetensors
moss-audio-tokenizer
audio
audio-tokenizer
neural-codec
moss-tts-family
MOSS Audio Tokenizer
speech-tokenizer
trust-remote-code
custom_code
Instructions to use OpenMOSS-Team/MOSS-Audio-Tokenizer with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use OpenMOSS-Team/MOSS-Audio-Tokenizer with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("feature-extraction", model="OpenMOSS-Team/MOSS-Audio-Tokenizer", trust_remote_code=True)

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("OpenMOSS-Team/MOSS-Audio-Tokenizer", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| # coding=utf-8 | |
| # Copyright 2026 OpenMOSS and the HuggingFace Inc. team. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """MossAudioTokenizer model configuration""" | |
| from typing import Any | |
| from transformers.configuration_utils import PreTrainedConfig | |
| from transformers.utils import logging | |
| # Module-level logger created through the `transformers.utils.logging` helper imported above. | |
| logger = logging.get_logger(__name__) | |
def _patched_pretransform(patch_size: int) -> dict[str, Any]:
    """Build the settings dict for one `PatchedPretransform` stage."""
    return {
        "module_type": "PatchedPretransform",
        "patch_size": patch_size,
    }


def _transformer_stage(
    input_dimension: int,
    output_dimension: int,
    *,
    d_model: int = 768,
    num_heads: int = 12,
    num_layers: int = 12,
    dim_feedforward: int = 3072,
) -> dict[str, Any]:
    """Build the settings dict for one `Transformer` stage.

    Only the dimensions and depth vary between the default stages; the
    remaining options (causality, norm, positional embedding, ...) are the
    same for every stage of the default encoder and decoder stacks.
    """
    return {
        "module_type": "Transformer",
        "input_dimension": input_dimension,
        "output_dimension": output_dimension,
        "d_model": d_model,
        "num_heads": num_heads,
        "num_layers": num_layers,
        "dim_feedforward": dim_feedforward,
        "causal": True,
        "norm": "layer_norm",
        "positional_embedding": "rope",
        "max_period": 10000,
        "gating": "none",
        "layer_scale": 0.01,
        "conv_layout": True,
    }


class MossAudioTokenizerConfig(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MossAudioTokenizerModel`]. It is used to instantiate a
    MossAudioTokenizer model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the
    [VoiceAgentGroup/moss_audio_tokenizer](https://huggingface.co/VoiceAgentGroup/moss_audio_tokenizer) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        version (`str`, *optional*):
            Accepted for checkpoint compatibility and stored on the config; not used by this class.
        sampling_rate (`int`, *optional*, defaults to 24000):
            The sampling rate at which the audio waveform should be digitized, expressed in hertz (Hz).
        downsample_rate (`int`, *optional*, defaults to 1920):
            Total downsampling rate from waveform to tokens.
        causal_transformer_context_duration (`float`, *optional*, defaults to 10.0):
            Context duration in seconds for causal transformer.
        encoder_kwargs (`list[dict]`, *optional*):
            List of encoder module configurations. Each dict specifies a module type and its parameters.
        decoder_kwargs (`list[dict]`, *optional*):
            List of decoder module configurations in execution order.
        quantizer_type (`str`, *optional*, defaults to `"rlfq"`):
            Quantizer type. Options include `"rvq"`, `"spec_rvq"`, `"rlfq"`, `"random_prefix_rlfq"`. A
            `"quantizer_type"` entry inside `quantizer_kwargs` takes precedence over this argument.
        quantizer_kwargs (`dict`, *optional*):
            Configuration for the quantizer including `input_dim`, `rvq_dim`, `output_dim`, `num_quantizers`,
            `codebook_size`, and `codebook_dim`. The dict is copied, so the caller's object is never modified.
        kwargs (*optional*):
            Remaining keyword arguments, forwarded to [`PreTrainedConfig`]. A `model_type` entry is discarded.

    Example:

    ```python
    >>> from transformers import MossAudioTokenizerModel, MossAudioTokenizerConfig

    >>> # Initializing a MossAudioTokenizer style configuration
    >>> configuration = MossAudioTokenizerConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = MossAudioTokenizerModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "moss-audio-tokenizer"
    # Backward-compatible alias used by some checkpoints.
    attribute_map = {"sample_rate": "sampling_rate"}

    sampling_rate: int
    downsample_rate: int
    causal_transformer_context_duration: float
    encoder_kwargs: list[dict[str, Any]]
    decoder_kwargs: list[dict[str, Any]]
    quantizer_type: str
    quantizer_kwargs: dict[str, Any]

    def __init__(
        self,
        version: str | None = None,
        sampling_rate: int = 24000,
        downsample_rate: int = 1920,
        causal_transformer_context_duration: float = 10.0,
        encoder_kwargs: list[dict[str, Any]] | None = None,
        decoder_kwargs: list[dict[str, Any]] | None = None,
        quantizer_type: str = "rlfq",
        quantizer_kwargs: dict[str, Any] | None = None,
        **kwargs,
    ):
        # Some checkpoints might include an incorrect/legacy `model_type` (e.g. "speech_tokenizer").
        # We drop it to avoid overriding the class-level `model_type`.
        kwargs.pop("model_type", None)

        # `version` is accepted for compatibility but not used in modeling.
        self.version = version
        self.sampling_rate = sampling_rate
        self.downsample_rate = downsample_rate
        self.causal_transformer_context_duration = causal_transformer_context_duration

        # Default encoder configuration. The patch sizes multiply to
        # 240 * 2 * 2 * 2 = 1920, the default `downsample_rate`.
        if encoder_kwargs is None:
            encoder_kwargs = [
                _patched_pretransform(240),
                _transformer_stage(240, 384),
                _patched_pretransform(2),
                _transformer_stage(768, 384),
                _patched_pretransform(2),
                _transformer_stage(768, 640),
                _patched_pretransform(2),
                _transformer_stage(
                    1280, 768, d_model=1280, num_heads=20, num_layers=32, dim_feedforward=5120
                ),
            ]
        self.encoder_kwargs = encoder_kwargs

        # Default decoder configuration (execution order).
        # NOTE(review): the decoder patch sizes multiply to 2 * 2 * 2 * 2 * 240 = 3840,
        # twice the encoder's 1920 -- confirm this asymmetry is intended.
        if decoder_kwargs is None:
            decoder_kwargs = [
                _transformer_stage(
                    768, 1280, d_model=1280, num_heads=20, num_layers=32, dim_feedforward=5120
                ),
                _patched_pretransform(2),
                _transformer_stage(640, 768),
                _patched_pretransform(2),
                _transformer_stage(384, 768),
                _patched_pretransform(2),
                _transformer_stage(384, 768),
                _patched_pretransform(2),
                _transformer_stage(384, 240),
                _patched_pretransform(240),
            ]
        self.decoder_kwargs = decoder_kwargs

        if quantizer_kwargs is None:
            # Default quantizer configuration. The type comes from the `quantizer_type`
            # argument so that an explicit `quantizer_type=...` is honoured instead of
            # being overridden by a hard-coded value in the default dict.
            quantizer_kwargs = {
                "input_dim": 768,
                "rvq_dim": 512,
                "output_dim": 768,
                "num_quantizers": 32,
                "codebook_size": 1024,
                "codebook_dim": 8,
                "quantizer_type": quantizer_type if quantizer_type is not None else "rlfq",
            }
        else:
            # Shallow copy: a missing "quantizer_type" key is filled in below, and the
            # caller's dict must not be modified as a side effect.
            quantizer_kwargs = dict(quantizer_kwargs)

        # A type given inside `quantizer_kwargs` wins over the `quantizer_type` argument;
        # otherwise the argument is used and mirrored into the kwargs.
        kw_qtype = quantizer_kwargs.get("quantizer_type")
        if kw_qtype is not None:
            self.quantizer_type = kw_qtype
        else:
            self.quantizer_type = quantizer_type
            quantizer_kwargs["quantizer_type"] = quantizer_type
        self.quantizer_kwargs = quantizer_kwargs

        super().__init__(**kwargs)

    # NOTE(review): the three accessors below read like computed attributes. No
    # `@property` decorator is visible in this extract, so they are kept as plain
    # methods -- confirm against the upstream file.

    def num_quantizers(self) -> int:
        """Return the number of quantizers from quantizer_kwargs (32 if the key is absent)."""
        return self.quantizer_kwargs.get("num_quantizers", 32)

    def codebook_size(self) -> int:
        """Return the codebook size from quantizer_kwargs (4096 if the key is absent)."""
        # NOTE(review): this fallback (4096) differs from the 1024 used in the default
        # `quantizer_kwargs` built by `__init__` -- confirm which value is intended.
        return self.quantizer_kwargs.get("codebook_size", 4096)

    def frame_rate(self) -> float:
        """Return the frame rate (tokens per second): `sampling_rate / downsample_rate`."""
        return self.sampling_rate / self.downsample_rate
| # Public API of this module: only the configuration class is exported. | |
| __all__ = ["MossAudioTokenizerConfig"] | |