BiliSakura
/

PixNerd-diffusers

@@ -1,746 +0,0 @@
-from __future__ import annotations
-import copy
-import importlib
-import math
-from dataclasses import dataclass
-from functools import lru_cache
-from typing import Any, Dict, Iterable, List, Optional, Tuple
-import torch
-import torch.nn as nn
-from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.models.modeling_utils import ModelMixin
-from diffusers.utils import BaseOutput
-from torch.nn.functional import scaled_dot_product_attention
class BaseAE(torch.nn.Module):
    """Abstract autoencoder interface carrying an affine ``scale`` and ``shift``.

    The public ``encode``/``decode`` methods forward to the ``_impl_encode``
    and ``_impl_decode`` hooks, which subclasses are expected to override.
    """

    def __init__(self, scale=1.0, shift=0.0):
        super().__init__()
        self.shift = shift
        self.scale = scale

    def _impl_encode(self, x):
        """Encoding hook; raises until a subclass overrides it."""
        raise NotImplementedError

    def _impl_decode(self, x):
        """Decoding hook; raises until a subclass overrides it."""
        raise NotImplementedError

    def encode(self, x):
        """Return the result of the subclass encoding hook for ``x``."""
        encoded = self._impl_encode(x)
        return encoded

    def decode(self, x):
        """Return the result of the subclass decoding hook for ``x``."""
        decoded = self._impl_decode(x)
        return decoded
def uint82fp(x):
    """Convert ``x`` to float32 and map the range [0, 255] onto [-1, 1]."""
    as_float = x.to(torch.float32)
    return (as_float - 127.5) / 127.5
def fp2uint8(x):
    """Map values in [-1, 1] to uint8 in [0, 255].

    The ``+ 0.5`` offset makes the truncating uint8 cast round to nearest;
    values outside the range are clamped first. The input is not modified:
    the in-place clamp acts on the freshly computed intermediate.
    """
    scaled = (x + 1) * 127.5 + 0.5
    return scaled.clamp_(0, 255).to(torch.uint8)
class PixelAE(BaseAE):
    """Pixel-space autoencoder whose encode/decode are plain affine maps."""

    def __init__(self, scale=1.0, shift=0.0):
        super().__init__(scale, shift)

    def _impl_encode(self, x):
        """Divide by ``scale`` and then add ``shift``."""
        rescaled = x / self.scale
        return rescaled + self.shift

    def _impl_decode(self, x):
        """Invert the encoding: subtract ``shift`` and then multiply by ``scale``."""
        unshifted = x - self.shift
        return unshifted * self.scale
def resolve_conditioner_device(metadata: dict, fallback: torch.device | None = None) -> torch.device:
    """Pick the device a conditioner should create its tensors on.

    Resolution order: a non-``None`` ``metadata["device"]`` entry, then
    ``fallback`` when given, and finally CUDA if available, else CPU.
    ``metadata`` may be ``None``, which is treated as an empty mapping.
    """
    requested = None if metadata is None else metadata.get("device")
    if requested is not None:
        return torch.device(requested)
    if fallback is None:
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return fallback
class BaseConditioner(nn.Module):
    """Base class turning raw labels ``y`` into a (condition, uncondition) pair.

    Subclasses implement ``_impl_condition`` and ``_impl_uncondition``.
    Calling the conditioner runs both hooks without gradient tracking and
    down-casts float64/float32/float16 results to bfloat16; tensors of any
    other dtype (e.g. integer label ids) are returned unchanged.
    """

    def __init__(self):
        super().__init__()

    def _impl_condition(self, y, metadata) -> torch.Tensor:
        """Return the conditional embedding/labels for ``y``."""
        raise NotImplementedError()

    def _impl_uncondition(self, y, metadata) -> torch.Tensor:
        """Return the unconditional counterpart for ``y``."""
        raise NotImplementedError()

    @torch.no_grad()
    def __call__(self, y, metadata: Optional[dict] = None):
        """Compute ``(condition, uncondition)`` for ``y``.

        Args:
            y: Raw conditioning input passed through to both hooks.
            metadata: Optional dict forwarded to both hooks. A fresh dict is
                created per call when omitted, so hooks that write into it
                cannot leak state into later calls.

        Returns:
            Tuple ``(condition, uncondition)``.
        """
        # A ``{}`` default would be one dict shared by every call.
        if metadata is None:
            metadata = {}
        condition = self._impl_condition(y, metadata)
        uncondition = self._impl_uncondition(y, metadata)
        castable = (torch.float64, torch.float32, torch.float16)
        if condition.dtype in castable:
            condition = condition.to(torch.bfloat16)
        if uncondition.dtype in castable:
            uncondition = uncondition.to(torch.bfloat16)
        return condition, uncondition
class ComposeConditioner(BaseConditioner):
    """Conditioner that concatenates the outputs of several conditioners.

    Each member's hook is called directly and the results are joined along
    dimension 1. The members are kept in a plain Python list.
    """

    def __init__(self, conditioners:List[BaseConditioner]):
        super().__init__()
        self.conditioners = conditioners

    def _impl_condition(self, y, metadata):
        """Concatenate every member's conditional output along dim 1."""
        parts = [member._impl_condition(y, metadata) for member in self.conditioners]
        return torch.cat(parts, dim=1)

    def _impl_uncondition(self, y, metadata):
        """Concatenate every member's unconditional output along dim 1."""
        parts = [member._impl_uncondition(y, metadata) for member in self.conditioners]
        return torch.cat(parts, dim=1)
class LabelConditioner(BaseConditioner):
    """Class-label conditioner; ``num_classes`` is used as the null label id."""

    def __init__(self, num_classes):
        super().__init__()
        self.null_condition = num_classes

    def _impl_condition(self, y, metadata):
        """Return ``y`` as a long tensor on the device resolved from ``metadata``."""
        target = resolve_conditioner_device(metadata)
        labels = torch.tensor(y, device=target)
        return labels.long()

    def _impl_uncondition(self, y, metadata):
        """Return ``len(y)`` copies of the null label as a long tensor."""
        target = resolve_conditioner_device(metadata)
        count = len(y)
        return torch.full((count,), self.null_condition, dtype=torch.long, device=target)
def modulate(x, shift, scale):
    """Apply an affine modulation: multiply ``x`` by ``1 + scale``, then add ``shift``."""
    gain = 1 + scale
    return x * gain + shift
class Embed(nn.Module):
    """Linear projection from ``in_chans`` to ``embed_dim`` with an optional norm.

    ``norm_layer``, when truthy, is called with ``embed_dim`` to build the
    normalisation module; otherwise an identity is used.
    """

    def __init__(
            self,
            in_chans: int = 3,
            embed_dim: int = 768,
            norm_layer = None,
            bias: bool = True,
    ):
        super().__init__()
        self.in_chans = in_chans
        self.embed_dim = embed_dim
        self.proj = nn.Linear(in_chans, embed_dim, bias=bias)
        if norm_layer:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = nn.Identity()

    def forward(self, x):
        """Project ``x`` and apply the normalisation layer."""
        return self.norm(self.proj(x))
class TimestepEmbedder(nn.Module):
    """Embed scalar timesteps: sinusoidal features followed by a two-layer MLP."""

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10):
        """Build sinusoidal features for ``t``.

        Args:
            t: Tensor of timesteps with any number of dimensions.
            dim: Size of the returned last dimension.
            max_period: Controls the lowest frequency of the embedding.

        Returns:
            Tensor of shape ``t.shape + (dim,)``: ``dim // 2`` cosines
            followed by ``dim // 2`` sines, plus one zero column if ``dim``
            is odd.
        """
        half = dim // 2
        freqs = torch.exp(
            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half
        )
        args = t[..., None].float() * freqs[None, ...]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            # Pad using the leading shape so this works for any rank of ``t``
            # (slicing ``[:, :1]`` is only correct for 2-D embeddings).
            pad = embedding.new_zeros(*embedding.shape[:-1], 1)
            embedding = torch.cat([embedding, pad], dim=-1)
        return embedding

    def forward(self, t):
        """Return the MLP-projected sinusoidal embedding of ``t``."""
        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
        return self.mlp(t_freq)
class LabelEmbedder(nn.Module):
    """Look up a learned ``hidden_size`` vector for each integer class label."""

    def __init__(self, num_classes, hidden_size):
        super().__init__()
        self.num_classes = num_classes
        self.embedding_table = nn.Embedding(num_classes, hidden_size)

    def forward(self, labels,):
        """Return the embedding rows selected by ``labels``."""
        return self.embedding_table(labels)
class FinalLayer(nn.Module):
    """Output head: modulated LayerNorm followed by a linear projection.

    The shift and scale used for modulation are produced from the
    conditioning input ``c`` by a single linear layer.
    """

    def __init__(self, hidden_size, out_channels):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, out_channels, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        )

    def forward(self, x, c):
        """Normalise ``x``, modulate it with parameters from ``c``, and project."""
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
        normed = self.norm_final(x)
        return self.linear(modulate(normed, shift, scale))
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learned gain."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalise in float32, cast back to the input dtype, then apply the gain."""
        source_dtype = hidden_states.dtype
        x32 = hidden_states.to(torch.float32)
        mean_square = x32.pow(2).mean(-1, keepdim=True)
        normed = x32 * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normed.to(source_dtype)
class FeedForward(nn.Module):
    """Gated feed-forward block: ``w2(silu(w1(x)) * w3(x))`` without biases.

    The inner width is ``int(2 * hidden_dim / 3)``.
    """

    def __init__(
        self,
        dim: int,
        hidden_dim: int,
    ):
        super().__init__()
        inner_dim = int(2 * hidden_dim / 3)
        self.w1 = nn.Linear(dim, inner_dim, bias=False)
        self.w3 = nn.Linear(dim, inner_dim, bias=False)
        self.w2 = nn.Linear(inner_dim, dim, bias=False)

    def forward(self, x):
        """Apply the SiLU-gated projection to ``x``."""
        gate = torch.nn.functional.silu(self.w1(x))
        return self.w2(gate * self.w3(x))
def precompute_freqs_cis_2d(dim: int, height: int, width:int, theta: float = 10000.0, scale=16.0):
    """Precompute complex 2-D rotary factors for a ``height`` x ``width`` grid.

    Grid coordinates are spread linearly over ``[0, scale]`` on each axis.
    For every grid cell (row-major order) the result interleaves the x- and
    y-axis unit phasors per frequency.

    Returns:
        Complex tensor of shape ``(height * width, 2 * (dim // 4))``.
    """
    rows = torch.linspace(0, scale, height)
    cols = torch.linspace(0, scale, width)
    grid_y, grid_x = torch.meshgrid(rows, cols, indexing="ij")
    flat_y = grid_y.reshape(-1)
    flat_x = grid_x.reshape(-1)
    exponents = torch.arange(0, dim, 4)[: (dim // 4)].float() / dim
    inv_freq = 1.0 / (theta ** exponents)  # Hc/4
    angle_x = torch.outer(flat_x, inv_freq).float()  # N Hc/4
    angle_y = torch.outer(flat_y, inv_freq).float()  # N Hc/4
    phasor_x = torch.polar(torch.ones_like(angle_x), angle_x)
    phasor_y = torch.polar(torch.ones_like(angle_y), angle_y)
    # Stacking on a new last axis interleaves x and y per frequency: N,Hc/4,2
    interleaved = torch.stack([phasor_x, phasor_y], dim=-1)
    return interleaved.reshape(height * width, -1)
def apply_rotary_emb(
        xq: torch.Tensor,
        xk: torch.Tensor,
        freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Rotate queries and keys by the complex factors in ``freqs_cis``.

    Adjacent pairs along the last dimension are treated as (real, imag)
    parts, multiplied by ``freqs_cis`` (broadcast over batch and heads), and
    written back in the same layout. Outputs keep the dtype of their inputs.
    """
    rotation = freqs_cis[None, :, None, :]

    def _rotate(tensor):
        # tensor : B N H Hc -> complex B N H Hc/2 -> rotated real B N H Hc
        paired = tensor.float().reshape(*tensor.shape[:-1], -1, 2)
        rotated = torch.view_as_complex(paired) * rotation
        return torch.view_as_real(rotated).flatten(3).type_as(tensor)

    return _rotate(xq), _rotate(xk)
class RAttention(nn.Module):
    """Multi-head self-attention with rotary position factors and optional QK norm.

    Args:
        dim: Token width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        qkv_bias: Whether the fused QKV projection has a bias.
        qk_norm: If true, ``norm_layer(head_dim)`` is applied to queries and
            keys per head; otherwise an identity is used.
        attn_drop: Attention dropout probability, applied in training mode.
        proj_drop: Dropout probability after the output projection.
        norm_layer: Factory for the QK normalisation modules.
    """

    def __init__(
            self,
            dim: int,
            num_heads: int = 8,
            qkv_bias: bool = False,
            qk_norm: bool = True,
            attn_drop: float = 0.,
            proj_drop: float = 0.,
            norm_layer: nn.Module = RMSNorm,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim ** -0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: torch.Tensor, pos, mask) -> torch.Tensor:
        """Attend over ``x`` of shape ``(B, N, C)``.

        Args:
            x: Input tokens.
            pos: Complex rotary factors handed to ``apply_rotary_emb``.
            mask: Attention mask forwarded as ``attn_mask`` (may be ``None``).

        Returns:
            Tensor of shape ``(B, N, C)``.
        """
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 1, 3, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # B N H Hc
        q = self.q_norm(q)
        k = self.k_norm(k)
        q, k = apply_rotary_emb(q, k, freqs_cis=pos)
        q = q.view(B, -1, self.num_heads, C // self.num_heads).transpose(1, 2)  # B, H, N, Hc
        k = k.view(B, -1, self.num_heads, C // self.num_heads).transpose(1, 2).contiguous()  # B, H, N, Hc
        v = v.view(B, -1, self.num_heads, C // self.num_heads).transpose(1, 2).contiguous()
        # Honour the configured attention dropout while training. The fused
        # kernel takes it as a probability, so ``self.attn_drop`` is not
        # called as a module. With the default ``attn_drop=0.`` this equals
        # the previous hard-coded 0.0.
        dropout_p = self.attn_drop.p if self.training else 0.0
        x = scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=dropout_p)
        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
class FlattenDiTBlock(nn.Module):
    """Transformer block with conditioning-driven modulation and gating.

    A single linear layer maps the conditioning ``c`` to six chunks: shift,
    scale and gate for the attention branch and for the MLP branch.
    """

    def __init__(self, hidden_size, groups,  mlp_ratio=4.0, ):
        super().__init__()
        self.norm1 = RMSNorm(hidden_size, eps=1e-6)
        self.attn = RAttention(hidden_size, num_heads=groups, qkv_bias=False)
        self.norm2 = RMSNorm(hidden_size, eps=1e-6)
        self.mlp = FeedForward(hidden_size, int(hidden_size * mlp_ratio))
        self.adaLN_modulation = nn.Sequential(
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )

    def forward(self, x,  c, pos, mask=None):
        """Apply the gated attention and MLP residual branches to ``x``."""
        chunks = self.adaLN_modulation(c).chunk(6, dim=-1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = chunks
        attn_in = modulate(self.norm1(x), shift_msa, scale_msa)
        x = x + gate_msa * self.attn(attn_in, pos, mask=mask)
        mlp_in = modulate(self.norm2(x), shift_mlp, scale_mlp)
        x = x + gate_mlp * self.mlp(mlp_in)
        return x
class NerfEmbedder(nn.Module):
    """Embed per-pixel inputs concatenated with a cosine positional basis.

    For a square patch, a ``max_freqs ** 2`` dimensional basis is appended
    to each pixel's channels and the result is linearly projected to
    ``hidden_size_input``.
    """

    def __init__(self, in_channels, hidden_size_input, max_freqs):
        super().__init__()
        self.max_freqs = max_freqs
        self.hidden_size_input = hidden_size_input
        self.embedder = nn.Sequential(
            nn.Linear(in_channels+max_freqs**2, hidden_size_input, bias=True),
        )
        # Per-instance cache keyed by (patch_size, device, dtype). A
        # functools.lru_cache on the method would key on ``self`` and keep
        # every instance (and its tensors) alive for the process lifetime.
        self._pos_cache = {}

    def fetch_pos(self, patch_size, device, dtype):
        """Return the cached positional basis of shape ``(1, patch_size**2, max_freqs**2)``."""
        # setdefault also covers instances that were built without __init__.
        cache = self.__dict__.setdefault("_pos_cache", {})
        key = (patch_size, device, dtype)
        dct = cache.get(key)
        if dct is None:
            dct = self._build_pos(patch_size, device, dtype)
            cache[key] = dct
        return dct

    def _build_pos(self, patch_size, device, dtype):
        """Compute the positional basis for one (patch_size, device, dtype) key."""
        pos_x = torch.linspace(0, 1, patch_size, device=device, dtype=dtype)
        pos_y = torch.linspace(0, 1, patch_size, device=device, dtype=dtype)
        pos_y, pos_x = torch.meshgrid(pos_y, pos_x, indexing="ij")
        pos_x = pos_x.reshape(-1, 1, 1)
        pos_y = pos_y.reshape(-1, 1, 1)
        freqs = torch.linspace(0, self.max_freqs, self.max_freqs, dtype=dtype, device=device)
        freqs_x = freqs[None, :, None]
        freqs_y = freqs[None, None, :]
        # Terms with a larger frequency product get a smaller weight.
        coeffs = (1 + freqs_x * freqs_y) ** -1
        dct_x = torch.cos(pos_x * freqs_x * torch.pi)
        dct_y = torch.cos(pos_y * freqs_y * torch.pi)
        return (dct_x * dct_y * coeffs).view(1, -1, self.max_freqs ** 2)

    def forward(self, inputs):
        """Embed ``inputs`` of shape ``(B, P2, C)``, where ``P2`` is a square patch area."""
        B, P2, C = inputs.shape
        patch_size = int(P2 ** 0.5)
        dct = self.fetch_pos(patch_size, inputs.device, inputs.dtype)
        # expand is a view; the concatenation below makes the only copy.
        inputs = torch.cat([inputs, dct.expand(B, -1, -1)], dim=-1)
        return self.embedder(inputs)
class NerfBlock(nn.Module):
    """Residual MLP whose two weight matrices are generated from ``s``.

    A linear layer maps each conditioning vector to the flattened weights of
    a ``hidden_size_x -> hidden_size_x * mlp_ratio -> hidden_size_x`` MLP.
    Both generated matrices are normalised along dim -2 before use.
    """

    def __init__(self, hidden_size_s, hidden_size_x, mlp_ratio=4):
        super().__init__()
        self.param_generator1 = nn.Sequential(
            nn.Linear(hidden_size_s, 2*hidden_size_x**2*mlp_ratio, bias=True),
        )
        self.norm = RMSNorm(hidden_size_x, eps=1e-6)
        self.mlp_ratio = mlp_ratio

    def forward(self, x, s):
        """Apply the generated MLP to ``x`` (batch, num_x, hidden_size_x) with a skip connection."""
        batch_size, num_x, hidden_size_x = x.shape
        inner_size = hidden_size_x * self.mlp_ratio
        fc1_flat, fc2_flat = self.param_generator1(s).chunk(2, dim=-1)
        fc1 = torch.nn.functional.normalize(
            fc1_flat.view(batch_size, hidden_size_x, inner_size), dim=-2
        )
        fc2 = torch.nn.functional.normalize(
            fc2_flat.view(batch_size, inner_size, hidden_size_x), dim=-2
        )
        hidden = torch.bmm(self.norm(x), fc1)
        hidden = torch.nn.functional.silu(hidden)
        return torch.bmm(hidden, fc2) + x
class NerfFinalLayer(nn.Module):
    """Output head: RMS normalisation followed by a linear projection."""

    def __init__(self, hidden_size, out_channels):
        super().__init__()
        self.norm = RMSNorm(hidden_size, eps=1e-6)
        self.linear = nn.Linear(hidden_size, out_channels, bias=True)

    def forward(self, x):
        """Normalise ``x`` and project it to ``out_channels``."""
        return self.linear(self.norm(x))
class PixNerDiT(nn.Module):
    """Pixel-space denoiser: patch-level transformer blocks drive per-patch MLPs.

    The first ``num_cond_blocks`` blocks are ``FlattenDiTBlock`` layers over
    patch tokens. The remaining ``num_blocks - num_cond_blocks`` blocks are
    ``NerfBlock`` layers applied to the pixels of each patch, conditioned on
    that patch's token.
    """

    def __init__(
            self,
            in_channels=4,
            num_groups=12,
            hidden_size=1152,
            hidden_size_x=64,
            nerf_mlpratio=4,
            num_blocks=18,
            num_cond_blocks=4,
            patch_size=2,
            num_classes=1000,
            learn_sigma=True,
            deep_supervision=0,
            weight_path=None,
            load_ema=False,
    ):
        super().__init__()
        self.deep_supervision = deep_supervision
        self.learn_sigma = learn_sigma
        self.in_channels = in_channels
        self.out_channels = in_channels
        self.hidden_size = hidden_size
        self.num_groups = num_groups
        self.num_blocks = num_blocks
        self.num_cond_blocks = num_cond_blocks
        self.patch_size = patch_size

        # Submodules are created in a fixed order so seeded initialisation
        # and the state_dict layout are stable.
        self.x_embedder = NerfEmbedder(in_channels, hidden_size_x, max_freqs=8)
        self.s_embedder = Embed(in_channels*patch_size**2, hidden_size, bias=True)
        self.t_embedder = TimestepEmbedder(hidden_size)
        # One extra row is reserved in the label table beyond ``num_classes``.
        self.y_embedder = LabelEmbedder(num_classes+1, hidden_size)
        self.final_layer = NerfFinalLayer(hidden_size_x, self.out_channels)

        self.weight_path = weight_path
        self.load_ema = load_ema

        cond_blocks = [
            FlattenDiTBlock(self.hidden_size, self.num_groups)
            for _ in range(self.num_cond_blocks)
        ]
        nerf_blocks = [
            NerfBlock(self.hidden_size, hidden_size_x, nerf_mlpratio)
            for _ in range(self.num_cond_blocks, self.num_blocks)
        ]
        self.blocks = nn.ModuleList(cond_blocks + nerf_blocks)

        self.initialize_weights()
        self.precompute_pos = dict()

    def fetch_pos(self, height, width, device):
        """Return rotary factors for a ``height`` x ``width`` token grid on ``device``.

        Results are cached per grid size; a cached tensor is moved to
        ``device`` on every lookup.
        """
        key = (height, width)
        cached = self.precompute_pos.get(key)
        if cached is not None:
            return cached.to(device)
        head_dim = self.hidden_size // self.num_groups
        pos = precompute_freqs_cis_2d(head_dim, height, width).to(device)
        self.precompute_pos[key] = pos
        return pos

    def initialize_weights(self):
        """Initialise the embedders and zero the final projection."""
        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        patch_weight = self.s_embedder.proj.weight.data
        nn.init.xavier_uniform_(patch_weight.view([patch_weight.shape[0], -1]))
        nn.init.constant_(self.s_embedder.proj.bias, 0)
        # Initialize label embedding table:
        nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)
        # Initialize timestep embedding MLP:
        for layer_index in (0, 2):
            nn.init.normal_(self.t_embedder.mlp[layer_index].weight, std=0.02)
        # zero init final layer
        nn.init.zeros_(self.final_layer.linear.weight)
        nn.init.zeros_(self.final_layer.linear.bias)

    def forward(self, x, t, y, s=None, mask=None):
        """Denoise ``x`` of shape ``(B, C, H, W)``.

        Args:
            x: Input images.
            t: Timesteps; flattened before embedding.
            y: Class labels for the label embedding table.
            s: Optional precomputed patch conditioning of shape
                ``(B, L, hidden_size)``. When given, the transformer blocks
                are skipped.
            mask: Optional attention mask for the transformer blocks.

        Returns:
            Tensor with the same spatial size ``(H, W)`` as ``x``.
        """
        B, _, H, W = x.shape
        p = self.patch_size
        pos = self.fetch_pos(H // p, W // p, x.device)
        # (B, C*p*p, L) -> (B, L, C*p*p): one flattened patch per token.
        patches = torch.nn.functional.unfold(x, kernel_size=p, stride=p).transpose(1, 2)
        t = self.t_embedder(t.view(-1)).view(B, -1, self.hidden_size)
        y = self.y_embedder(y).view(B, 1, self.hidden_size)
        c = nn.functional.silu(t + y)
        if s is None:
            s = self.s_embedder(patches)
            for index in range(self.num_cond_blocks):
                s = self.blocks[index](s, c, pos, mask)
            s = nn.functional.silu(t + s)
        batch_size, length, _ = s.shape
        num_patches = batch_size * length
        # Treat each patch as its own batch item with p*p pixel tokens.
        pixels = patches.reshape(num_patches, self.in_channels, p**2).transpose(1, 2)
        s = s.view(num_patches, self.hidden_size)
        pixels = self.x_embedder(pixels)
        for index in range(self.num_cond_blocks, self.num_blocks):
            pixels = self.blocks[index](pixels, s)
        pixels = self.final_layer(pixels)
        # Back to (B, L, C*p*p), then fold the patches into an image.
        pixels = pixels.transpose(1, 2).reshape(batch_size, length, -1)
        columns = pixels.transpose(1, 2).contiguous()
        return torch.nn.functional.fold(columns, (H, W), kernel_size=p, stride=p)
def to_container(config: Any) -> Any:
    """Recursively convert mapping-like objects and lists to plain containers.

    Anything exposing ``items`` (plain dicts included) becomes a new ``dict``
    with converted values, lists become new lists, and every other value is
    returned unchanged. Plain dicts are walked too, so a mapping-like value
    nested inside a regular dict is converted as well.
    """
    if hasattr(config, "items"):
        return {key: to_container(value) for key, value in config.items()}
    if isinstance(config, list):
        return [to_container(value) for value in config]
    return config
def load_symbol(path: str) -> Any:
    """Import and return the attribute named by a dotted ``module.attr`` path.

    The text after the last dot is the attribute name; everything before it
    is imported as the module.
    """
    module_name, attr_name = path.rsplit(".", 1)
    return getattr(importlib.import_module(module_name), attr_name)
def instantiate_from_spec(spec: Any) -> Any:
    """Recursively build objects from a ``class_path`` / ``init_args`` spec.

    * A dict with ``"class_path"`` is resolved with ``load_symbol`` and
      called with its (recursively instantiated) ``init_args``.
    * Other dicts and lists are rebuilt with their values instantiated.
    * A string containing a dot is tried as a dotted import path and is
      returned unchanged when it cannot be resolved.
    * Everything else is returned as is.

    NOTE: the spec selects arbitrary importable callables, so it must come
    from a trusted source.
    """
    spec = to_container(spec)

    if isinstance(spec, list):
        return [instantiate_from_spec(item) for item in spec]

    if isinstance(spec, str):
        if "." not in spec:
            return spec
        try:
            return load_symbol(spec)
        except Exception:
            # Best effort: most dotted strings are ordinary values, not symbols.
            return spec

    if not isinstance(spec, dict):
        return spec

    if "class_path" not in spec:
        return {key: instantiate_from_spec(value) for key, value in spec.items()}

    factory = load_symbol(spec["class_path"])
    init_args = spec.get("init_args", {})
    if isinstance(init_args, dict):
        init_args = {key: instantiate_from_spec(value) for key, value in init_args.items()}
    return factory(**init_args)
def clone_spec(spec: Dict[str, Any]) -> Dict[str, Any]:
    """Return a deep copy of ``spec`` after converting it with ``to_container``."""
    plain_spec = to_container(spec)
    return copy.deepcopy(plain_spec)
def load_prefixed_state_dict(
    module: Optional[torch.nn.Module],
    state_dict: Dict[str, torch.Tensor],
    prefixes: Iterable[str],
) -> bool:
    """Load the entries of ``state_dict`` under the first matching prefix.

    Prefixes are tried in order. For the first one matching at least one
    key, the matching entries (prefix stripped) are loaded non-strictly into
    ``module`` and ``True`` is returned. Returns ``False`` when ``module``
    is ``None`` or no prefix matches.
    """
    if module is None:
        return False
    for prefix in prefixes:
        offset = len(prefix)
        matched = {}
        for name, tensor in state_dict.items():
            if name.startswith(prefix):
                matched[name[offset:]] = tensor
        if not matched:
            continue
        module.load_state_dict(matched, strict=False)
        return True
    return False
@dataclass
class PixNerdTransformer2DModelOutput(BaseOutput):
    """Output container returned by ``PixNerdTransformer2DModel.forward``."""

    # Tensor produced by the wrapped denoiser.
    sample: torch.FloatTensor
class PixNerdTransformer2DModel(ModelMixin, ConfigMixin):
    """Diffusers-style wrapper bundling a denoiser with its companions.

    The components are instantiated from ``class_path`` / ``init_args``
    specs: a denoiser, a conditioner, and optionally a VAE and a diffusion
    trainer. When ``use_ema`` is set, a frozen float32 copy of the denoiser
    is kept and can be updated with ``ema_step``.
    """

    config_name = "config.json"

    @register_to_config
    def __init__(
        self,
        denoiser_spec: Dict[str, Any],
        conditioner_spec: Dict[str, Any],
        vae_spec: Optional[Dict[str, Any]] = None,
        diffusion_trainer_spec: Optional[Dict[str, Any]] = None,
        use_ema: bool = True,
        ema_decay: float = 0.9999,
        compile_denoiser: bool = False,
    ) -> None:
        super().__init__()
        self.denoiser = instantiate_from_spec(to_container(denoiser_spec))
        self.conditioner = instantiate_from_spec(to_container(conditioner_spec))
        self.vae = instantiate_from_spec(to_container(vae_spec)) if vae_spec is not None else None
        self.diffusion_trainer = (
            instantiate_from_spec(to_container(diffusion_trainer_spec))
            if diffusion_trainer_spec is not None
            else None
        )
        self.use_ema = bool(use_ema)
        self.ema_decay = float(ema_decay)
        # The EMA copy is always kept in float32.
        self.ema_denoiser = copy.deepcopy(self.denoiser) if self.use_ema else None
        if self.ema_denoiser is not None:
            self.ema_denoiser.to(torch.float32)
        if compile_denoiser and hasattr(self.denoiser, "compile"):
            self.denoiser.compile()
            if self.ema_denoiser is not None:
                self.ema_denoiser.compile()
        self._freeze_non_trainable_modules()
        if self.ema_denoiser is not None:
            self.sync_ema()

    @property
    def patch_size(self) -> int:
        """Patch size reported by the denoiser (1 if it has none)."""
        return int(getattr(self.denoiser, "patch_size", 1))

    @property
    def in_channels(self) -> int:
        """Input channel count reported by the denoiser (3 if it has none)."""
        return int(getattr(self.denoiser, "in_channels", 3))

    @classmethod
    def from_project_config(
        cls,
        model_config: Dict[str, Any],
        use_ema: bool = True,
        compile_denoiser: bool = False,
    ) -> "PixNerdTransformer2DModel":
        """Build the model from a project-style ``model`` config mapping.

        Reads the ``denoiser`` and ``conditioner`` specs (required), the
        optional ``vae`` and ``diffusion_trainer`` specs, and the EMA decay
        from ``ema_tracker.init_args.decay`` (default 0.9999).
        """
        model_config = to_container(model_config)
        # ``or {}`` tolerates keys that are present but null, which a chained
        # ``.get(key, {})`` does not.
        ema_tracker = model_config.get("ema_tracker") or {}
        ema_init_args = ema_tracker.get("init_args") or {}
        ema_decay = ema_init_args.get("decay", 0.9999)
        if ema_decay is None:
            ema_decay = 0.9999
        return cls(
            denoiser_spec=model_config["denoiser"],
            conditioner_spec=model_config["conditioner"],
            vae_spec=model_config.get("vae"),
            diffusion_trainer_spec=model_config.get("diffusion_trainer"),
            use_ema=use_ema,
            ema_decay=ema_decay,
            compile_denoiser=compile_denoiser,
        )

    @staticmethod
    def _as_timestep_tensor(
        timestep: Any,
        batch_size: int,
        device: torch.device,
    ) -> torch.Tensor:
        """Return ``timestep`` as a float32 tensor on ``device``.

        Python scalars and single-element tensors (0-dim or e.g. shape
        ``(1,)``) are broadcast to shape ``(batch_size,)``. Other tensors
        are only moved and cast.
        """
        if isinstance(timestep, torch.Tensor):
            if timestep.numel() == 1:
                timestep = timestep.reshape(1).repeat(batch_size)
            return timestep.to(device=device, dtype=torch.float32)
        return torch.full((batch_size,), float(timestep), device=device, dtype=torch.float32)

    def _freeze_module(self, module: Optional[torch.nn.Module]) -> None:
        """Put ``module`` in eval mode and disable gradients for its parameters."""
        if module is None:
            return
        module.eval()
        for parameter in module.parameters():
            parameter.requires_grad = False

    def _freeze_non_trainable_modules(self) -> None:
        """Freeze the conditioner, the VAE and the EMA denoiser (when present)."""
        self._freeze_module(self.conditioner)
        self._freeze_module(self.vae)
        self._freeze_module(self.ema_denoiser)

    def forward(
        self,
        sample: torch.Tensor,
        timestep: Any,
        encoder_hidden_states: torch.Tensor,
        return_dict: bool = True,
    ) -> PixNerdTransformer2DModelOutput | Tuple[torch.Tensor]:
        """Run the trainable denoiser.

        Args:
            sample: Input batch; its first dimension is the batch size.
            timestep: Scalar or tensor timestep(s).
            encoder_hidden_states: Conditioning passed to the denoiser as
                its third positional argument.
            return_dict: If false, return a 1-tuple instead of the output
                dataclass.
        """
        t = self._as_timestep_tensor(timestep, sample.shape[0], sample.device)
        out = self.denoiser(sample, t, encoder_hidden_states)
        if not return_dict:
            return (out,)
        return PixNerdTransformer2DModelOutput(sample=out)

    def predict_noise(
        self,
        sample: torch.Tensor,
        timestep: Any,
        encoder_hidden_states: torch.Tensor,
        use_ema: bool = False,
    ) -> torch.Tensor:
        """Return the raw denoiser output, optionally from the EMA copy."""
        t = self._as_timestep_tensor(timestep, sample.shape[0], sample.device)
        denoiser = self.get_inference_denoiser(use_ema=use_ema)
        return denoiser(sample, t, encoder_hidden_states)

    def get_inference_denoiser(self, use_ema: bool = True) -> torch.nn.Module:
        """Return the EMA denoiser if requested and available, else the trainable one."""
        if use_ema and self.ema_denoiser is not None:
            return self.ema_denoiser
        return self.denoiser

    @torch.no_grad()
    def get_conditioning(
        self,
        y: Iterable[Any],
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """Call the conditioner on ``y``; ``metadata`` defaults to a new dict."""
        metadata = {} if metadata is None else metadata
        return self.conditioner(y, metadata)

    @torch.no_grad()
    def encode(self, x: torch.Tensor) -> torch.Tensor:
        """Encode ``x`` with the VAE, or return it unchanged when there is none."""
        if self.vae is None:
            return x
        return self.vae.encode(x)

    @torch.no_grad()
    def decode(self, latents: torch.Tensor) -> torch.Tensor:
        """Decode ``latents`` with the VAE, or return them unchanged when there is none."""
        if self.vae is None:
            return latents
        return self.vae.decode(latents)

    @torch.no_grad()
    def sync_ema(self) -> None:
        """Copy the denoiser weights into the EMA denoiser and keep it float32."""
        if self.ema_denoiser is None:
            return
        self.ema_denoiser.load_state_dict(self.denoiser.state_dict(), strict=True)
        self.ema_denoiser.to(torch.float32)

    @torch.no_grad()
    def ema_step(self, decay: Optional[float] = None) -> None:
        """Update EMA parameters: ``ema = decay * ema + (1 - decay) * param``.

        Only parameters are averaged. ``decay`` falls back to
        ``self.ema_decay`` when omitted.
        """
        if self.ema_denoiser is None:
            return
        decay = self.ema_decay if decay is None else float(decay)
        for ema_param, param in zip(self.ema_denoiser.parameters(), self.denoiser.parameters()):
            ema_param.mul_(decay).add_(param.detach().float(), alpha=1.0 - decay)

    def compute_training_loss(
        self,
        x: torch.Tensor,
        y: Iterable[Any],
        scheduler: torch.nn.Module,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, torch.Tensor]:
        """Encode the inputs, build the conditioning, and call the diffusion trainer.

        Raises:
            RuntimeError: If no diffusion trainer was configured.
        """
        if self.diffusion_trainer is None:
            raise RuntimeError("diffusion_trainer is not configured.")
        metadata = {} if metadata is None else metadata
        with torch.no_grad():
            x = self.encode(x)
            condition, uncondition = self.get_conditioning(y, metadata)
        return self.diffusion_trainer(
            self.denoiser,
            # Without an EMA copy the trainable denoiser is passed in both slots.
            self.ema_denoiser if self.ema_denoiser is not None else self.denoiser,
            scheduler,
            x,
            condition,
            uncondition,
            metadata,
        )
# Names exported by ``from <module> import *``.
__all__ = [
    "PixNerDiT",
    "LabelConditioner",
    "PixelAE",
    "PixNerdTransformer2DModel",
    "PixNerdTransformer2DModelOutput",
]