Instructions to use Motif-Technologies/optimizer with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Kernels
How to use Motif-Technologies/optimizer with Kernels:
```python
# !pip install kernels
from kernels import get_kernel

kernel = get_kernel("Motif-Technologies/optimizer")
```
- Notebooks
- Google Colab
- Kaggle
| """Unit tests for FQN normalization (no GPU / distributed required).""" | |
| from optimizer.core import default_is_muon, is_expert_param, normalize_fqn | |
| from optimizer.qk_clip import parse_qk_layer | |
class TestNormalizeFqn:
    """Cases for ``normalize_fqn`` on names with and without wrapper components.

    Each test feeds a dotted parameter name to ``normalize_fqn`` and compares
    the result against the same name with the ``_orig_mod`` and/or
    ``_checkpoint_wrapped_module`` components removed.
    """

    def test_passthrough(self):
        """A name containing no wrapper component comes back unchanged."""
        fqn = "model.layers.3.attn.q_proj.weight"
        assert normalize_fqn(fqn) == fqn

    def test_strip_orig_mod(self):
        """A single ``_orig_mod`` component is dropped."""
        wrapped = "model._orig_mod.layers.3.attn.q_proj.weight"
        expected = "model.layers.3.attn.q_proj.weight"
        assert normalize_fqn(wrapped) == expected

    def test_strip_checkpoint_wrapped(self):
        """A ``_checkpoint_wrapped_module`` component is dropped."""
        wrapped = (
            "model.layers.0._checkpoint_wrapped_module.moe.experts.w1.weight"
        )
        expected = "model.layers.0.moe.experts.w1.weight"
        assert normalize_fqn(wrapped) == expected

    def test_strip_both(self):
        """Both wrapper components are dropped when they appear together."""
        wrapped = (
            "model._orig_mod.layers.0._checkpoint_wrapped_module.attn.q_proj.weight"
        )
        expected = "model.layers.0.attn.q_proj.weight"
        assert normalize_fqn(wrapped) == expected

    def test_strip_nested_orig_mod(self):
        """Repeated leading ``_orig_mod`` components are all dropped."""
        wrapped = "_orig_mod._orig_mod.layers.0.mlp.gate_proj.weight"
        expected = "layers.0.mlp.gate_proj.weight"
        assert normalize_fqn(wrapped) == expected
class TestParseQkLayerWithWrappers:
    """Cases for ``parse_qk_layer`` on names that include wrapper components.

    The expected value in each test is a 2-tuple: the matched projection
    component (``q_proj``, ``wq``, ``k_proj``, ``wk``) together with the
    integer that follows ``layers.`` in the name, or ``(None, -1)`` when the
    name is not a Q/K projection.
    """

    def test_plain_name(self):
        """An unwrapped ``q_proj`` name yields its projection and layer index."""
        result = parse_qk_layer("model.layers.3.attn.q_proj.weight")
        assert result == ("q_proj", 3)

    def test_orig_mod(self):
        """An ``_orig_mod`` component does not disturb parsing of ``wq``."""
        result = parse_qk_layer("model._orig_mod.layers.3.attn.wq.weight")
        assert result == ("wq", 3)

    def test_checkpoint_wrapped(self):
        """A ``_checkpoint_wrapped_module`` component after the layer index is tolerated."""
        fqn = "model.layers.5._checkpoint_wrapped_module.self_attn.k_proj.weight"
        expected = ("k_proj", 5)
        assert parse_qk_layer(fqn) == expected

    def test_both_wrappers(self):
        """Both wrapper components may appear in the same name."""
        fqn = "_orig_mod.model._checkpoint_wrapped_module.layers.7.attn.wk.weight"
        expected = ("wk", 7)
        assert parse_qk_layer(fqn) == expected

    def test_non_qk_still_none(self):
        """A wrapped ``v_proj`` name is still reported as ``(None, -1)``."""
        fqn = "model._orig_mod.layers.2.attn.v_proj.weight"
        expected = (None, -1)
        assert parse_qk_layer(fqn) == expected
class TestExpertKeyMatching:
    """Check that expert_keys match whole name components, not substrings."""

    class FakeParam:
        """Minimal stand-in for a parameter: it exposes only ``ndim``."""

        def __init__(self, ndim):
            self.ndim = ndim

    def test_experts_matches(self):
        """A 3D parameter under an ``experts`` component is accepted."""
        fqn = "model.layers.0.moe.experts.w1.weight"
        param = self.FakeParam(3)
        assert default_is_muon(fqn, param, expert_keys=["experts"])

    def test_shared_experts_does_not_match(self):
        """A 2D ``shared_experts`` parameter stays muon-eligible."""
        fqn = "model.layers.0.moe.shared_experts.w1.weight"
        # With ndim=2 the parameter is muon-eligible by itself. Were it
        # classified as an expert, ndim-1 would leave it 1D and the result
        # would be False, so a truthy result shows it was NOT matched.
        param = self.FakeParam(2)
        assert default_is_muon(fqn, param, expert_keys=["experts"])

    def test_shared_experts_3d_not_treated_as_expert(self):
        """Use a 2D ``shared_experts`` parameter to detect a wrong expert match."""
        # A 3D shared_experts parameter cannot tell the cases apart: a wrong
        # expert match gives ndim-1=2 → True, the same answer as no match.
        # The 2D case can: an expert match gives ndim-1=1 → False, so the
        # result must remain True.
        fqn = "model.layers.0.moe.shared_experts.gate_proj.weight"
        param = self.FakeParam(2)
        assert default_is_muon(fqn, param, expert_keys=["experts"])

    def test_multi_component_key_matches(self):
        """A dotted key matches when its components appear consecutively."""
        fqn = "model.layers.0.moe.experts.w1.weight"
        keys = ["experts.w1"]
        assert is_expert_param(fqn, expert_keys=keys)

    def test_multi_component_key_no_false_positive(self):
        """The key ``experts.w2`` must not match a name containing ``experts.w1``."""
        fqn = "model.layers.0.moe.experts.w1.weight"
        keys = ["experts.w2"]
        assert not is_expert_param(fqn, expert_keys=keys)

    def test_multi_component_key_shared_experts(self):
        """The key ``experts.w1`` must not match ``shared_experts.w1``."""
        fqn = "model.layers.0.moe.shared_experts.w1.weight"
        keys = ["experts.w1"]
        assert not is_expert_param(fqn, expert_keys=keys)