import pdb
import time

import esm
import numpy as np
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
import xgboost as xgb
from transformers import AutoConfig, AutoModel, AutoTokenizer, EsmModel

from flow_matching.loss import MixturePathGeneralizedKL
from flow_matching.path import MixtureDiscreteProbPath
from flow_matching.path.scheduler import PolynomialConvexScheduler
from flow_matching.solver import MixtureDiscreteEulerSolver
from flow_matching.utils import ModelWrapper

from models.peptide_models import CNNModel
# Kept last so any names it re-exports still take precedence, as before.
from modules.bindevaluator_modules import *
|
|
def parse_motifs(motif: str) -> torch.Tensor:
    """Expand a comma-separated motif specification into a tensor of indices.

    Each comma-separated token is either a single integer (``"7"``) or an
    inclusive range written ``start-end`` (``"3-6"``). Whitespace around a
    token is ignored and empty tokens (e.g. from a trailing comma) are skipped.

    Args:
        motif: Specification string such as ``"1-3, 8, 10-12"``.

    Returns:
        A 1-D ``torch.long`` tensor holding the indices in the order given.

    Raises:
        ValueError: If a token is neither an integer nor a ``start-end`` pair.
    """
    indices = []
    for token in motif.split(','):
        token = token.strip()
        if not token:
            # Tolerate "1,2," or an empty string instead of failing on int('')
            continue
        if '-' in token:
            start, end = map(int, token.split('-'))
            indices.extend(range(start, end + 1))
        else:
            indices.append(int(token))

    print(f'Target Motifs: {indices}')
    # An explicit integer dtype keeps the result usable as an index tensor
    # even when no indices were parsed (torch.tensor([]) would be float32).
    return torch.tensor(indices, dtype=torch.long)
|
|
class BindEvaluator(pl.LightningModule):
    """Scores target positions for binding given binder and target token ids.

    Both sequences are embedded with a frozen ESM-2 encoder, passed through
    ``RepeatedModule3``, a final attention layer and feed-forward block, and
    projected to one logit per position of the protein-side encoding.
    """

    def __init__(self, n_layers, d_model, d_hidden, n_head,
                 d_k, d_v, d_inner, dropout=0.2,
                 learning_rate=0.00001, max_epochs=15, kl_weight=1):
        super().__init__()

        # Frozen ESM-2 encoder: eval mode and no gradients for its weights.
        self.esm_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D")
        self.esm_model.eval()
        for param in self.esm_model.parameters():
            param.requires_grad = False

        self.repeated_module = RepeatedModule3(n_layers, d_model, d_hidden,
                                               n_head, d_k, d_v, d_inner, dropout=dropout)

        self.final_attention_layer = MultiHeadAttentionSequence(n_head, d_model,
                                                                d_k, d_v, dropout=dropout)

        self.final_ffn = FFN(d_model, d_inner, dropout=dropout)

        self.output_projection_prot = nn.Linear(d_model, 1)

        self.learning_rate = learning_rate
        self.max_epochs = max_epochs
        self.kl_weight = kl_weight

        self.classification_threshold = nn.Parameter(torch.tensor(0.5))
        self.historical_memory = 0.9
        # NOTE: plain tensor attribute (not a registered buffer), so it is not
        # part of the state dict and does not follow the module across devices.
        self.class_weights = torch.tensor([3.000471363174231, 0.5999811490272925])

    def forward(self, binder_tokens, target_tokens):
        """Return one logit per protein-side position, shape ``(..., 1)``.

        Args:
            binder_tokens: Keyword arguments for the ESM encoder (binder side).
            target_tokens: Keyword arguments for the ESM encoder (target side).
        """
        peptide_sequence = self.esm_model(**binder_tokens).last_hidden_state
        protein_sequence = self.esm_model(**target_tokens).last_hidden_state

        # Only the two encodings are used; the attention maps that the
        # repeated module also returns are discarded.
        prot_enc, sequence_enc, *_ = self.repeated_module(peptide_sequence,
                                                          protein_sequence)

        prot_enc, _ = self.final_attention_layer(prot_enc, sequence_enc, sequence_enc)
        prot_enc = self.final_ffn(prot_enc)
        return self.output_projection_prot(prot_enc)

    @staticmethod
    def _inner_attention_mask(token_ids):
        """All-ones mask with the first and last position of each row zeroed."""
        mask = torch.ones_like(token_ids)
        mask[:, 0] = 0
        mask[:, -1] = 0
        return mask

    @staticmethod
    def _motif_mean(probs, motifs):
        """Mean probability over the motif positions of every row."""
        return probs[:, motifs].sum(dim=-1) / len(motifs)

    @staticmethod
    def _non_motif_probs(probs, motifs):
        """Columns of ``probs`` whose index does not appear in ``motifs``."""
        positions = torch.arange(probs.shape[1], device=probs.device)
        motif_idx = torch.as_tensor(motifs, dtype=torch.long, device=probs.device).reshape(-1)
        # Vectorised membership test instead of a Python-level scan per column.
        return probs[:, ~torch.isin(positions, motif_idx)]

    def get_probs(self, x_t, target_sequence):
        '''
        Inputs:
            - xt: Shape (bsz, seq_len)
            - target_sequence: Shape (1, tgt_len)

        Returns sigmoid probabilities of the squeezed logits; the first and
        last position of every row are forced to logit -100 (probability ~0).
        '''
        target_sequence = target_sequence.repeat(x_t.shape[0], 1)

        binder_tokens = {'input_ids': x_t,
                         'attention_mask': self._inner_attention_mask(x_t)}
        target_tokens = {'input_ids': target_sequence,
                         'attention_mask': self._inner_attention_mask(target_sequence)}

        logits = self.forward(binder_tokens, target_tokens).squeeze(-1)

        logits[:, 0] = logits[:, -1] = -100
        return torch.sigmoid(logits)

    def motif_score(self, x_t, target_sequence, motifs):
        """Mean predicted probability over the motif positions, per row."""
        probs = self.get_probs(x_t, target_sequence)
        return self._motif_mean(probs, motifs)

    def non_motif_score(self, x_t, target_sequence, motifs):
        """Mean probability of non-motif positions predicted at >= 0.5.

        Rows with no such position score 0.
        """
        probs = self.get_probs(x_t, target_sequence)
        non_motif_probs = self._non_motif_probs(probs, motifs)
        mask = non_motif_probs >= 0.5
        count = mask.sum(dim=-1)

        # With count == 0 the numerator is 0 as well, so clamping the divisor
        # yields 0 without a 0/0 division or a mixed-dtype torch.where.
        return (non_motif_probs * mask).sum(dim=-1) / count.clamp(min=1)

    def scoring(self, x_t, target_sequence, motifs, penalty=False):
        """Motif score, optionally paired with an off-motif penalty term.

        Returns:
            ``motif_score`` when ``penalty`` is false; otherwise the tuple
            ``(motif_score, 1 - fraction)`` where ``fraction`` is the number of
            non-motif positions with probability >= 0.5 divided by
            ``target_sequence.shape[1]``.
        """
        probs = self.get_probs(x_t, target_sequence)
        motif_score = self._motif_mean(probs, motifs)

        if not penalty:
            return motif_score

        count = (self._non_motif_probs(probs, motifs) >= 0.5).sum(dim=-1)
        non_motif_score = count / target_sequence.shape[1]
        return motif_score, 1 - non_motif_score
|
|
class MotifModel(nn.Module):
    """Binds a target sequence and motif set to a bind evaluator's ``scoring``."""

    def __init__(self, bindevaluator, target_sequence, motifs, penalty=False):
        super().__init__()
        self.bindevaluator = bindevaluator
        self.target_sequence = target_sequence
        self.motifs = motifs
        self.penalty = penalty

    def forward(self, x):
        """Score ``x`` against the stored target, motifs and penalty flag."""
        fixed_args = (self.target_sequence, self.motifs, self.penalty)
        return self.bindevaluator.scoring(x, *fixed_args)
|
|
class HemolysisModel:
    """Scores token-id batches with an XGBoost booster on pooled ESM-2 features.

    The returned score is ``1 - p`` where ``p`` is the booster's prediction
    (presumably the hemolysis probability, so higher is better).
    """

    def __init__(self, device):
        self.predictor = xgb.Booster(model_file='./classifier_ckpt/wt_hemolysis.json')

        self.model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
        self.model.eval()

        self.device = device

    def get_scores(self, input_seqs):
        """Return ``1 - booster_prediction`` for every row of ``input_seqs``.

        Args:
            input_seqs: Integer token-id tensor of shape ``(batch, seq_len)``.

        Returns:
            A float64 tensor of shape ``(batch,)`` on ``self.device``.
        """
        if len(input_seqs) == 0:
            # Same tensor type as the non-empty path, and ESM is never run.
            return torch.ones(0, dtype=torch.float64, device=self.device)

        scores = np.ones(len(input_seqs))
        with torch.no_grad():
            embeddings = self.model(
                input_ids=input_seqs,
                attention_mask=torch.ones_like(input_seqs),
            ).last_hidden_state
            # Token ids 0, 1 and 2 (presumably the tokenizer's special tokens)
            # are excluded from the mean pooling.
            keep = (input_seqs != 0) & (input_seqs != 1) & (input_seqs != 2)
            embeddings[~keep] = 0
            # Clamp so a row without any kept token pools to 0 rather than NaN.
            kept_per_row = keep.sum(dim=1).clamp(min=1).unsqueeze(-1)
            features = embeddings.sum(dim=1) / kept_per_row
        features = features.cpu().numpy()

        features = np.nan_to_num(features, nan=0.)
        features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)

        probs = self.predictor.predict(xgb.DMatrix(features))

        return torch.from_numpy(scores - probs).to(self.device)

    def __call__(self, input_seqs: torch.Tensor):
        """Alias for :meth:`get_scores`."""
        return self.get_scores(input_seqs)
|
|
| |
| |
class MaskedMeanPool(nn.Module):
    """Averages ``X`` over dim 1, counting only positions where ``M`` is set."""

    def forward(self, X, M):
        """Return the masked mean of ``X`` along dim 1.

        The divisor is clamped to at least 1, so a row whose mask is entirely
        zero yields zeros instead of NaN.
        """
        weights = M.unsqueeze(-1).float()
        weighted_sum = (X * weights).sum(dim=1)
        valid_count = weights.sum(dim=1).clamp(min=1.0)
        return weighted_sum / valid_count
|
|
class MLPClassifier(nn.Module):
    """Masked mean pooling followed by a two-layer MLP producing one logit."""

    def __init__(self, in_dim, hidden=512, dropout=0.1):
        super().__init__()
        self.pool = MaskedMeanPool()
        # Layer positions (0: Linear, 3: Linear) determine the state-dict keys.
        stack = [
            nn.Linear(in_dim, hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, 1),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, X, M):
        """Pool ``X`` under mask ``M`` and return logits with the last dim squeezed."""
        pooled = self.pool(X, M)
        logits = self.net(pooled)
        return logits.squeeze(-1)
| |
|
|
class NonfoulingModel:
    """Scores token-id batches with an MLP classifier on ESM-2 token features.

    ``__call__`` returns the sigmoid of the classifier logits.
    """

    def __init__(self, device):
        # NOTE: weights_only=False unpickles arbitrary objects; only load
        # checkpoints from a trusted source.
        ckpt = torch.load('./classifier_ckpt/wt_nonfouling.pt', weights_only=False, map_location=device)
        best_params = ckpt["best_params"]
        self.predictor = MLPClassifier(in_dim=1280, hidden=int(best_params["hidden"]), dropout=float(best_params.get("dropout", 0.1)))
        self.predictor.load_state_dict(ckpt["state_dict"])
        self.predictor = self.predictor.to(device)
        self.predictor.eval()

        self.model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
        self.model.eval()

        self.device = device

    def get_scores(self, input_ids, attention_mask):
        """Return classifier logits for every row of ``input_ids``.

        ESM is run with ``attention_mask`` as given; the classifier then pools
        only over positions whose token id is not 0, 1 or 2 (presumably the
        tokenizer's special tokens). The whole computation runs without
        gradient tracking, and the caller's ``attention_mask`` is left
        untouched.
        """
        with torch.no_grad():
            features = self.model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

            keep = (input_ids != 0) & (input_ids != 1) & (input_ids != 2)
            # Work on a copy so the argument is not mutated in place.
            pooling_mask = attention_mask.clone()
            pooling_mask[~keep] = 0
            scores = self.predictor(features, pooling_mask)
        return scores

    def __call__(self, input_ids):
        """Return the sigmoid of the logits, using an all-ones ESM attention mask."""
        attention_mask = torch.ones_like(input_ids).to(self.device)
        scores = self.get_scores(input_ids, attention_mask)
        # Numerically stable equivalent of 1 / (1 + exp(-scores)).
        return torch.sigmoid(scores)
|
|
class SolubilityModel:
    """Scores sequences by the fraction of tokens outside a fixed id set.

    The ids in ``hydro_ids`` are presumably the token ids of hydrophobic
    residues; the score is one minus their fraction per sequence.
    """

    def __init__(self, device):
        self.device = device
        self.hydro_ids = torch.tensor([5, 7, 4, 12, 20, 18, 22, 14], device=device)

    def get_scores(self, x):
        """Return ``1 - fraction`` of interior tokens that are in ``hydro_ids``.

        The first and last column of ``x`` are dropped before counting.
        """
        interior = x[:, 1:-1]
        # Compare each token against every listed id, then collapse the id axis.
        is_listed = torch.eq(interior[:, :, None], self.hydro_ids).any(dim=2)
        listed_fraction = is_listed.float().mean(dim=1)
        return 1 - listed_fraction

    def __call__(self, input_seqs: list):
        """Alias for :meth:`get_scores`."""
        return self.get_scores(input_seqs)
|
|
class PeptideCNN(nn.Module):
    """Two-layer 1-D CNN head on top of ESM-2 token embeddings."""

    def __init__(self, input_dim, hidden_dims, output_dim, dropout_rate):
        super().__init__()
        first_width, second_width = hidden_dims[0], hidden_dims[1]
        self.conv1 = nn.Conv1d(input_dim, first_width, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(first_width, second_width, kernel_size=5, padding=1)
        self.fc = nn.Linear(second_width, output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.predictor = nn.Linear(output_dim, 1)

        self.esm_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D")
        self.esm_model.eval()

    def forward(self, input_ids, attention_mask=None, return_features=False):
        """Return the prediction, or the pre-predictor features if requested.

        The ESM embedding is computed without gradient tracking; the
        convolutions run channels-first and the result is averaged over the
        sequence axis (all positions, the attention mask is not applied here).
        """
        with torch.no_grad():
            embedded = self.esm_model(input_ids, attention_mask).last_hidden_state

        # (batch, seq, channels) -> (batch, channels, seq) for Conv1d
        channels_first = embedded.permute(0, 2, 1)
        channels_first = self.dropout(torch.relu(self.conv1(channels_first)))
        channels_first = self.dropout(torch.relu(self.conv2(channels_first)))

        pooled = channels_first.permute(0, 2, 1).mean(dim=1)
        features = self.fc(pooled)
        return features if return_features else self.predictor(features)
|
|
|
|
| |
| |
| |
class TransformerRegressor(nn.Module):
    """Linear projection, Transformer encoder, masked mean pool, scalar head."""

    def __init__(self, in_dim, d_model=256, nhead=8, layers=2, ff=512, dropout=0.1):
        super().__init__()
        self.proj = nn.Linear(in_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=ff,
            dropout=dropout,
            batch_first=True,
            activation="gelu",
        )
        self.enc = nn.TransformerEncoder(encoder_layer, num_layers=layers)
        self.head = nn.Linear(d_model, 1)

    def forward(self, X, M):
        """Return one scalar per row of ``X``.

        Args:
            X: Feature tensor; its last dim must equal ``in_dim``.
            M: Boolean mask, ``True`` at valid positions. Its inverse is used
                as the encoder's key-padding mask and it weights the pooling.
        """
        encoded = self.enc(self.proj(X), src_key_padding_mask=~M)

        weights = M.unsqueeze(-1).float()
        weighted_sum = (encoded * weights).sum(dim=1)
        # Clamp so a fully masked row does not divide by zero.
        valid_count = weights.sum(dim=1).clamp(min=1.0)
        return self.head(weighted_sum / valid_count).squeeze(-1)
|
|
|
|
def build_model(model_name: str, in_dim: int, params: dict) -> nn.Module:
    """Build the regressor used at inference time.

    Only ``model_name == "transformer"`` is supported. The architecture
    hyperparameters are fixed in this function; ``params`` is accepted but
    not consulted.

    Raises:
        ValueError: If ``model_name`` is anything other than ``"transformer"``.
    """
    if model_name == "transformer":
        fixed_hparams = {
            "d_model": 384,
            "nhead": 4,
            "layers": 1,
            "ff": 512,
            "dropout": 0.1521676463658988,
        }
        return TransformerRegressor(in_dim=in_dim, **fixed_hparams)

    raise ValueError(f"This inference file currently supports model_name='transformer', got: {model_name}")
|
|
| def _clean_state_dict(state_dict: dict) -> dict: |
| cleaned = {} |
| for k, v in state_dict.items(): |
| if k.startswith("module."): |
| k = k[len("module.") :] |
| if k.startswith("model."): |
| k = k[len("model.") :] |
| cleaned[k] = v |
| return cleaned |
|
|
class HalfLifeModel:
    """
    Loads:
      - ESM2 encoder to generate *unpooled* token embeddings (per residue)
      - Your fine-tuned TransformerRegressor from final_model.pt

    By default, __call__ returns "hours":
      - if ckpt['target_col'] == 'log_label' -> expm1(pred)
      - else -> raw pred
    """

    def __init__(
        self,
        device,
        ckpt_path = "./classifier_ckpt/wt_halflife.pt",
    ):
        """Load the regressor checkpoint and the ESM-2 embedding model.

        Raises:
            ValueError: If the checkpoint is not a dict with ``state_dict``
                and ``in_dim`` entries, or if ``in_dim`` does not match the
                ESM hidden size.
        """
        self.device = device

        # NOTE: weights_only=False unpickles arbitrary objects; only load
        # checkpoints from a trusted source.
        ckpt = torch.load(ckpt_path, map_location=device, weights_only=False)
        if not isinstance(ckpt, dict) or "state_dict" not in ckpt:
            raise ValueError(f"Checkpoint at {ckpt_path} is not the expected dict with a 'state_dict' key.")

        in_dim = ckpt.get("in_dim")
        if in_dim is None:
            # Fail with a clear message instead of int(None) -> TypeError.
            raise ValueError(f"Checkpoint at {ckpt_path} is missing the required 'in_dim' entry.")

        self.best_params = ckpt.get("best_params", {})
        self.in_dim = int(in_dim)
        self.target_col = ckpt.get("target_col", "label")

        self.regressor = build_model(model_name="transformer", in_dim=self.in_dim, params=self.best_params)
        self.regressor.load_state_dict(_clean_state_dict(ckpt["state_dict"]), strict=True)
        self.regressor.to(self.device)
        self.regressor.eval()

        self.emb_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(self.device)
        self.emb_model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")

        esm_hidden = int(self.emb_model.config.hidden_size)
        if esm_hidden != self.in_dim:
            raise ValueError(
                f"Mismatch: ESM hidden_size={esm_hidden}, but checkpoint in_dim={self.in_dim}.\n"
                f"Did you train on a different embedding model/dimension than facebook/esm2_t33_650M_UR50D?"
            )

    @torch.no_grad()
    def _embed_unpooled_batch(self, sequences):
        """Embed a token-id batch and drop the first and last position.

        Args:
            sequences: Integer token-id tensor of shape ``(batch, seq_len)``.

        Returns:
            ``(X, M)`` where ``X`` is float32 with shape
            ``(batch, seq_len - 2, hidden)`` and ``M`` is an all-``True``
            boolean mask of shape ``(batch, seq_len - 2)``. No attention mask
            is passed to ESM, so padding (if any) is not accounted for.
        """
        hs = self.emb_model(input_ids=sequences).last_hidden_state

        # Every row of a tensor batch has the same length, so one slice
        # replaces the per-row copy into a padded buffer.
        X = hs[:, 1:-1, :].to(torch.float32).contiguous()
        M = torch.ones(X.shape[:2], dtype=torch.bool, device=X.device)
        return X, M

    @torch.no_grad()
    def predict_raw(self, input_seqs):
        """
        Returns the regressor output in the same space as training target_col:
          - if trained on log_label -> returns log1p(hours)
          - if trained on label     -> returns hours (or whatever label scale was)
        """
        if len(input_seqs) == 0:
            return np.array([], dtype=np.float32)

        X, M = self._embed_unpooled_batch(input_seqs)
        return self.regressor(X, M).detach().cpu().numpy().astype(np.float32)

    def predict_hours(self, input_seqs) -> np.ndarray:
        """
        If your model was trained on log_label, convert back to hours via expm1.
        Otherwise returns raw predictions.
        """
        raw = self.predict_raw(input_seqs)
        if self.target_col == "log_label":
            return np.expm1(raw).astype(np.float32)
        return raw.astype(np.float32)

    def __call__(self, input_seqs) -> torch.Tensor:
        """Return :meth:`predict_hours` as a tensor on ``self.device``."""
        return torch.from_numpy(self.predict_hours(input_seqs)).to(self.device)
|
|
|
|
def load_bindevaluator(checkpoint_path, device):
    """Load a ``BindEvaluator`` checkpoint for inference.

    The model is built with the fixed architecture arguments below, moved to
    ``device``, put in eval mode, and has gradients disabled on all parameters.
    """
    architecture = dict(n_layers=8, d_model=128, d_hidden=128, n_head=8,
                        d_k=64, d_v=128, d_inner=64)
    model = BindEvaluator.load_from_checkpoint(checkpoint_path, **architecture)
    model = model.to(device)
    model.eval()
    # Freeze every parameter in one call.
    model.requires_grad_(False)
    return model
|
|
|
|
def load_solver(checkpoint_path, vocab_size, device):
    """Build a discrete Euler solver around a frozen ``CNNModel`` denoiser.

    Args:
        checkpoint_path: Path to the denoiser ``state_dict`` checkpoint.
        vocab_size: Alphabet / vocabulary size for the denoiser and solver.
        device: Device the denoiser is created on and the checkpoint is
            mapped to. It is honoured as passed (it was previously overridden
            with a hard-coded ``'cuda:0'``).

    Returns:
        A ``MixtureDiscreteEulerSolver`` whose model returns the softmax of
        the denoiser output over the last dimension.
    """
    embed_dim = 512
    hidden_dim = 256

    probability_denoiser = CNNModel(alphabet_size=vocab_size, embed_dim=embed_dim, hidden_dim=hidden_dim).to(device)
    # NOTE: weights_only=False unpickles arbitrary objects; only load
    # checkpoints from a trusted source.
    probability_denoiser.load_state_dict(torch.load(checkpoint_path, map_location=device, weights_only=False))
    probability_denoiser.eval()
    for param in probability_denoiser.parameters():
        param.requires_grad = False

    scheduler = PolynomialConvexScheduler(n=2.0)
    path = MixtureDiscreteProbPath(scheduler=scheduler)

    class WrappedModel(ModelWrapper):
        """Turns the denoiser output into probabilities for the solver."""

        def forward(self, x: torch.Tensor, t: torch.Tensor, **extras):
            return torch.softmax(self.model(x, t), dim=-1)

    wrapped_probability_denoiser = WrappedModel(probability_denoiser)
    solver = MixtureDiscreteEulerSolver(model=wrapped_probability_denoiser, path=path, vocabulary_size=vocab_size)

    return solver
|
|
|
|
class CrossAttnUnpooled(nn.Module):
    """
    Token sequences with masks; alternating cross attention.

    Two inputs (``T`` and ``B``) are projected to a shared width, refined by
    layers in which ``T`` attends to ``B`` and then ``B`` attends to the
    updated ``T``, mean-pooled under their masks, concatenated, and fed to a
    regression head and a 3-way classification head.
    """

    def __init__(self, Ht=1280, Hb=1280, hidden=768, n_heads=8, n_layers=1, dropout=0.16430662769055482):
        super().__init__()
        self.t_proj = nn.Sequential(nn.Linear(Ht, hidden), nn.LayerNorm(hidden))
        self.b_proj = nn.Sequential(nn.Linear(Hb, hidden), nn.LayerNorm(hidden))

        def cross_attention():
            return nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True)

        def feed_forward():
            return nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden))

        def build_layer():
            # Key names and their order define the checkpoint layout.
            return nn.ModuleDict({
                "attn_tb": cross_attention(),
                "attn_bt": cross_attention(),
                "n1t": nn.LayerNorm(hidden),
                "n2t": nn.LayerNorm(hidden),
                "n1b": nn.LayerNorm(hidden),
                "n2b": nn.LayerNorm(hidden),
                "fft": feed_forward(),
                "ffb": feed_forward(),
            })

        self.layers = nn.ModuleList([build_layer() for _ in range(n_layers)])

        self.shared = nn.Sequential(nn.Linear(2*hidden, hidden), nn.GELU(), nn.Dropout(dropout))
        self.reg = nn.Linear(hidden, 1)
        self.cls = nn.Linear(hidden, 3)

    def masked_mean(self, X, M):
        """Mean of ``X`` over dim 1 restricted to positions where ``M`` is set."""
        weights = M.unsqueeze(-1).float()
        valid_count = weights.sum(dim=1).clamp(min=1.0)
        return (X * weights).sum(dim=1) / valid_count

    def forward(self, T, Mt, B, Mb):
        """Return ``(regression, class_logits)`` for the pair of inputs.

        Args:
            T, B: Feature tensors whose last dims are ``Ht`` and ``Hb``.
            Mt, Mb: Boolean masks, ``True`` at valid positions; their inverses
                serve as key-padding masks.
        """
        t_state = self.t_proj(T)
        b_state = self.b_proj(B)

        t_padding = ~Mt
        b_padding = ~Mb

        for layer in self.layers:
            # T queries B
            t_update, _ = layer["attn_tb"](t_state, b_state, b_state, key_padding_mask=b_padding)
            t_state = layer["n1t"](t_state + t_update)
            t_state = layer["n2t"](t_state + layer["fft"](t_state))

            # B queries the already-updated T
            b_update, _ = layer["attn_bt"](b_state, t_state, t_state, key_padding_mask=t_padding)
            b_state = layer["n1b"](b_state + b_update)
            b_state = layer["n2b"](b_state + layer["ffb"](b_state))

        pooled = torch.cat(
            [self.masked_mean(t_state, Mt), self.masked_mean(b_state, Mb)],
            dim=-1,
        )
        joint = self.shared(pooled)
        return self.reg(joint).squeeze(-1), self.cls(joint)
|
|
def load_affinity_predictor(device):
    """Load the trained affinity model from its checkpoint.

    Builds ``CrossAttnUnpooled`` with default arguments, loads the
    ``'state_dict'`` entry of ``./classifier_ckpt/wt_affinity.pt``, switches to
    eval mode and returns the model on ``device``.
    """
    weights = torch.load('./classifier_ckpt/wt_affinity.pt', map_location=device, weights_only=False)['state_dict']

    predictor = CrossAttnUnpooled()
    predictor.load_state_dict(weights)
    predictor.eval()
    return predictor.to(device)
|
|
class AffinityModel(nn.Module):
    """Scores a batch of binder token ids against one fixed target.

    Target and binders are embedded with ESM-2 (first and last position
    dropped) and passed to ``affinity_predictor``; the regression output is
    returned divided by 10.
    """

    def __init__(self, affinity_predictor, target_sequence, device):
        """
        Args:
            affinity_predictor: Callable taking ``(T, Mt, B, Mb)`` and
                returning ``(affinity, extra)``.
            target_sequence: Mapping with ``'attention_mask'`` that is also
                unpacked as keyword arguments to the ESM model.
            device: Device for the ESM model and the binder mask.
        """
        super().__init__()
        self.affinity_predictor = affinity_predictor
        self.target_sequence = target_sequence
        self.esm_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
        self.esm_model.eval()
        self.device = device

    def forward(self, x):
        """Return ``affinity / 10`` for every row of binder token ids ``x``."""
        batch = x.shape[0]

        with torch.no_grad():
            # The single target is embedded once and tiled across the batch.
            target_emb = self.esm_model(**self.target_sequence).last_hidden_state[:, 1:-1, :]
            # One batched pass over the binders; this used to be repeated
            # `batch` times inside a loop with identical inputs.
            B = self.esm_model(input_ids=x, attention_mask=torch.ones_like(x)).last_hidden_state[:, 1:-1]

        T = target_emb.repeat(batch, 1, 1)
        Mt = self.target_sequence['attention_mask'][:, 1:-1].repeat(batch, 1)
        Mb = torch.ones(batch, x.shape[1] - 2, dtype=torch.bool, device=self.device)

        affinity, _ = self.affinity_predictor(T, Mt.bool(), B, Mb)
        return affinity / 10
| |