moPPIt / models /peptide_classifiers.py

Update models/peptide_classifiers.py

a913f53 verified 16 days ago

22.3 kB

	import pdb
	import torch
	import torch.nn.functional as F
	import torch.nn as nn
	import pytorch_lightning as pl
	import time
	from transformers import AutoModel, AutoConfig, AutoTokenizer
	import xgboost as xgb
	import esm

	from flow_matching.path import MixtureDiscreteProbPath
	from flow_matching.path.scheduler import PolynomialConvexScheduler
	from flow_matching.solver import MixtureDiscreteEulerSolver
	from flow_matching.utils import ModelWrapper
	from flow_matching.loss import MixturePathGeneralizedKL

	from models.peptide_models import CNNModel
	from modules.bindevaluator_modules import *

	def parse_motifs(motif: str) -> torch.Tensor:
	    """Expand a comma-separated motif specification into a tensor of positions.

	    Each comma-separated item is either a single integer (``"7"``) or an
	    inclusive range written ``"start-end"`` (``"3-5"`` -> 3, 4, 5).
	    Whitespace around items is ignored, and empty items (for example the one
	    left by a trailing comma) are skipped instead of raising.

	    Positions are returned exactly as written; no index shift is applied.

	    Args:
	        motif: Specification string such as ``"1-3, 7, 10-12"``.

	    Returns:
	        1-D ``torch.long`` tensor holding the positions in the order given.

	    Raises:
	        ValueError: If an item is neither an integer nor a ``start-end`` pair.
	    """
	    positions = []
	    for item in motif.split(','):
	        item = item.strip()
	        if not item:
	            # Tolerate "1,2," and an empty specification.
	            continue
	        if '-' in item:
	            start, end = map(int, item.split('-'))
	            positions.extend(range(start, end + 1))
	        else:
	            positions.append(int(item))

	    print(f'Target Motifs: {positions}')
	    # Explicit dtype: an empty list would otherwise become a float tensor,
	    # which cannot be used as an index.
	    return torch.tensor(positions, dtype=torch.long)

	class BindEvaluator(pl.LightningModule):
	    """Predict one binding logit per target position for a binder/target pair.

	    Both token sequences are embedded with a frozen ESM-2 encoder
	    (``facebook/esm2_t33_650M_UR50D``). The embeddings go through
	    ``RepeatedModule3``, a final ``MultiHeadAttentionSequence`` layer, an
	    ``FFN`` and a linear projection to a single logit per position.

	    The scoring helpers (``motif_score``, ``non_motif_score``, ``scoring``)
	    turn those logits into probabilities and summarise them over a set of
	    motif positions.
	    """

	    def __init__(self, n_layers, d_model, d_hidden, n_head,
	                 d_k, d_v, d_inner, dropout=0.2,
	                 learning_rate=0.00001, max_epochs=15, kl_weight=1):
	        """Build the network.

	        The architecture arguments (``n_layers`` ... ``dropout``) are passed
	        straight to ``RepeatedModule3``, ``MultiHeadAttentionSequence`` and
	        ``FFN``; ``learning_rate``, ``max_epochs`` and ``kl_weight`` are only
	        stored on the instance here.
	        """
	        super(BindEvaluator, self).__init__()

	        self.esm_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D")
	        self.esm_model.eval()
	        # freeze all the esm_model parameters
	        for param in self.esm_model.parameters():
	            param.requires_grad = False

	        self.repeated_module = RepeatedModule3(n_layers, d_model, d_hidden,
	                                               n_head, d_k, d_v, d_inner, dropout=dropout)

	        self.final_attention_layer = MultiHeadAttentionSequence(n_head, d_model,
	                                                                d_k, d_v, dropout=dropout)

	        self.final_ffn = FFN(d_model, d_inner, dropout=dropout)

	        self.output_projection_prot = nn.Linear(d_model, 1)

	        self.learning_rate = learning_rate
	        self.max_epochs = max_epochs
	        self.kl_weight = kl_weight

	        self.classification_threshold = nn.Parameter(torch.tensor(0.5))  # Initial threshold
	        self.historical_memory = 0.9
	        # binding-site weight, non-binding-site weight
	        # NOTE(review): plain tensor, so it does not follow the module across devices.
	        self.class_weights = torch.tensor([3.000471363174231, 0.5999811490272925])

	    def forward(self, binder_tokens, target_tokens):
	        """Return raw logits for one binder/target batch.

	        Args:
	            binder_tokens: Keyword arguments for the ESM encoder (here
	                ``input_ids`` and ``attention_mask``) describing the binder.
	            target_tokens: Same, for the target.

	        Returns:
	            Tensor with a trailing dimension of size 1 (output of the final
	            ``nn.Linear(d_model, 1)``).
	        """
	        peptide_sequence = self.esm_model(**binder_tokens).last_hidden_state
	        protein_sequence = self.esm_model(**target_tokens).last_hidden_state

	        # The module returns six values; only the two encodings are used, the
	        # four attention outputs are discarded.
	        prot_enc, sequence_enc, _, _, _, _ = self.repeated_module(peptide_sequence,
	                                                                  protein_sequence)

	        prot_enc, _ = self.final_attention_layer(prot_enc, sequence_enc, sequence_enc)

	        prot_enc = self.final_ffn(prot_enc)

	        prot_enc = self.output_projection_prot(prot_enc)

	        return prot_enc

	    def get_probs(self, x_t, target_sequence):
	        '''
	        Per-position binding probabilities for a batch of binders.

	        Inputs:
	            - x_t: Shape (bsz, seq_len)
	            - target_sequence: Shape (1, tgt_len)

	        Returns:
	            Probabilities of shape (bsz, tgt_len). The first and last columns
	            are forced to ~0 by setting their logits to -100.
	        '''
	        target_sequence = target_sequence.repeat(x_t.shape[0], 1)
	        binder_attention_mask = torch.ones_like(x_t)
	        target_attention_mask = torch.ones_like(target_sequence)

	        # Mask the first and last token of both sequences
	        # (presumably the BOS/EOS special tokens -- confirm against the tokenizer).
	        binder_attention_mask[:, 0] = binder_attention_mask[:, -1] = 0
	        target_attention_mask[:, 0] = target_attention_mask[:, -1] = 0

	        # ones_like already places the masks on the same device as the ids.
	        binder_tokens = {'input_ids': x_t, 'attention_mask': binder_attention_mask}
	        target_tokens = {'input_ids': target_sequence, 'attention_mask': target_attention_mask}

	        logits = self.forward(binder_tokens, target_tokens).squeeze(-1)
	        logits[:, 0] = logits[:, -1] = -100  # large negative logit instead of -inf
	        probs = torch.sigmoid(logits)

	        return probs  # shape (bsz, tgt_len)

	    def _non_motif_probs(self, probs, motifs):
	        """Return the columns of ``probs`` whose index does not occur in ``motifs``."""
	        positions = torch.arange(probs.shape[1], device=probs.device)
	        motif_index = torch.as_tensor(motifs, dtype=torch.long, device=probs.device)
	        # One vectorised membership test instead of scanning ``motifs`` once
	        # per column from Python.
	        return probs[:, ~torch.isin(positions, motif_index)]

	    def motif_score(self, x_t, target_sequence, motifs):
	        """Mean binding probability over the motif positions, one value per binder."""
	        probs = self.get_probs(x_t, target_sequence)
	        motif_probs = probs[:, motifs]
	        return motif_probs.sum(dim=-1) / len(motifs)

	    def non_motif_score(self, x_t, target_sequence, motifs):
	        """Mean probability of the non-motif positions predicted at >= 0.5.

	        Returns 0 for binders where no non-motif position reaches 0.5.
	        """
	        probs = self.get_probs(x_t, target_sequence)
	        non_motif_probs = self._non_motif_probs(probs, motifs)
	        mask = non_motif_probs >= 0.5
	        count = mask.sum(dim=-1)

	        mean_confident = (non_motif_probs * mask).sum(dim=-1) / count
	        # Use a float zero tensor so both branches of torch.where share a dtype.
	        return torch.where(count > 0, mean_confident, torch.zeros_like(mean_confident))

	    def scoring(self, x_t, target_sequence, motifs, penalty=False):
	        """Score binders against the motif, optionally with an off-motif penalty.

	        Args:
	            x_t: Binder token ids, shape (bsz, seq_len).
	            target_sequence: Target token ids, shape (1, tgt_len).
	            motifs: Indices of the motif positions (columns of the probabilities).
	            penalty: When true, also return the off-motif term.

	        Returns:
	            ``motif_score`` alone, or the pair ``(motif_score, 1 - fraction)``
	            where ``fraction`` is the number of non-motif positions predicted
	            at >= 0.5 divided by ``target_sequence.shape[1]``.
	        """
	        probs = self.get_probs(x_t, target_sequence)
	        motif_probs = probs[:, motifs]
	        motif_score = motif_probs.sum(dim=-1) / len(motifs)

	        if not penalty:
	            return motif_score

	        non_motif_probs = self._non_motif_probs(probs, motifs)
	        count = (non_motif_probs >= 0.5).sum(dim=-1)
	        non_motif_score = count / target_sequence.shape[1]
	        return motif_score, 1 - non_motif_score

	class MotifModel(nn.Module):
	    """Adapter that scores binder tokens against one fixed target and motif set.

	    The bind evaluator, target sequence, motif positions and penalty flag are
	    stored at construction; every call is forwarded to
	    ``bindevaluator.scoring``.
	    """

	    def __init__(self, bindevaluator, target_sequence, motifs, penalty=False):
	        super().__init__()
	        self.bindevaluator = bindevaluator
	        self.target_sequence = target_sequence
	        self.motifs = motifs
	        self.penalty = penalty

	    def forward(self, x):
	        """Return ``bindevaluator.scoring(x, target_sequence, motifs, penalty)``."""
	        score_fn = self.bindevaluator.scoring
	        fixed_args = (self.target_sequence, self.motifs, self.penalty)
	        return score_fn(x, *fixed_args)

	class HemolysisModel:
	    """Score token sequences with an XGBoost hemolysis classifier on ESM-2 features.

	    Sequences are embedded with ``facebook/esm2_t33_650M_UR50D``, averaged over
	    the positions whose token id is not 0, 1 or 2, and passed to the booster
	    loaded from ``./classifier_ckpt/wt_hemolysis.json``. The score is
	    ``1 - predicted probability``.
	    """

	    def __init__(self, device):
	        self.predictor = xgb.Booster(model_file='./classifier_ckpt/wt_hemolysis.json')

	        self.model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
	        self.model.eval()

	        self.device = device

	    def get_scores(self, input_seqs):
	        """Return ``1 - booster probability`` for each row of ``input_seqs``.

	        Args:
	            input_seqs: Integer tensor of token ids, one row per sequence.

	        Returns:
	            float64 tensor on ``self.device`` with one score per row. An empty
	            batch gives an empty tensor without running the encoder.
	        """
	        if len(input_seqs) == 0:
	            # Same type and dtype as the regular path, and no encoder call.
	            return torch.ones(0, dtype=torch.float64, device=self.device)

	        scores = np.ones(len(input_seqs))
	        with torch.no_grad():
	            embeddings = self.model(
	                input_ids=input_seqs,
	                attention_mask=torch.ones_like(input_seqs).to(self.device),
	            ).last_hidden_state
	            # Ids 0, 1 and 2 are excluded from pooling
	            # (presumably the special tokens of the ESM-2 vocabulary -- confirm).
	            keep = (input_seqs != 0) & (input_seqs != 1) & (input_seqs != 2)
	            embeddings[~keep] = 0
	            features = embeddings.sum(dim=1) / keep.sum(dim=1).unsqueeze(-1)
	        features = features.cpu().numpy()

	        # A row with no kept position divides by zero above; replace the NaNs
	        # and clamp to the finite float32 range before handing over to XGBoost.
	        features = np.nan_to_num(features, nan=0.)
	        features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)

	        probs = self.predictor.predict(xgb.DMatrix(features))
	        # return the probability of it being not hemolytic
	        return torch.from_numpy(scores - probs).to(self.device)

	    def __call__(self, input_seqs: list):
	        """Alias for :meth:`get_scores`.

	        Despite the ``list`` annotation, ``input_seqs`` must be a tensor of
	        token ids (it is compared element-wise and passed to the encoder).
	        """
	        return self.get_scores(input_seqs)

	# ======================== MLP =========================================
	# Still need mean pooling along lengths
	class MaskedMeanPool(nn.Module):
	    """Average features along dim 1, counting only positions where the mask is set.

	    Rows whose mask sums to less than one are divided by 1, so an all-zero
	    mask gives a zero vector rather than NaN.
	    """

	    def forward(self, X, M):  # X: (B,L,H), M: (B,L)
	        weights = M.float().unsqueeze(-1)
	        n_valid = torch.clamp(weights.sum(dim=1), min=1.0)
	        weighted_sum = torch.sum(X * weights, dim=1)
	        return weighted_sum / n_valid  # (B,H)

	class MLPClassifier(nn.Module):
	    """Masked mean pooling followed by a Linear-GELU-Dropout-Linear head.

	    ``forward`` returns one logit per batch row.
	    """

	    def __init__(self, in_dim, hidden=512, dropout=0.1):
	        super().__init__()
	        self.pool = MaskedMeanPool()
	        # Layer order fixes the state-dict keys (net.0, net.3).
	        stages = [
	            nn.Linear(in_dim, hidden),
	            nn.GELU(),
	            nn.Dropout(dropout),
	            nn.Linear(hidden, 1),
	        ]
	        self.net = nn.Sequential(*stages)

	    def forward(self, X, M):
	        pooled = self.pool(X, M)
	        logits = self.net(pooled)
	        return logits.squeeze(-1)  # logits
	# ======================== MLP =========================================

	class NonfoulingModel:
	    """Score token sequences with an MLP classifier on ESM-2 token features.

	    The classifier weights and its ``hidden``/``dropout`` settings come from
	    ``./classifier_ckpt/wt_nonfouling.pt``; features come from
	    ``facebook/esm2_t33_650M_UR50D``. Calling the object returns the sigmoid
	    of the classifier logits.
	    """

	    def __init__(self, device):
	        # NOTE(review): weights_only=False unpickles arbitrary objects; only
	        # load checkpoints from a trusted source.
	        ckpt = torch.load('./classifier_ckpt/wt_nonfouling.pt', weights_only=False, map_location=device)
	        best_params = ckpt["best_params"]
	        self.predictor = MLPClassifier(in_dim=1280, hidden=int(best_params["hidden"]), dropout=float(best_params.get("dropout", 0.1)))
	        self.predictor.load_state_dict(ckpt["state_dict"])
	        self.predictor = self.predictor.to(device)
	        self.predictor.eval()

	        self.model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
	        self.model.eval()

	        self.device = device

	    def get_scores(self, input_ids, attention_mask):
	        """Return classifier logits, one per row of ``input_ids``.

	        The encoder sees ``attention_mask`` as given. The classifier gets a
	        copy of it with the positions holding token id 0, 1 or 2 set to zero
	        (presumably the special tokens -- confirm). The caller's mask is left
	        untouched.
	        """
	        with torch.no_grad():
	            features = self.model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

	        keep = (input_ids != 0) & (input_ids != 1) & (input_ids != 2)
	        # masked_fill returns a new tensor, so the argument is not modified.
	        pool_mask = attention_mask.masked_fill(~keep, 0)
	        return self.predictor(features, pool_mask)

	    def __call__(self, input_ids):
	        """Return the sigmoid of the logits, using an all-ones encoder mask."""
	        attention_mask = torch.ones_like(input_ids).to(self.device)
	        scores = self.get_scores(input_ids, attention_mask)
	        return torch.sigmoid(scores)

	class SolubilityModel:
	    """Score sequences by the share of interior tokens outside a fixed id set.

	    ``hydro_ids`` lists the token ids that count against the score
	    (presumably the hydrophobic residues of the tokenizer vocabulary --
	    confirm). The score is ``1 - fraction`` of interior tokens in that set.
	    """

	    def __init__(self, device):
	        self.hydro_ids = torch.tensor([5, 7, 4, 12, 20, 18, 22, 14], device=device)
	        self.device = device

	    def get_scores(self, x):
	        """Return ``1 - fraction`` per row, ignoring the first and last column of ``x``."""
	        interior = x[:, 1:-1]
	        # Broadcast every interior token against the id list.
	        matches = torch.eq(interior[..., None], self.hydro_ids)
	        in_set_fraction = matches.any(dim=-1).float().mean(dim=1)
	        return 1 - in_set_fraction

	    def __call__(self, input_seqs: list):
	        """Alias for :meth:`get_scores`."""
	        return self.get_scores(input_seqs)

	class PeptideCNN(nn.Module):
	    """Two-layer 1-D CNN head on top of ESM-2 token embeddings.

	    The encoder runs under ``torch.no_grad``. Its hidden states pass through
	    two Conv1d+ReLU+Dropout stages, are averaged over the sequence axis,
	    projected to ``output_dim`` features and finally to one scalar.
	    """

	    def __init__(self, input_dim, hidden_dims, output_dim, dropout_rate):
	        super().__init__()
	        first_width, second_width = hidden_dims[0], hidden_dims[1]
	        self.conv1 = nn.Conv1d(input_dim, first_width, kernel_size=3, padding=1)
	        # kernel 5 with padding 1 shortens the sequence axis by two positions.
	        self.conv2 = nn.Conv1d(first_width, second_width, kernel_size=5, padding=1)
	        self.fc = nn.Linear(second_width, output_dim)
	        self.dropout = nn.Dropout(dropout_rate)
	        self.predictor = nn.Linear(output_dim, 1)  # For regression/classification

	        self.esm_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D")
	        self.esm_model.eval()

	    def forward(self, input_ids, attention_mask=None, return_features=False):
	        """Return the prediction (B, 1), or the features (B, output_dim) if requested."""
	        with torch.no_grad():
	            token_states = self.esm_model(input_ids, attention_mask).last_hidden_state
	        # (B, L, input_dim) -> (B, input_dim, L), the layout Conv1d expects.
	        channels_first = token_states.permute(0, 2, 1)
	        hidden = self.dropout(torch.relu(self.conv1(channels_first)))
	        hidden = self.dropout(torch.relu(self.conv2(hidden)))

	        # Back to (B, positions, channels), then average over positions.
	        pooled = hidden.permute(0, 2, 1).mean(dim=1)

	        features = self.fc(pooled)
	        if return_features:
	            return features
	        return self.predictor(features)


	# -----------------------------
	# Model definition (must match training)
	# -----------------------------
	class TransformerRegressor(nn.Module):
	    """Linear projection, Transformer encoder, masked mean pooling, scalar head."""

	    def __init__(self, in_dim, d_model=256, nhead=8, layers=2, ff=512, dropout=0.1):
	        super().__init__()
	        self.proj = nn.Linear(in_dim, d_model)
	        layer_options = dict(
	            d_model=d_model,
	            nhead=nhead,
	            dim_feedforward=ff,
	            dropout=dropout,
	            batch_first=True,
	            activation="gelu",
	        )
	        encoder_layer = nn.TransformerEncoderLayer(**layer_options)
	        self.enc = nn.TransformerEncoder(encoder_layer, num_layers=layers)
	        self.head = nn.Linear(d_model, 1)

	    def forward(self, X, M):
	        """Return one value per batch row.

	        ``M`` is a boolean mask: True = keep token, False = padding.
	        """
	        # The encoder expects the opposite convention (True marks padding).
	        encoded = self.enc(self.proj(X), src_key_padding_mask=~M)
	        keep = M.unsqueeze(-1).float()
	        n_kept = keep.sum(dim=1).clamp(min=1.0)  # avoid dividing by zero
	        pooled = (encoded * keep).sum(dim=1) / n_kept
	        return self.head(pooled).squeeze(-1)


	def build_model(model_name: str, in_dim: int, params: dict) -> nn.Module:
	    """Create the regressor used at inference time.

	    Only ``model_name == "transformer"`` is supported. The architecture is
	    fixed to the values below; ``params`` is accepted but not read.

	    Raises:
	        ValueError: For any other ``model_name``.
	    """
	    if model_name != "transformer":
	        raise ValueError(f"This inference file currently supports model_name='transformer', got: {model_name}")

	    # Per the original note, the training code (build_model in
	    # finetune_nn_cv.py) uses these fixed architecture values.
	    fixed_architecture = {
	        "d_model": 384,
	        "nhead": 4,
	        "layers": 1,
	        "ff": 512,
	        "dropout": 0.1521676463658988,
	    }
	    return TransformerRegressor(in_dim=in_dim, **fixed_architecture)

	def _clean_state_dict(state_dict: dict) -> dict:
	cleaned = {}
	for k, v in state_dict.items():
	if k.startswith("module."):
	k = k[len("module.") :]
	if k.startswith("model."):
	k = k[len("model.") :]
	cleaned[k] = v
	return cleaned

	class HalfLifeModel:
	    """
	    Loads:
	    - ESM2 encoder to generate unpooled token embeddings (per residue)
	    - The fine-tuned TransformerRegressor stored in the checkpoint

	    By default, __call__ returns "hours" as a tensor on ``device``:
	    - if ckpt['target_col'] == 'log_label' -> expm1(pred)
	    - else -> raw pred
	    """

	    def __init__(
	        self,
	        device,
	        ckpt_path = "./classifier_ckpt/wt_halflife.pt",
	    ):
	        """Load the regressor checkpoint and the ESM-2 encoder onto ``device``.

	        Raises:
	            ValueError: If the checkpoint is not a dict with ``state_dict`` and
	                ``in_dim`` entries, or if ``in_dim`` differs from the encoder's
	                hidden size.
	        """
	        self.device = device

	        # --- load NN checkpoint ---
	        # NOTE(review): weights_only=False unpickles arbitrary objects; only
	        # load checkpoints from a trusted source.
	        ckpt = torch.load(ckpt_path, map_location=device, weights_only=False)
	        if not isinstance(ckpt, dict) or "state_dict" not in ckpt:
	            raise ValueError(f"Checkpoint at {ckpt_path} is not the expected dict with a 'state_dict' key.")
	        if ckpt.get("in_dim") is None:
	            # Fail with a clear message instead of int(None) raising TypeError.
	            raise ValueError(f"Checkpoint at {ckpt_path} has no 'in_dim' entry.")

	        self.best_params = ckpt.get("best_params", {})
	        self.in_dim = int(ckpt["in_dim"])
	        self.target_col = ckpt.get("target_col", "label")  # 'log_label' or 'label'

	        # --- build + load regressor ---
	        self.regressor = build_model(model_name="transformer", in_dim=self.in_dim, params=self.best_params)
	        self.regressor.load_state_dict(_clean_state_dict(ckpt["state_dict"]), strict=True)
	        self.regressor.to(self.device)
	        self.regressor.eval()

	        # --- ESM2 embedding model ---
	        self.emb_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(self.device)
	        self.emb_model.eval()
	        self.tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")

	        # sanity: ESM2 hidden size should match training in_dim
	        esm_hidden = int(self.emb_model.config.hidden_size)
	        if esm_hidden != self.in_dim:
	            raise ValueError(
	                f"Mismatch: ESM hidden_size={esm_hidden}, but checkpoint in_dim={self.in_dim}.\n"
	                f"Did you train on a different embedding model/dimension than facebook/esm2_t33_650M_UR50D?"
	            )

	    @torch.no_grad()
	    def _embed_unpooled_batch(self, sequences):
	        """Embed a batch of token ids and drop the first and last position.

	        Args:
	            sequences: Token-id tensor passed to the encoder as ``input_ids``.

	        Returns:
	            ``(X, M)``: float32 embeddings of shape (B, T-2, H) and an all-True
	            boolean mask of shape (B, T-2).
	        """
	        hidden = self.emb_model(input_ids=sequences).last_hidden_state  # (B, T, H)

	        # Every row of a batched tensor has the same length, so one slice
	        # replaces the per-row copy into a padded buffer.
	        X = hidden[:, 1:-1, :].to(torch.float32).contiguous()
	        M = torch.ones(X.shape[:2], dtype=torch.bool, device=X.device)
	        return X, M

	    @torch.no_grad()
	    def predict_raw(self, input_seqs):
	        """
	        Returns the regressor output in the same space as training target_col:
	        - if trained on log_label -> returns log1p(hours)
	        - if trained on label -> returns hours (or whatever label scale was)

	        The result is a float32 NumPy array with one entry per input row; an
	        empty input gives an empty array without running the models.
	        """
	        if len(input_seqs) == 0:
	            return np.array([], dtype=np.float32)

	        X, M = self._embed_unpooled_batch(input_seqs)
	        yhat = self.regressor(X, M).detach().cpu().numpy().astype(np.float32)  # (B,)
	        return yhat

	    def predict_hours(self, input_seqs) -> np.ndarray:
	        """
	        If the model was trained on log_label, convert back to hours via expm1.
	        Otherwise returns raw predictions.
	        """
	        raw = self.predict_raw(input_seqs)
	        if self.target_col == "log_label":
	            return np.expm1(raw).astype(np.float32)
	        return raw.astype(np.float32)

	    def __call__(self, input_seqs) -> torch.Tensor:
	        """Return :meth:`predict_hours` as a tensor on ``self.device``."""
	        return torch.from_numpy(self.predict_hours(input_seqs)).to(self.device)


	def load_bindevaluator(checkpoint_path, device):
	    """Restore a ``BindEvaluator`` from a checkpoint for inference.

	    The model is built with the fixed architecture below, moved to ``device``,
	    put in eval mode, and all of its parameters are frozen.
	    """
	    architecture = dict(n_layers=8, d_model=128, d_hidden=128, n_head=8, d_k=64, d_v=128, d_inner=64)
	    model = BindEvaluator.load_from_checkpoint(checkpoint_path, **architecture)
	    model = model.to(device)
	    model.eval()
	    # Same effect as setting requires_grad = False on every parameter.
	    model.requires_grad_(False)
	    return model


	def load_solver(checkpoint_path, vocab_size, device):
	    """Build a discrete Euler solver around a frozen ``CNNModel`` denoiser.

	    Args:
	        checkpoint_path: File holding the denoiser ``state_dict``.
	        vocab_size: Alphabet size for the denoiser and the solver.
	        device: Device the denoiser is loaded onto. The original body
	            overwrote this argument with ``'cuda:0'``; it is now honoured.

	    Returns:
	        ``MixtureDiscreteEulerSolver`` whose model returns softmax
	        probabilities over the vocabulary.
	    """
	    embed_dim = 512
	    hidden_dim = 256

	    probability_denoiser = CNNModel(alphabet_size=vocab_size, embed_dim=embed_dim, hidden_dim=hidden_dim).to(device)
	    # NOTE(review): weights_only=False unpickles arbitrary objects; only load
	    # checkpoints from a trusted source.
	    probability_denoiser.load_state_dict(torch.load(checkpoint_path, map_location=device, weights_only=False))
	    probability_denoiser.eval()
	    for param in probability_denoiser.parameters():
	        param.requires_grad = False

	    # instantiate a convex path object
	    scheduler = PolynomialConvexScheduler(n=2.0)
	    path = MixtureDiscreteProbPath(scheduler=scheduler)

	    class WrappedModel(ModelWrapper):
	        """Turn the denoiser logits into probabilities."""

	        def forward(self, x: torch.Tensor, t: torch.Tensor, **extras):
	            return torch.softmax(self.model(x, t), dim=-1)

	    wrapped_probability_denoiser = WrappedModel(probability_denoiser)
	    solver = MixtureDiscreteEulerSolver(model=wrapped_probability_denoiser, path=path, vocabulary_size=vocab_size)

	    return solver


	class CrossAttnUnpooled(nn.Module):
	    """
	    Token sequences with masks; alternating cross attention.

	    Two token-embedding sequences (``T`` and ``B``) are projected to a common
	    width. In each layer ``T`` attends to ``B`` and then ``B`` attends to the
	    updated ``T``, each followed by a residual feed-forward block. Both
	    sequences are mean-pooled over their unmasked positions, concatenated,
	    and fed to a shared layer with a regression head and a 3-way
	    classification head.
	    """
	    def __init__(self, Ht=1280, Hb=1280, hidden=768, n_heads=8, n_layers=1, dropout=0.16430662769055482):
	        super().__init__()
	        self.t_proj = nn.Sequential(nn.Linear(Ht, hidden), nn.LayerNorm(hidden))
	        self.b_proj = nn.Sequential(nn.Linear(Hb, hidden), nn.LayerNorm(hidden))

	        self.layers = nn.ModuleList([])
	        for _ in range(n_layers):
	            self.layers.append(nn.ModuleDict({
	                "attn_tb": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
	                "attn_bt": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
	                "n1t": nn.LayerNorm(hidden),
	                "n2t": nn.LayerNorm(hidden),
	                "n1b": nn.LayerNorm(hidden),
	                "n2b": nn.LayerNorm(hidden),
	                # Feed-forward blocks widen to 4 * hidden and project back.
	                "fft": nn.Sequential(nn.Linear(hidden, 4 * hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4 * hidden, hidden)),
	                "ffb": nn.Sequential(nn.Linear(hidden, 4 * hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4 * hidden, hidden)),
	            }))

	        self.shared = nn.Sequential(nn.Linear(2*hidden, hidden), nn.GELU(), nn.Dropout(dropout))
	        self.reg = nn.Linear(hidden, 1)
	        self.cls = nn.Linear(hidden, 3)

	    def masked_mean(self, X, M):
	        """Mean of ``X`` over dim 1 counting only positions where ``M`` is set."""
	        Mf = M.unsqueeze(-1).float()
	        denom = Mf.sum(dim=1).clamp(min=1.0)  # avoid dividing by zero
	        return (X * Mf).sum(dim=1) / denom

	    def forward(self, T, Mt, B, Mb):
	        """Return ``(regression, class_logits)`` for a batch.

	        Args:
	            T: (batch, Lt, Ht) embeddings; Mt: (batch, Lt) boolean mask, True = keep.
	            B: (batch, Lb, Hb) embeddings; Mb: (batch, Lb) boolean mask, True = keep.

	        Returns:
	            Tensor of shape (batch,) from the regression head and tensor of
	            shape (batch, 3) from the classification head.
	        """
	        T = self.t_proj(T)
	        Bx = self.b_proj(B)

	        kp_t = ~Mt  # key_padding_mask True = pad
	        kp_b = ~Mb

	        for layer in self.layers:
	            # T attends to B
	            T_attn, _ = layer["attn_tb"](T, Bx, Bx, key_padding_mask=kp_b)
	            T = layer["n1t"](T + T_attn)
	            T = layer["n2t"](T + layer["fft"](T))

	            # B attends to T
	            B_attn, _ = layer["attn_bt"](Bx, T, T, key_padding_mask=kp_t)
	            Bx = layer["n1b"](Bx + B_attn)
	            Bx = layer["n2b"](Bx + layer["ffb"](Bx))

	        t_pool = self.masked_mean(T, Mt)
	        b_pool = self.masked_mean(Bx, Mb)
	        z = torch.cat([t_pool, b_pool], dim=-1)
	        h = self.shared(z)
	        return self.reg(h).squeeze(-1), self.cls(h)

	def load_affinity_predictor(device):
	    """Load the affinity predictor from ``./classifier_ckpt/wt_affinity.pt``.

	    Builds a default ``CrossAttnUnpooled``, loads the checkpoint's
	    ``state_dict`` into it, and returns it in eval mode on ``device``.
	    """
	    ckpt_file = './classifier_ckpt/wt_affinity.pt'
	    saved = torch.load(ckpt_file, map_location=device, weights_only=False)

	    predictor = CrossAttnUnpooled()
	    predictor.load_state_dict(saved['state_dict'])
	    predictor.eval()
	    return predictor.to(device)

	class AffinityModel(nn.Module):
	    """Score binder token batches against one fixed target with an affinity predictor.

	    Target and binders are embedded with ``facebook/esm2_t33_650M_UR50D``, the
	    first and last positions are dropped, and the embeddings are passed to
	    ``affinity_predictor``. The first output of the predictor, divided by 10,
	    is returned.
	    """

	    def __init__(self, affinity_predictor, target_sequence, device):
	        """
	        Args:
	            affinity_predictor: Callable taking ``(T, Mt, B, Mb)`` and returning
	                a pair whose first element is the affinity.
	            target_sequence: Mapping passed to the encoder as keyword arguments;
	                must contain ``attention_mask``.
	            device: Device for the encoder and the binder mask.
	        """
	        super(AffinityModel, self).__init__()
	        self.affinity_predictor = affinity_predictor
	        self.target_sequence = target_sequence
	        self.esm_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
	        self.esm_model.eval()
	        self.device = device

	    def forward(self, x):
	        """Return ``affinity / 10`` for each row of the binder token tensor ``x``."""
	        batch = x.shape[0]
	        # The target mask is tiled to the binder batch size, without its
	        # first and last position.
	        Mt = self.target_sequence['attention_mask'][:, 1:-1].repeat(batch, 1)
	        # Every interior binder position is treated as valid.
	        Mb = torch.ones(batch, x.shape[1] - 2, dtype=torch.bool, device=self.device)

	        with torch.no_grad():
	            T = self.esm_model(**self.target_sequence).last_hidden_state[:, 1:-1, :].repeat(batch, 1, 1)
	            # One encoder pass covers the whole binder batch; the earlier
	            # per-row loop did no per-row work.
	            B = self.esm_model(input_ids=x, attention_mask=torch.ones_like(x).to(self.device)).last_hidden_state[:, 1:-1]

	        affinity, _ = self.affinity_predictor(T, Mt.bool(), B, Mb)
	        # NOTE(review): the meaning of the /10 rescaling is not visible here.
	        return affinity / 10