bert-beatrix-2048 / bert_handler.py

Create bert_handler.py

61d52d9 verified 9 months ago

23.3 kB

	import torch
	import torch.nn as nn
	from transformers import AutoTokenizer, AutoModelForMaskedLM
	from pathlib import Path
	import json
	import re
	import gc


	class BERTHandler:
	    """
	    VRAM-safe BERT model handler for loading, tokenization, and saving.

	    The handler owns at most one tokenizer/model pair at a time. Before a new
	    model is loaded the previous one is released and the CUDA cache is cleared.
	    It also tracks the training position (``current_step`` / ``current_epoch``)
	    and keeps the tokenizer size, ``config.vocab_size`` and the embedding
	    matrix in agreement when checkpoints are written.

	    The model is expected to expose ``model.bert.embeddings.word_embeddings``
	    and ``model.cls.predictions`` (BERT-style masked-LM layout); other
	    architectures will fail with ``AttributeError`` in the resize/summary paths.
	    """

	    def __init__(self, symbolic_tokens=None):
	        """
	        Build the special-token lists; no model is loaded yet.

	        Args:
	            symbolic_tokens: Optional list of symbolic token strings. When
	                omitted (or empty) the built-in default list is used.
	        """
	        # Default symbolic tokens
	        self.symbolic_tokens = symbolic_tokens or [
	            "<subject>", "<subject1>", "<subject2>", "<pose>", "<emotion>",
	            "<surface>", "<lighting>", "<material>", "<accessory>", "<footwear>",
	            "<upper_body_clothing>", "<hair_style>", "<hair_length>", "<headwear>",
	            "<texture>", "<pattern>", "<grid>", "<zone>", "<offset>",
	            "<object_left>", "<object_right>", "<relation>", "<intent>", "<style>",
	            "<fabric>", "<jewelry>",
	        ]

	        # One shunt token per symbolic token, numbered from 1000000 upwards
	        self.shunt_tokens = [
	            f"[SHUNT_{1000000 + i}]" for i in range(len(self.symbolic_tokens))
	        ]
	        self.all_special_tokens = self.symbolic_tokens + self.shunt_tokens

	        # Model components (populated by load_fresh_model / load_checkpoint)
	        self.tokenizer = None
	        self.model = None
	        self.current_step = 0
	        self.current_epoch = 1

	        print(f"🎯 BERTHandler initialized with {len(self.all_special_tokens)} special tokens")

	    def __del__(self):
	        """Destructor: release the model when the handler is garbage-collected."""
	        try:
	            self._cleanup_model()
	        except Exception:
	            # A destructor must never raise. During interpreter shutdown the
	            # modules this cleanup relies on (torch, gc, even print's stdout)
	            # may already be torn down, so failures here are expected noise.
	            pass

	    def _cleanup_model(self):
	        """
	        Release the currently held model and clear the CUDA cache.

	        Does nothing when no model is loaded. Only the handler's own reference
	        is dropped: a caller that still holds the model object returned by
	        ``load_fresh_model`` / ``load_checkpoint`` keeps it alive.
	        """
	        # getattr guards against a partially constructed instance (__del__ may
	        # run even if __init__ raised before self.model was assigned).
	        model = getattr(self, "model", None)
	        if model is None:
	            return

	        print("🧹 Cleaning up existing model from VRAM...")

	        cuda_available = torch.cuda.is_available()

	        # Move weights off the GPU first so their memory can be reclaimed
	        if cuda_available and any(p.is_cuda for p in model.parameters()):
	            model.cpu()

	        # Drop every reference this method controls before collecting
	        self.model = None
	        del model
	        gc.collect()

	        if cuda_available:
	            torch.cuda.empty_cache()
	            torch.cuda.synchronize()  # Ensure all CUDA operations complete

	        print("✅ Model cleanup complete")

	    def _print_vram_usage(self, prefix=""):
	        """Print allocated/reserved CUDA memory in GB, prefixed with *prefix*."""
	        if not torch.cuda.is_available():
	            print(f"🎯 {prefix}CUDA not available")
	            return

	        allocated = torch.cuda.memory_allocated() / 1e9
	        reserved = torch.cuda.memory_reserved() / 1e9
	        print(f"🎯 {prefix}VRAM: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")

	    def _embedding_vocab_size(self):
	        """Return the number of rows in the model's word-embedding matrix."""
	        return self.model.bert.embeddings.word_embeddings.weight.shape[0]

	    def load_fresh_model(self, model_name="nomic-ai/nomic-bert-2048"):
	        """
	        Load a base model and tokenizer, then add the special tokens.

	        Any previously held model is released first. The embedding matrix is
	        resized when the tokenizer actually gained tokens, and the training
	        position is reset to step 0 / epoch 1.

	        Args:
	            model_name: Hub id or local path passed to ``from_pretrained``.

	        Returns:
	            Tuple ``(model, tokenizer)``.

	        Raises:
	            Exception: Whatever loading/resizing raised; the partially loaded
	                model is released before re-raising.
	        """
	        print(f"🆕 Loading fresh model: {model_name}")
	        self._print_vram_usage("Before cleanup: ")

	        # CRITICAL: Clean up existing model first
	        self._cleanup_model()
	        self._print_vram_usage("After cleanup: ")

	        try:
	            print("📥 Loading base tokenizer...")
	            self.tokenizer = AutoTokenizer.from_pretrained(model_name)

	            print("📥 Loading base model...")
	            # SECURITY NOTE: trust_remote_code=True executes Python code shipped
	            # with the model repository. Only use model sources you trust.
	            self.model = AutoModelForMaskedLM.from_pretrained(
	                model_name,
	                trust_remote_code=True,
	                torch_dtype=torch.float32,  # Explicit dtype for consistency
	            )

	            # Add special tokens (ONLY for fresh models)
	            original_size = len(self.tokenizer)
	            special_tokens_dict = {"additional_special_tokens": self.all_special_tokens}
	            num_added = self.tokenizer.add_special_tokens(special_tokens_dict)

	            print(f" - Original vocab size: {original_size}")
	            print(f" - Added {num_added} special tokens")
	            print(f" - New vocab size: {len(self.tokenizer)}")

	            # Resize model embeddings (ONLY for fresh models)
	            if num_added > 0:
	                self._resize_embeddings()

	            # Reset training state
	            self.current_step = 0
	            self.current_epoch = 1

	            print("✅ Fresh model loaded successfully")
	            self._print_vram_usage("After loading: ")
	            return self.model, self.tokenizer

	        except Exception as e:
	            print(f"❌ Failed to load fresh model: {e}")
	            # Clean up on failure
	            self._cleanup_model()
	            raise

	    def load_checkpoint(self, checkpoint_path):
	        """
	        Load model and tokenizer from a saved checkpoint without modifying them.

	        Any previously held model is released first. The training position is
	        restored via ``_load_training_state``.

	        Args:
	            checkpoint_path: Directory written by ``save_checkpoint``.

	        Returns:
	            Tuple ``(model, tokenizer)``.

	        Raises:
	            Exception: Whatever loading raised; the partially loaded model is
	                released before re-raising.
	        """
	        print(f"📂 Loading checkpoint: {checkpoint_path}")
	        self._print_vram_usage("Before cleanup: ")

	        # CRITICAL: Clean up existing model first
	        self._cleanup_model()
	        self._print_vram_usage("After cleanup: ")

	        try:
	            # Load saved tokenizer AS-IS (already contains special tokens)
	            print("📥 Loading saved tokenizer...")
	            self.tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
	            print(f" - Tokenizer loaded: {len(self.tokenizer)} tokens (already includes special tokens)")

	            # Load saved model AS-IS (already matches tokenizer)
	            print("📥 Loading saved model...")
	            # SECURITY NOTE: trust_remote_code=True executes Python code shipped
	            # with the checkpoint. Only load checkpoints you trust.
	            self.model = AutoModelForMaskedLM.from_pretrained(
	                checkpoint_path,
	                trust_remote_code=True,
	                torch_dtype=torch.float32,
	            )

	            print("✅ Model loaded successfully")
	            print(f" - Model vocab size: {self.model.config.vocab_size}")
	            print(f" - Embedding size: {self._embedding_vocab_size()}")
	            print(f" - Tokenizer size: {len(self.tokenizer)}")

	            # DO NOT MODIFY ANYTHING - checkpoint is self-consistent

	            # Load training state
	            self._load_training_state(checkpoint_path)

	            print(f"✅ Checkpoint loaded - Step: {self.current_step}, Epoch: {self.current_epoch}")
	            self._print_vram_usage("After loading: ")
	            return self.model, self.tokenizer

	        except Exception as e:
	            print(f"❌ Failed to load checkpoint: {e}")
	            # Clean up on failure
	            self._cleanup_model()
	            raise

	    def save_checkpoint(self, save_path, step=None, epoch=None):
	        """
	        Save model, tokenizer, training state and token mappings.

	        The checkpoint is written to
	        ``<save_path>/symbolic_bert_step{step}_epoch{epoch}``. Before saving,
	        tokenizer size, ``config.vocab_size`` and embedding rows are forced to
	        agree; after saving, the written tokenizer and ``config.json`` are read
	        back and compared.

	        Args:
	            save_path: Parent directory for the checkpoint folder.
	            step: Training step to record; defaults to ``self.current_step``.
	                An explicit ``0`` is honoured.
	            epoch: Epoch to record; defaults to ``self.current_epoch``.
	                An explicit ``0`` is honoured.

	        Returns:
	            ``pathlib.Path`` of the checkpoint directory.

	        Raises:
	            RuntimeError: If no model/tokenizer is loaded, or if the saved
	                tokenizer size and saved ``vocab_size`` disagree.
	        """
	        if self.model is None or self.tokenizer is None:
	            raise RuntimeError("No model loaded to save")

	        # Compare against None explicitly: `step or default` would silently
	        # replace a legitimate step/epoch of 0 with the handler's current value.
	        if step is None:
	            step = self.current_step
	        if epoch is None:
	            epoch = self.current_epoch

	        # CRITICAL: Verify consistency before saving
	        tokenizer_size = len(self.tokenizer)
	        model_vocab_size = self.model.config.vocab_size
	        embedding_size = self._embedding_vocab_size()

	        if not (tokenizer_size == model_vocab_size == embedding_size):
	            print(f"⚠️ CONSISTENCY CHECK FAILED before saving:")
	            print(f" - Tokenizer size: {tokenizer_size}")
	            print(f" - Model config vocab_size: {model_vocab_size}")
	            print(f" - Embedding size: {embedding_size}")

	            # Force consistency before saving
	            print(f"🔧 Forcing consistency to tokenizer size: {tokenizer_size}")
	            self.model.config.vocab_size = tokenizer_size

	            # Resize embeddings if needed
	            if embedding_size != tokenizer_size:
	                print(f"🔧 Resizing embeddings to match tokenizer: {embedding_size} → {tokenizer_size}")
	                self._resize_embeddings()

	        # Create checkpoint directory
	        checkpoint_dir = Path(save_path) / f"symbolic_bert_step{step}_epoch{epoch}"
	        checkpoint_dir.mkdir(parents=True, exist_ok=True)

	        print(f"💾 Saving checkpoint: {checkpoint_dir}")

	        try:
	            print("💾 Saving model...")
	            self.model.save_pretrained(checkpoint_dir)

	            print("💾 Saving tokenizer...")
	            self.tokenizer.save_pretrained(checkpoint_dir)

	            # Save training state with consistency info
	            training_state = {
	                "step": step,
	                "epoch": epoch,
	                "vocab_size": len(self.tokenizer),
	                "model_vocab_size": self.model.config.vocab_size,
	                "embedding_size": self._embedding_vocab_size(),
	                "consistency_verified": True,
	                "special_tokens_count": len(self.all_special_tokens),
	            }

	            with open(checkpoint_dir / "training_config.json", "w", encoding="utf-8") as f:
	                json.dump(training_state, f, indent=2)

	            # Save token mappings
	            self._save_token_mappings(checkpoint_dir)

	            # VERIFICATION: Load and check consistency
	            print("🔍 Verifying saved checkpoint consistency...")
	            test_tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

	            with open(checkpoint_dir / "config.json", encoding="utf-8") as f:
	                test_config = json.load(f)

	            saved_tokenizer_size = len(test_tokenizer)
	            saved_model_vocab = test_config["vocab_size"]

	            if saved_tokenizer_size != saved_model_vocab:
	                raise RuntimeError(
	                    f"CHECKPOINT SAVE FAILED! Inconsistency detected:\n"
	                    f" Saved tokenizer size: {saved_tokenizer_size}\n"
	                    f" Saved model vocab: {saved_model_vocab}"
	                )

	            # Update internal state only once the checkpoint is verified
	            self.current_step = step
	            self.current_epoch = epoch

	            print("✅ Checkpoint saved and verified successfully")
	            print(f" - Consistent vocab size: {saved_tokenizer_size}")
	            return checkpoint_dir

	        except Exception as e:
	            print(f"❌ Failed to save checkpoint: {e}")
	            raise

	    def find_latest_checkpoint(self, base_path, pattern="symbolic_bert"):
	        """
	        Return the newest checkpoint directory under *base_path*.

	        Candidates are entries named ``{pattern}_step<N>_epoch<M>`` (the layout
	        written by ``save_checkpoint``). "Newest" means the highest step number,
	        with the epoch number breaking ties.

	        Args:
	            base_path: Directory to search (not recursive).
	            pattern: Checkpoint name prefix.

	        Returns:
	            ``pathlib.Path`` of the latest checkpoint, or ``None`` when the
	            directory does not exist or contains no matching entry.
	        """
	        path = Path(base_path)
	        if not path.exists():
	            print(f"⚠️ Checkpoint directory does not exist: {base_path}")
	            return None

	        # The wildcards are essential: without them the glob is a literal name
	        # that can never match "<pattern>_step123_epoch4".
	        checkpoints = list(path.glob(f"{pattern}_step*_epoch*"))
	        if not checkpoints:
	            print(f"⚠️ No checkpoints found in {base_path}")
	            return None

	        # Order numerically (more reliable than modification time, and avoids
	        # the lexicographic trap where "step5" sorts after "step20").
	        def training_position(checkpoint_path):
	            match = re.search(r"step(\d+)(?:_epoch(\d+))?", checkpoint_path.name)
	            if not match:
	                return (0, 0)
	            return (int(match.group(1)), int(match.group(2) or 0))

	        latest = max(checkpoints, key=training_position)

	        print(f"📂 Found latest checkpoint: {latest}")
	        return latest

	    def get_token_mappings(self):
	        """
	        Map special tokens to their tokenizer ids.

	        Tokens that resolve to the tokenizer's unknown-token id are omitted.

	        Returns:
	            Tuple ``(symbolic_ids, shunt_ids)`` of ``{token: id}`` dicts; both
	            are empty when no tokenizer is loaded.
	        """
	        if self.tokenizer is None:
	            return {}, {}

	        tokenizer = self.tokenizer

	        def known_ids(tokens):
	            resolved = ((token, tokenizer.convert_tokens_to_ids(token)) for token in tokens)
	            return {
	                token: token_id
	                for token, token_id in resolved
	                if token_id != tokenizer.unk_token_id
	            }

	        return known_ids(self.symbolic_tokens), known_ids(self.shunt_tokens)

	    def to_device(self, device):
	        """
	        Move the model to *device*, printing VRAM usage before and after.

	        Returns:
	            ``self``, so calls can be chained. A warning is printed and nothing
	            is moved when no model is loaded.
	        """
	        if self.model is None:
	            print(f"⚠️ No model loaded to move to {device}")
	            return self

	        print(f"📱 Moving model to {device}...")
	        self._print_vram_usage("Before device move: ")

	        self.model = self.model.to(device)

	        # Clear cache after moving to device
	        if torch.cuda.is_available():
	            torch.cuda.empty_cache()

	        print(f"✅ Model moved to {device}")
	        self._print_vram_usage("After device move: ")
	        return self

	    def _resize_embeddings(self):
	        """
	        Resize word embeddings (and the MLM decoder) to ``len(self.tokenizer)``.

	        Handles both growth and shrinkage. Existing rows are copied; new
	        embedding rows are drawn from N(0, 0.02^2). The replacement modules keep
	        the device, dtype, ``padding_idx``, ``requires_grad`` flag and bias
	        presence of the modules they replace. If the decoder weight (or the
	        prediction-head bias) was the very same parameter object as the
	        embedding weight (or decoder bias), that tie is re-established.

	        Raises:
	            RuntimeError: If no model is loaded.
	        """
	        if self.model is None:
	            raise RuntimeError("No model loaded")

	        embeddings_module = self.model.bert.embeddings
	        old_embeddings = embeddings_module.word_embeddings
	        old_weight = old_embeddings.weight
	        old_size, embedding_dim = old_weight.shape
	        new_size = len(self.tokenizer)

	        if old_size == new_size:
	            print(f"✅ Embeddings already correct size: {new_size}")
	            return

	        print(f"🔄 Resizing embeddings: {old_size} → {new_size}")

	        try:
	            # Keep the padding row special unless shrinking removed it
	            padding_idx = getattr(old_embeddings, "padding_idx", None)
	            if padding_idx is not None and padding_idx >= new_size:
	                padding_idx = None

	            # Build on the old module's device/dtype; a default-constructed
	            # module would live on CPU even when the model is on the GPU.
	            new_embeddings = nn.Embedding(
	                new_size, embedding_dim, padding_idx=padding_idx
	            ).to(device=old_weight.device, dtype=old_weight.dtype)

	            with torch.no_grad():
	                # Copy the rows both matrices share
	                copy_size = min(old_size, new_size)
	                new_embeddings.weight[:copy_size] = old_weight[:copy_size]

	                if new_size > old_size:
	                    num_added = new_size - old_size
	                    # Small random initialization for new tokens
	                    new_embeddings.weight[old_size:].normal_(mean=0.0, std=0.02)
	                    print(f" - Added {num_added} new token embeddings")
	                else:
	                    num_removed = old_size - new_size
	                    print(f" - Removed {num_removed} token embeddings")

	            new_embeddings.weight.requires_grad_(old_weight.requires_grad)

	            # Replace embeddings
	            embeddings_module.word_embeddings = new_embeddings

	            # Resize decoder if it exists
	            predictions = self.model.cls.predictions
	            if hasattr(predictions, "decoder"):
	                old_decoder = predictions.decoder
	                old_out = old_decoder.weight.shape[0]
	                has_bias = old_decoder.bias is not None
	                weights_tied = old_decoder.weight is old_weight

	                # Only create a bias if the old decoder had one; an extra,
	                # randomly initialised bias would change the model's outputs.
	                new_decoder = nn.Linear(embedding_dim, new_size, bias=has_bias).to(
	                    device=old_decoder.weight.device, dtype=old_decoder.weight.dtype
	                )

	                keep = min(old_out, new_size)
	                with torch.no_grad():
	                    if weights_tied:
	                        # Share the parameter again instead of copying it
	                        new_decoder.weight = new_embeddings.weight
	                    else:
	                        new_decoder.weight[:keep] = old_decoder.weight[:keep]
	                        if new_size > old_out:
	                            # Seed new output rows from the new embeddings
	                            new_decoder.weight[old_out:] = new_embeddings.weight[old_out:]
	                        new_decoder.weight.requires_grad_(old_decoder.weight.requires_grad)

	                    if has_bias:
	                        new_decoder.bias[:keep] = old_decoder.bias[:keep]
	                        if new_size > old_out:
	                            new_decoder.bias[old_out:].zero_()
	                        new_decoder.bias.requires_grad_(old_decoder.bias.requires_grad)

	                # If the head kept its own handle on the decoder bias, keep the
	                # two pointing at the same (resized) parameter.
	                if has_bias and getattr(predictions, "bias", None) is old_decoder.bias:
	                    predictions.bias = new_decoder.bias

	                predictions.decoder = new_decoder

	            # Update config
	            self.model.config.vocab_size = new_size

	            print("✅ Embeddings resized successfully")

	        except Exception as e:
	            print(f"❌ Failed to resize embeddings: {e}")
	            raise

	    def _load_training_state(self, checkpoint_path):
	        """
	        Restore ``current_step`` / ``current_epoch`` for a checkpoint.

	        Sources, in order: ``training_config.json`` inside the checkpoint, then
	        a ``step<N>_epoch<M>`` fragment in the path, then the defaults
	        (step 0, epoch 1).
	        """
	        # Try training_config.json first
	        config_path = Path(checkpoint_path) / "training_config.json"
	        if config_path.exists():
	            try:
	                with open(config_path, encoding="utf-8") as f:
	                    config = json.load(f)
	                self.current_step = config.get("step", 0)
	                self.current_epoch = config.get("epoch", 1)
	                print(f"📊 Loaded training state: step {self.current_step}, epoch {self.current_epoch}")
	                return
	            except (OSError, ValueError, AttributeError, TypeError) as e:
	                # Unreadable, malformed or non-object JSON: fall back to the path
	                print(f"⚠️ Failed to load training_config.json: {e}")

	        # Fallback: extract from path name
	        match = re.search(r"step(\d+)_epoch(\d+)", str(checkpoint_path))
	        if match:
	            self.current_step = int(match.group(1))
	            self.current_epoch = int(match.group(2))
	            print(f"📊 Extracted training state from path: step {self.current_step}, epoch {self.current_epoch}")
	        else:
	            self.current_step = 0
	            self.current_epoch = 1
	            print(f"⚠️ Could not determine training state, using defaults: step 0, epoch 1")

	    def _save_token_mappings(self, checkpoint_dir):
	        """
	        Write ``special_token_ids.json`` into *checkpoint_dir*.

	        Best-effort: a failure is reported with a warning and does not abort
	        the surrounding checkpoint save.
	        """
	        try:
	            symbolic_ids, shunt_ids = self.get_token_mappings()

	            token_mappings = {
	                "symbolic_token_ids": symbolic_ids,
	                "shunt_token_ids": shunt_ids,
	                "symbolic_tokens": self.symbolic_tokens,
	                "shunt_tokens": self.shunt_tokens,
	                "total_special_tokens": len(self.all_special_tokens),
	            }

	            with open(checkpoint_dir / "special_token_ids.json", "w", encoding="utf-8") as f:
	                json.dump(token_mappings, f, indent=2)

	            print(f"💾 Saved {len(symbolic_ids)} symbolic and {len(shunt_ids)} shunt token mappings")

	        except Exception as e:
	            print(f"⚠️ Failed to save token mappings: {e}")

	    def summary(self):
	        """Print tokenizer, model, special-token, training and VRAM status."""
	        print(f"\n📋 BERT HANDLER SUMMARY:")

	        if self.model is None:
	            print("❌ No model loaded")
	            return

	        symbolic_ids, shunt_ids = self.get_token_mappings()
	        embedding_weight = self.model.bert.embeddings.word_embeddings.weight

	        print(f" 📚 Tokenizer:")
	        print(f" - Size: {len(self.tokenizer)}")
	        print(f" - Special tokens: {len(self.tokenizer.additional_special_tokens or [])}")

	        print(f" 🤖 Model:")
	        print(f" - Config vocab size: {self.model.config.vocab_size}")
	        print(f" - Embedding vocab size: {embedding_weight.shape[0]}")
	        print(f" - Embedding dim: {embedding_weight.shape[1]}")

	        if hasattr(self.model.cls.predictions, "decoder"):
	            decoder = self.model.cls.predictions.decoder
	            print(f" - Decoder output size: {decoder.weight.shape[0]}")

	        print(f" 🎯 Special Tokens:")
	        print(f" - Symbolic tokens mapped: {len(symbolic_ids)}")
	        print(f" - Shunt tokens mapped: {len(shunt_ids)}")
	        print(f" - Total defined: {len(self.all_special_tokens)}")

	        print(f" 📊 Training State:")
	        print(f" - Current step: {self.current_step}")
	        print(f" - Current epoch: {self.current_epoch}")

	        # VRAM usage
	        self._print_vram_usage(" 🎯 ")

	        # Check for vocab consistency
	        tokenizer_size = len(self.tokenizer)
	        model_config_size = self.model.config.vocab_size
	        embedding_size = embedding_weight.shape[0]

	        if tokenizer_size == model_config_size == embedding_size:
	            print(f" ✅ All vocab sizes consistent: {tokenizer_size}")
	        else:
	            print(f" ⚠️ Vocab size mismatch detected:")
	            print(f" - Tokenizer: {tokenizer_size}")
	            print(f" - Model config: {model_config_size}")
	            print(f" - Embeddings: {embedding_size}")

	    def clear_vram(self):
	        """Release the model on request and print the resulting VRAM usage."""
	        print("🧹 Explicit VRAM cleanup requested...")
	        self._cleanup_model()
	        self._print_vram_usage("After cleanup: ")


	# Utility functions for safe usage patterns

	def create_handler_with_fresh_model(model_name="nomic-ai/nomic-bert-2048", symbolic_tokens=None):
	    """
	    Build a new BERTHandler and load a fresh base model into it.

	    Args:
	        model_name: Model identifier forwarded to ``load_fresh_model``.
	        symbolic_tokens: Optional symbolic token list forwarded to the handler.

	    Returns:
	        Tuple ``(handler, model, tokenizer)``.
	    """
	    print("🔄 Creating new BERTHandler with fresh model...")
	    new_handler = BERTHandler(symbolic_tokens=symbolic_tokens)
	    loaded_model, loaded_tokenizer = new_handler.load_fresh_model(model_name)
	    return new_handler, loaded_model, loaded_tokenizer


	def create_handler_from_checkpoint(checkpoint_path, symbolic_tokens=None):
	    """
	    Build a new BERTHandler and restore a saved checkpoint into it.

	    Args:
	        checkpoint_path: Checkpoint location forwarded to ``load_checkpoint``.
	        symbolic_tokens: Optional symbolic token list forwarded to the handler.

	    Returns:
	        Tuple ``(handler, model, tokenizer)``.
	    """
	    print("🔄 Creating new BERTHandler from checkpoint...")
	    new_handler = BERTHandler(symbolic_tokens=symbolic_tokens)
	    restored_model, restored_tokenizer = new_handler.load_checkpoint(checkpoint_path)
	    return new_handler, restored_model, restored_tokenizer


	# Usage examples and testing
	if __name__ == "__main__":
	    # Manual smoke test; it downloads models, so it is not run automatically.

	    def test_vram_safety():
	        """
	        Load two models back-to-back through one handler, then clear VRAM.

	        Both checkpoints must be BERT-architecture masked-LM models: the
	        handler reads ``model.bert`` and ``model.cls.predictions``, which
	        other architectures (e.g. DistilBERT) do not provide.
	        """
	        print("🧪 Testing VRAM safety...")

	        handler = BERTHandler()

	        # Load model 1
	        print("\n--- Loading Model 1 ---")
	        handler.load_fresh_model("bert-base-uncased")
	        handler.summary()

	        # Load model 2 (should clean up model 1)
	        print("\n--- Loading Model 2 (should cleanup Model 1) ---")
	        handler.load_fresh_model("bert-base-cased")
	        handler.summary()

	        # Explicit cleanup
	        print("\n--- Explicit Cleanup ---")
	        handler.clear_vram()

	        print("✅ VRAM safety test complete")

	    # Uncomment to run test
	    # test_vram_safety()

	"""
	USAGE EXAMPLES:

	# Safe way to work with fresh models:
	handler, model, tokenizer = create_handler_with_fresh_model("nomic-ai/nomic-bert-2048")

	# Safe way to work with checkpoints:
	handler, model, tokenizer = create_handler_from_checkpoint("/path/to/checkpoint")

	# Manual cleanup when needed:
	handler.clear_vram()

	# Always check summary for consistency:
	handler.summary()
	"""