"""
BINARY TRANSFORMER - Raw network bytes → neural network
No tokenizer. No preprocessing. Just bytes.

Vocab = 256 (one token per byte value 0x00-0xFF)
Input: Raw bytes from network stream via stdin
"""
|
|
| import sys |
| import math |
| import time |
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| from collections import deque |
|
|
# Use the GPU when PyTorch reports one, otherwise run on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Let CUDA matmuls use TF32 (set at import time, regardless of DEVICE).
torch.backends.cuda.matmul.allow_tf32 = True


# Model hyperparameters. BinaryTransformer reads "d", "layers", "heads" and
# "vocab"; BinaryTrainer reads "ctx" to size its sliding byte window.
CONFIG = dict(
    d=128,
    layers=3,
    heads=4,
    vocab=256,
    ctx=1024,
)


# Optimizer learning rate (default for BinaryTrainer).
LR = 0.0003
# Run one gradient step each time this many new bytes have been ingested.
UPDATE_EVERY = 64
# Print running statistics each time this many bytes have been ingested.
PRINT_EVERY = 50_000
|
|
class ByteAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    A single bias-free linear layer produces queries, keys and values for
    all heads at once; a second bias-free linear layer mixes the
    concatenated head outputs back to width ``d``.
    """

    def __init__(self, d, h):
        """Build the projections.

        Args:
            d: Model width (size of the last dimension of the input).
            h: Number of attention heads; must be positive and divide ``d``.

        Raises:
            ValueError: If ``h`` is not positive or does not divide ``d``.
        """
        super().__init__()
        # Without this check a bad (d, h) pair is accepted here and only
        # fails later, inside forward(), with an opaque view() shape error.
        if h <= 0 or d % h != 0:
            raise ValueError(
                f"model width d={d} must be divisible by a positive head count, got h={h}"
            )
        self.h, self.dk = h, d // h
        self.qkv = nn.Linear(d, 3 * d, bias=False)
        self.proj = nn.Linear(d, d, bias=False)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(B, N, D)``.
            mask: Optional tensor added to the raw attention scores before
                the softmax (must broadcast against ``(B, h, N, N)``);
                large negative entries suppress the matching positions.

        Returns:
            Tensor of shape ``(B, N, D)``.
        """
        B, N, D = x.shape
        # (B, N, 3*D) -> (B, N, 3, h, dk) -> (3, B, h, N, dk)
        qkv = self.qkv(x).view(B, N, 3, self.h, self.dk).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)
        # Scale by sqrt(dk) so score magnitude does not grow with head width.
        att = (q @ k.transpose(-1, -2)) / math.sqrt(self.dk)
        if mask is not None:
            att = att + mask
        out = F.softmax(att, dim=-1) @ v  # (B, h, N, dk)
        # Put the heads back next to each other: (B, N, h*dk) == (B, N, D).
        return self.proj(out.transpose(1, 2).reshape(B, N, D))
|
|
class ByteBlock(nn.Module):
    """One pre-norm transformer layer: attention, then a GELU feed-forward.

    Each sub-layer sees a LayerNorm-ed copy of its input and its output is
    added back onto the un-normalised residual stream.
    """

    def __init__(self, d, h):
        """Create the layer for model width ``d`` with ``h`` attention heads."""
        super().__init__()
        self.ln1 = nn.LayerNorm(d)
        self.ln2 = nn.LayerNorm(d)
        self.attn = ByteAttention(d, h)
        hidden = 4 * d  # feed-forward expands to four times the model width
        self.ff = nn.Sequential(
            nn.Linear(d, hidden),
            nn.GELU(),
            nn.Linear(hidden, d),
        )

    def forward(self, x, mask):
        """Return ``x`` after the attention and feed-forward residual updates.

        ``mask`` is handed unchanged to the attention sub-layer.
        """
        attended = self.attn(self.ln1(x), mask)
        x = x + attended
        expanded = self.ff(self.ln2(x))
        return x + expanded
|
|
class BinaryTransformer(nn.Module):
    """Causal transformer language model whose tokens are raw byte values.

    The output projection reuses the embedding matrix (tied weights). No
    positional term is added to the embeddings in this module; the only
    order information the layers receive is the causal mask.
    """

    def __init__(self, cfg):
        """Build the model from a mapping with keys d, layers, heads, vocab."""
        super().__init__()
        width = cfg["d"]
        depth = cfg["layers"]
        n_heads = cfg["heads"]
        vocab = cfg["vocab"]
        self.emb = nn.Embedding(vocab, width)
        self.blocks = nn.ModuleList([ByteBlock(width, n_heads) for _ in range(depth)])
        self.ln = nn.LayerNorm(width)
        self.head = nn.Linear(width, vocab, bias=False)
        # Tie input and output embeddings: both layers share one Parameter.
        self.head.weight = self.emb.weight

    def forward(self, x):
        """Map token ids of shape ``(B, N)`` to logits of shape ``(B, N, vocab)``."""
        _, seq_len = x.shape
        # Ones strictly above the diagonal mark "future" positions; scaling by
        # -1e9 turns them into an additive mask that zeroes their softmax weight.
        future = torch.ones(seq_len, seq_len, device=x.device).triu(1)
        mask = future * -1e9
        hidden = self.emb(x)
        for layer in self.blocks:
            hidden = layer(hidden, mask)
        logits = self.head(self.ln(hidden))
        return logits

    def count_params(self):
        """Return the total number of elements across ``self.parameters()``."""
        total = 0
        for param in self.parameters():
            total += param.numel()
        return total
|
|
class BinaryTrainer:
    """Online next-byte trainer fed one byte at a time.

    Bytes are kept in a sliding window of ``CONFIG["ctx"] + 1`` values.
    Every ``UPDATE_EVERY`` bytes the window is run through the model and one
    AdamW step is taken; statistics are printed every ``PRINT_EVERY`` bytes
    and a checkpoint is written every ``SAVE_EVERY`` bytes.
    """

    # Bytes between checkpoint writes.
    SAVE_EVERY = 500_000

    def __init__(self, model, lr=LR):
        """Move ``model`` to ``DEVICE`` and set up optimizer and counters.

        Args:
            model: Module mapping ``(1, N)`` long token ids to
                ``(1, N, vocab)`` logits.
            lr: AdamW learning rate.
        """
        self.model = model.to(DEVICE)
        self.opt = torch.optim.AdamW(self.model.parameters(), lr=lr)
        self.ctx_size = CONFIG["ctx"]
        # One extra slot so the window yields ctx_size (input, target) pairs.
        self.buffer = deque(maxlen=self.ctx_size + 1)

        self.bytes_seen = 0
        self.total_loss = 0.0  # sum of per-update losses since start
        self.updates = 0
        self.start_time = time.time()

    def ingest_byte(self, byte_val):
        """Absorb a single byte (0-255) and run any work that is now due.

        A training step runs when the byte count is a multiple of
        ``UPDATE_EVERY`` and the window already holds more than
        ``UPDATE_EVERY`` bytes, so the first step happens after
        ``2 * UPDATE_EVERY`` bytes.
        """
        self.buffer.append(byte_val)
        self.bytes_seen += 1

        if len(self.buffer) > UPDATE_EVERY and self.bytes_seen % UPDATE_EVERY == 0:
            self._update()

        if self.bytes_seen % PRINT_EVERY == 0:
            self._print_stats()

        # bytes_seen was just incremented, so it is always >= 1 here.
        if self.bytes_seen % self.SAVE_EVERY == 0:
            self._save()

    def _update(self):
        """Take one optimizer step on the current window.

        The whole window is fed as context, but the loss covers only the
        last ``UPDATE_EVERY`` positions, i.e. the bytes that arrived since
        the previous step.
        """
        window = torch.tensor(list(self.buffer), device=DEVICE, dtype=torch.long)
        x = window[:-1].unsqueeze(0)  # inputs:  bytes 0 .. n-2
        y = window[1:].unsqueeze(0)   # targets: bytes 1 .. n-1

        self.model.train()
        logits = self.model(x)
        recent = logits[:, -UPDATE_EVERY:]
        # Take the class count from the logits instead of assuming 256, so a
        # model built with a different vocab size still trains correctly.
        loss = F.cross_entropy(
            recent.reshape(-1, recent.size(-1)),
            y[:, -UPDATE_EVERY:].reshape(-1),
        )

        self.opt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
        self.opt.step()

        self.total_loss += loss.item()
        self.updates += 1

    def _print_stats(self):
        """Print elapsed time, volume, throughput and the running mean loss."""
        elapsed = time.time() - self.start_time
        rate = self.bytes_seen / elapsed if elapsed > 0 else 0
        avg_loss = self.total_loss / max(1, self.updates)
        mb = self.bytes_seen / 1_000_000

        # cross_entropy is measured in nats; divide by ln 2 for bits per byte.
        bpb = avg_loss / math.log(2)

        print(f"[{elapsed:.0f}s] {mb:.2f}MB | {rate/1000:.1f} KB/s | "
              f"loss={avg_loss:.3f} | bpb={bpb:.2f} | updates={self.updates}", flush=True)

    def _save(self):
        """Write model weights, byte count and mean loss to a checkpoint file.

        The file is named after the exact megabyte count. Whole-megabyte
        checkpoints keep the name ``byte_ckpt_<N>mb.pt``; in-between ones get
        a fractional label (e.g. ``byte_ckpt_1.5mb.pt``) so they no longer
        overwrite the preceding whole-megabyte file.
        """
        avg_loss = self.total_loss / max(1, self.updates)
        whole_mb, remainder = divmod(self.bytes_seen, 1_000_000)
        if remainder == 0:
            mb = str(whole_mb)
        else:
            # Exact decimal fraction without float rounding: 500000 -> "5".
            mb = f"{whole_mb}.{remainder:06d}".rstrip("0")
        ckpt = {
            "model": self.model.state_dict(),
            "bytes": self.bytes_seen,
            "loss": avg_loss,
        }
        torch.save(ckpt, f"byte_ckpt_{mb}mb.pt")
        print(f"[SAVED] {mb}MB checkpoint", flush=True)
|
|
def main():
    """Build the model and train it on raw bytes read from stdin until EOF."""
    print("BINARY TRANSFORMER - Raw bytes learning", flush=True)
    print(f"Config: {CONFIG}", flush=True)
    print(f"Device: {DEVICE}", flush=True)

    model = BinaryTransformer(CONFIG)
    params = model.count_params()
    print(f"Parameters: {params:,} ({params/1e6:.1f}M)", flush=True)
    print("Vocab: 256 (one per byte)", flush=True)

    trainer = BinaryTrainer(model)

    print("Listening for raw bytes on stdin...", flush=True)

    stream = sys.stdin.buffer
    while True:
        # read1() returns whatever is available (up to the limit) after at
        # most one underlying read, so a live stream is consumed promptly
        # without paying a Python-level call per byte.
        chunk = stream.read1(65536)
        if not chunk:  # b"" means end of stream
            break
        # Iterating bytes yields ints 0-255, in arrival order.
        for byte_val in chunk:
            trainer.ingest_byte(byte_val)

    print(f"Stream ended. Total bytes: {trainer.bytes_seen:,}", flush=True)


if __name__ == "__main__":
    main()
|
|