sync: 2026-03-16 18:00 — 21 files from Alexandria
Some checks failed
Lint & Format / detect (push) Has been cancelled
Lint & Format / js-lint (push) Has been cancelled
Lint & Format / py-lint (push) Has been cancelled
Lint & Format / sh-lint (push) Has been cancelled
Lint & Format / go-lint (push) Has been cancelled
Monorepo Lint / lint-shell (push) Has been cancelled
Monorepo Lint / lint-js (push) Has been cancelled

RoadChain-SHA2048: c316572452cf6246
RoadChain-Identity: alexa@sovereign
RoadChain-Full: c316572452cf6246294a9bf089ca177403d4187333252430e919cc2f55176f80722ae82807051dbc5772fa50a393bfc6ba5f1dc464d9ae0a435082d60da68a8f61c7521bc0fbc0509bf9e043035456b80763949d1e1e30cca278c3b264e78a417b62be8636feeda64580bd6eac8adede7fcb7c095eadc922a1057710484f98d08f347104bf31434f66ced3da243d480c58a4dfa0cc068ddfda54acadde593e3964f2a6ee69283eb62557638056c9e69be5cf70d8d9698b4c07f76c3f2d91e713cff5568eef41c51991648e8e28e2e0523cf04269cb5be393cae9250b285c968b97e330f5ddf91836d3f67555a147108c594274f1878fe4dc8f47ec4b2291b82f
This commit is contained in:
2026-03-16 18:00:02 -05:00
parent 24ed26bbc7
commit 6777292f6e
21 changed files with 346 additions and 142 deletions

View File

@@ -0,0 +1,203 @@
#!/usr/bin/env python3
"""
BlackRoad LLM v4 — Train 8M param transformer on corpus v4 (5.8MB)
Uses word-level tokenizer (fast) + MPS acceleration on Mac
"""
import torch
import torch.nn as nn
import torch.optim as optim
import time
import os
import math
from collections import Counter
# Paths for the training corpus and the saved checkpoint (both under ~/.blackroad).
CORPUS_PATH = os.path.expanduser("~/.blackroad/training-corpus-v4.txt")
MODEL_PATH = os.path.expanduser("~/.blackroad/blackroad-llm-8m.pt")
# Prefer the Apple-Silicon GPU backend (Metal Performance Shaders) when available.
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
# Model config — ~8M params
VOCAB_SIZE = 16000  # cap on word-level vocabulary size (includes 3 special tokens)
DIM = 256           # embedding / hidden width
NUM_LAYERS = 6      # transformer encoder layers
NUM_HEADS = 8       # attention heads (DIM must divide evenly by this)
SEQ_LEN = 128       # context window length in tokens
BATCH_SIZE = 16
EPOCHS = 3
LR = 3e-4           # AdamW learning rate (decayed by cosine schedule below)
print(f"BlackRoad LLM v4 Training")
print(f"Device: {DEVICE} | Params: ~8M | Corpus: v4")
print(f"Config: dim={DIM}, layers={NUM_LAYERS}, heads={NUM_HEADS}, vocab={VOCAB_SIZE}")
print("=" * 60)
# ============================================================================
# WORD-LEVEL TOKENIZER (fast — no BPE bottleneck)
# ============================================================================
print("Loading corpus...")
with open(CORPUS_PATH, 'r', encoding='utf-8') as f:
    text = f.read()
print(f" Corpus: {len(text):,} chars, {len(text.split()):,} words")
# Build vocabulary from word frequencies: the VOCAB_SIZE - 3 most common
# lowercased whitespace-split words, plus three reserved special tokens.
words = text.lower().split()
word_counts = Counter(words)
vocab_words = [w for w, _ in word_counts.most_common(VOCAB_SIZE - 3)]
word2id = {"<PAD>": 0, "<UNK>": 1, "<EOS>": 2}
for i, w in enumerate(vocab_words, start=3):
    word2id[w] = i
id2word = {v: k for k, v in word2id.items()}
# May be smaller than VOCAB_SIZE when the corpus has fewer unique words.
actual_vocab = len(word2id)
print(f" Vocabulary: {actual_vocab:,} tokens")
# Encode entire corpus; out-of-vocabulary words map to <UNK> (id 1).
print("Encoding corpus...")
token_ids = [word2id.get(w, 1) for w in words]
print(f" Encoded: {len(token_ids):,} tokens")
# Create sequences: the flat token stream is carved into SEQ_LEN-long chunks;
# the +1 keeps one extra token so targets can be shifted by one position.
num_sequences = (len(token_ids) - 1) // SEQ_LEN
data = torch.tensor(token_ids[:num_sequences * SEQ_LEN + 1], dtype=torch.long)
print(f" Sequences: {num_sequences:,} (len={SEQ_LEN})")
print()
# ============================================================================
# TRANSFORMER MODEL
# ============================================================================
class BlackRoadLLM(nn.Module):
    """Decoder-style language model built from nn.TransformerEncoder plus a
    causal attention mask, producing next-token logits over the word vocab.

    Reads module-level config globals (actual_vocab, DIM, NUM_HEADS,
    NUM_LAYERS, SEQ_LEN) captured at class-instantiation time.
    """

    def __init__(self):
        super().__init__()
        # Token embeddings plus learned absolute position embeddings.
        self.embedding = nn.Embedding(actual_vocab, DIM)
        self.pos_embedding = nn.Embedding(SEQ_LEN, DIM)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=DIM,
            nhead=NUM_HEADS,
            dim_feedforward=DIM * 4,  # standard 4x FFN expansion
            dropout=0.1,
            batch_first=True,   # inputs/outputs are (batch, seq, dim)
            norm_first=True,    # pre-LayerNorm variant
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=NUM_LAYERS)
        self.ln_f = nn.LayerNorm(DIM)
        # Output projection to vocab logits; no bias, weights NOT tied to embedding.
        self.head = nn.Linear(DIM, actual_vocab, bias=False)
        # Causal mask: True above the diagonal blocks attention to future
        # positions. Registered as a buffer so it moves with .to(DEVICE).
        self.register_buffer(
            "causal_mask",
            torch.triu(torch.ones(SEQ_LEN, SEQ_LEN), diagonal=1).bool()
        )

    def forward(self, x):
        """Return logits of shape (B, T, actual_vocab) for token ids x of shape (B, T).

        T may be any length up to SEQ_LEN; the causal mask is sliced to T.
        """
        B, T = x.shape
        pos = torch.arange(T, device=x.device).unsqueeze(0)
        x = self.embedding(x) + self.pos_embedding(pos)
        x = self.transformer(x, mask=self.causal_mask[:T, :T])
        x = self.ln_f(x)
        return self.head(x)
model = BlackRoadLLM().to(DEVICE)
num_params = sum(p.numel() for p in model.parameters())
print(f"Model: {num_params:,} parameters ({num_params/1e6:.1f}M)")
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01)
# Cosine decay over the approximate total number of optimizer steps.
# NOTE(review): T_max assumes num_sequences // BATCH_SIZE steps per epoch;
# confirm this matches the batch count actually produced by the training loop.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS * (num_sequences // BATCH_SIZE))
# ============================================================================
# TRAINING LOOP
# ============================================================================
print(f"\nTraining {EPOCHS} epochs...")
start_time = time.time()
for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0
    num_batches = 0
    # Shuffle sequence order each epoch so batch composition varies.
    perm = torch.randperm(num_sequences)
    # Step through full batches only. The stop bound is N - B + 1 (not N - B):
    # range(0, N - B, B) silently drops the final full batch whenever N is a
    # multiple of B, which also made the step count fall short of the
    # scheduler's T_max. Any leftover partial batch is still skipped.
    for i in range(0, num_sequences - BATCH_SIZE + 1, BATCH_SIZE):
        batch_idx = perm[i:i + BATCH_SIZE]
        # Build next-token-prediction pairs: targets are inputs shifted by one.
        inputs = torch.stack([data[idx * SEQ_LEN:(idx * SEQ_LEN) + SEQ_LEN] for idx in batch_idx]).to(DEVICE)
        targets = torch.stack([data[(idx * SEQ_LEN) + 1:(idx * SEQ_LEN) + SEQ_LEN + 1] for idx in batch_idx]).to(DEVICE)
        logits = model(inputs)
        # ignore_index=0 excludes <PAD> targets from the loss.
        loss = nn.functional.cross_entropy(logits.reshape(-1, actual_vocab), targets.reshape(-1), ignore_index=0)
        optimizer.zero_grad()
        loss.backward()
        # Clip gradient norm for stability before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        epoch_loss += loss.item()
        num_batches += 1
        if num_batches % 100 == 0:
            avg_loss = epoch_loss / num_batches
            # Clamp the exponent so early high losses can't overflow exp().
            ppl = math.exp(min(avg_loss, 20))
            elapsed = time.time() - start_time
            print(f" Epoch {epoch+1}/{EPOCHS} | Batch {num_batches} | Loss: {avg_loss:.4f} | PPL: {ppl:.1f} | Time: {elapsed:.0f}s")
    # End-of-epoch summary; max(..., 1) guards a zero-batch epoch.
    avg_loss = epoch_loss / max(num_batches, 1)
    ppl = math.exp(min(avg_loss, 20))
    elapsed = time.time() - start_time
    print(f" Epoch {epoch+1} DONE — Loss: {avg_loss:.4f} | PPL: {ppl:.1f} | Time: {elapsed:.0f}s")
    print()
total_time = time.time() - start_time
print(f"Training complete in {total_time:.0f}s ({total_time/60:.1f} min)")
# ============================================================================
# SAVE
# ============================================================================
# Checkpoint bundles the weights, both tokenizer tables, and the hyperparameter
# config so inference code can rebuild the model and vocab from this one file.
torch.save({
    'model_state_dict': model.state_dict(),
    'word2id': word2id,
    'id2word': id2word,
    'config': {
        'vocab_size': actual_vocab,
        'dim': DIM,
        'num_layers': NUM_LAYERS,
        'num_heads': NUM_HEADS,
        'seq_len': SEQ_LEN,
        'params': num_params,
        'corpus': 'v4',
        'corpus_size': len(text),
        # Local wall-clock timestamp (no timezone recorded).
        'trained': time.strftime('%Y-%m-%dT%H:%M:%S'),
    }
}, MODEL_PATH)
print(f"Saved to {MODEL_PATH}")
size_mb = os.path.getsize(MODEL_PATH) / (1024 * 1024)
print(f"Model size: {size_mb:.1f}MB")
# ============================================================================
# QUICK TEST — generate a few tokens
# ============================================================================
print("\nTest generation:")
model.eval()
prompts = ["blackroad", "the mesh network", "agents collaborate"]
for prompt in prompts:
    # Encode the prompt; out-of-vocab words become <UNK> (id 1).
    tokens = [word2id.get(w, 1) for w in prompt.lower().split()]
    with torch.no_grad():
        for _ in range(20):
            # Feed only the real tokens, cropped to the context window. The
            # original padded to SEQ_LEN with zeros, which was redundant (the
            # causal mask already ignores trailing positions) and computed a
            # negative pad length — an IndexError in pos_embedding — once the
            # sequence grew past SEQ_LEN.
            context = tokens[-SEQ_LEN:]
            x = torch.tensor([context], dtype=torch.long).to(DEVICE)
            logits = model(x)
            # Greedy decode from the last real position.
            next_token = logits[0, len(context) - 1].argmax().item()
            tokens.append(next_token)
    # Render, skipping special tokens (<PAD>=0, <UNK>=1, <EOS>=2).
    output = ' '.join(id2word.get(t, '<UNK>') for t in tokens if t > 2)
    # Separator added: the original f-string ran prompt and output together.
    print(f" '{prompt}' → {output[:100]}")