From 26e776db717246f420519d1f6038c969f9bbc08e Mon Sep 17 00:00:00 2001
From: marauder-actual <marauder@saiden.dev>
Date: Mon, 1 Jun 2026 03:52:55 +0200
Subject: [PATCH] add substrate v5 training and build scripts

---
 build_v5.py | 125 ++++++++++++++++++++++++++++++
 train_v5.py | 215 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 340 insertions(+)
 create mode 100644 build_v5.py
 create mode 100644 train_v5.py
diff --git a/build_v5.py b/build_v5.py
new file mode 100644
index 0000000..b018f72
--- /dev/null
+++ b/build_v5.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+"""Build v5 training dataset from training/*.json files.
+
+Each file in training/ is a single JSON object with a "messages" array.
+This script validates, counts, and merges them into a single JSONL file.
+
+Usage:
+    python build_v5.py
+"""
+
+import json
+import os
+import sys
+from pathlib import Path
+
+TRAINING_DIR = Path(__file__).parent / "training"  # input: one example per *.json file
+OUTPUT = Path(__file__).parent / "substrate_v5.jsonl"  # merged JSONL written by main()
+
+
+def validate_example(ex: dict, filename: str) -> list[str]:
+    """Check one training example and return human-readable warnings.
+
+    Structural problems are reported rather than raised: a non-object
+    example, a missing or empty "messages" list, and non-object message
+    entries all become warnings prefixed with *filename*. An empty list
+    means the example passed every check.
+    """
+    msgs = ex.get("messages") if isinstance(ex, dict) else None
+    if not msgs or not isinstance(msgs, list):
+        return [f"{filename} — no messages array"]
+
+    warnings = []
+    if any(not isinstance(m, dict) for m in msgs):
+        warnings.append(f"{filename} — non-object entry in messages")
+    # Non-object entries get role None so the checks below cannot raise.
+    roles = [m.get("role") if isinstance(m, dict) else None for m in msgs]
+
+    if roles[0] != "system":
+        warnings.append(f"{filename} — first message is not system")
+    if "user" not in roles:
+        warnings.append(f"{filename} — no user message")
+    if "assistant" not in roles:
+        warnings.append(f"{filename} — no assistant message")
+
+    # At least one assistant turn must carry a <think> block.
+    if not any(r == "assistant" and "<think>" in (m.get("content") or "")
+               for r, m in zip(roles, msgs)):
+        warnings.append(f"{filename} — no <think> block in assistant response")
+
+    return warnings
+
+
+def main():
+    """Validate every training/*.json file and merge them into OUTPUT.
+
+    Files that are not valid UTF-8 JSON, or that lack a usable "messages"
+    list, are reported and left out of the JSONL instead of aborting the
+    build. Exits with status 1 if TRAINING_DIR or its .json files are missing.
+    """
+    if not TRAINING_DIR.exists():
+        print(f"ERROR: {TRAINING_DIR} not found")
+        sys.exit(1)
+
+    files = sorted(TRAINING_DIR.glob("*.json"))
+    if not files:
+        print(f"ERROR: no .json files in {TRAINING_DIR}")
+        sys.exit(1)
+
+    print("Building substrate v5 dataset")
+    print(f"Source: {TRAINING_DIR}/")
+    print(f"Output: {OUTPUT}")
+    print("=" * 50)
+
+    examples = []
+    all_warnings = []
+    tool_call_count = think_count = total_tool_calls = 0
+
+    for f in files:
+        try:
+            # Explicit UTF-8: the locale default may be a different codec.
+            ex = json.loads(f.read_text(encoding="utf-8"))
+        except (json.JSONDecodeError, UnicodeDecodeError) as e:
+            print(f"  ERROR: {f.name} — invalid JSON: {e}")
+            continue
+
+        msgs = ex.get("messages") if isinstance(ex, dict) else None
+        usable = isinstance(msgs, list) and all(isinstance(m, dict) for m in msgs)
+        if not msgs or not usable:
+            # Indexing ex["messages"] here would raise and abort the build.
+            all_warnings.append(f"{f.name} — no usable messages, skipped")
+            continue
+        all_warnings.extend(validate_example(ex, f.name))
+
+        # Stats
+        file_calls = sum(len(m.get("tool_calls") or ()) for m in msgs)
+        has_tc = file_calls > 0
+        total_tool_calls += file_calls
+        tool_call_count += 1 if has_tc else 0
+        think_count += sum(m.get("role") == "assistant"
+                           and "<think>" in (m.get("content") or "") for m in msgs)
+
+        examples.append(ex)
+        print(f"  {f.name:<45} {'TC' if has_tc else 'direct':>6}  {len(msgs):>2} msgs")
+
+    print(f"\n  Total examples:     {len(examples)}")
+    print(f"  Tool-call examples: {tool_call_count}")
+    print(f"  Direct examples:    {len(examples) - tool_call_count}")
+    print(f"  Total tool calls:   {total_tool_calls}")
+    print(f"  Think blocks:       {think_count}")
+
+    if all_warnings:
+        print(f"\n  Warnings ({len(all_warnings)}):")
+        for w in all_warnings:
+            print(f"    {w}")
+
+    # Write JSONL
+    with open(OUTPUT, "w", encoding="utf-8") as out:
+        for ex in examples:
+            out.write(json.dumps(ex, ensure_ascii=False) + "\n")
+
+    size_kb = os.path.getsize(OUTPUT) / 1024
+    print(f"\n  Wrote {OUTPUT.name} ({size_kb:.0f} KB, {len(examples)} examples)")
+
+if __name__ == "__main__":  # run the build only when executed as a script, not on import
+    main()
diff --git a/train_v5.py b/train_v5.py
new file mode 100644
index 0000000..8031548
--- /dev/null
+++ b/train_v5.py
@@ -0,0 +1,215 @@
+"""Substrate v5 LoRA — Qwen3.6-27B bf16 on H200.
+
+Generic MADCAT OS substrate training. NOT persona-specific.
+40 curated examples: identity, tool categories, disambiguation,
+framing, EEMS deep-dive. All with <think> blocks.
+
+bf16 LoRA (NOT QLoRA) — H200 143 GB has room for full precision.
+
+Usage:
+    python3 train_v5.py
+"""
+
+import os
+os.environ["HF_HOME"] = "/workspace/models"
+
+from unsloth import FastLanguageModel
+from trl import SFTTrainer, SFTConfig
+import torch
+import json
+
+# ── Config ───────────────────────────────────────────────────────────
+MODEL = "Qwen/Qwen3.6-27B"  # model id passed to FastLanguageModel.from_pretrained
+MAX_SEQ = 8192  # token limit shared by model load, LoRA setup, SFTConfig and the dataset filter
+RANK = 16  # LoRA r
+ALPHA = 16  # lora_alpha
+DATA = "/workspace/substrate_v5.jsonl"  # JSONL read by load_and_format()
+OUT = "/workspace/substrate-qwen36-27b-lora-v5"  # trainer output_dir; adapter and tokenizer are saved here
+EPOCHS = 3
+BATCH = 1  # per_device_train_batch_size
+GRAD_ACCUM = 4        # smaller dataset → smaller effective batch (4 vs 8)
+LR = 5e-5
+WARMUP_RATIO = 0.1    # 10% warmup — small dataset benefits from longer warmup
+MAX_EXAMPLES = None    # use all examples
+
+# ── Load model (bf16, NOT 4-bit) ────────────────────────────────────
+print(f"Loading {MODEL}...")
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=MODEL,
+    max_seq_length=MAX_SEQ,
+    load_in_4bit=False,  # no 4-bit quantization: plain LoRA, not QLoRA
+    load_in_16bit=True,
+    full_finetuning=False,  # adapter training, not full fine-tuning
+    dtype=torch.bfloat16,
+)
+
+print(f"Model loaded: {MODEL}")
+print(f"  GPU: {torch.cuda.get_device_name(0)}")  # device index 0 only
+print(f"  VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")  # decimal GB (1e9 bytes)
+print(f"  Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")  # memory in use right after loading
+
+# ── LoRA adapter ────────────────────────────────────────────────────
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=RANK,
+    lora_alpha=ALPHA,
+    lora_dropout=0,
+    target_modules=[  # module names that receive LoRA weights
+        "q_proj", "k_proj", "v_proj", "o_proj",
+        "gate_proj", "up_proj", "down_proj",
+    ],
+    bias="none",
+    use_gradient_checkpointing="unsloth",
+    random_state=42,  # same value as the SFTConfig seed used for training
+    max_seq_length=MAX_SEQ,
+)
+
+trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)  # elements with requires_grad set
+total = sum(p.numel() for p in model.parameters())  # every parameter element, trainable or not
+print(f"LoRA: r={RANK}, alpha={ALPHA}")
+print(f"  Trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")
+
+
+# ── Dataset ─────────────────────────────────────────────────────────
+def fix_tool_calls(messages):
+    """Return copies of messages with string tool-call arguments JSON-decoded.
+
+    Undecodable strings become {"raw": <string>}; inputs are not mutated.
+    """
+    def _decoded(call):
+        call = dict(call)
+        if "function" in call:
+            func = call["function"] = dict(call["function"])
+            raw = func.get("arguments")
+            if isinstance(raw, str):
+                try:
+                    func["arguments"] = json.loads(raw)
+                except (ValueError, TypeError):
+                    func["arguments"] = {"raw": raw}
+        return call
+
+    copies = [dict(m) for m in messages]
+    for m in (c for c in copies if c.get("tool_calls")):
+        m["tool_calls"] = [_decoded(tc) for tc in m["tool_calls"]]
+    return copies
+
+
+def load_and_format(path, max_examples=None):
+    """Load a JSONL chat dataset and render each row with the chat template.
+
+    Reads *path* as UTF-8; each non-blank line is an object with a
+    "messages" list. *max_examples* caps the non-blank rows read (None or
+    0 reads all). Rows that encode to more than MAX_SEQ tokens are reported
+    and dropped. Returns a datasets.Dataset with one "text" column.
+    """
+    from datasets import Dataset
+    _enc = tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer
+    render = {"tokenize": False, "add_generation_prompt": False}
+    texts = []
+    skipped = seen = 0
+    # JSON text is UTF-8; do not fall back to the locale's default codec.
+    with open(path, encoding="utf-8") as f:
+        for i, line in enumerate(f):
+            line = line.strip()
+            if not line:
+                continue
+            if max_examples and seen >= max_examples:
+                break  # blank lines no longer count toward the cap
+            seen += 1
+            messages = fix_tool_calls(json.loads(line)["messages"])
+            try:
+                text = tokenizer.apply_chat_template(messages, enable_thinking=True, **render)
+            except TypeError:  # presumably a tokenizer that rejects enable_thinking
+                text = tokenizer.apply_chat_template(messages, **render)
+            # Length is measured on the rendered text, template markup included.
+            tok_len = len(_enc.encode(text))
+            if tok_len <= MAX_SEQ:
+                texts.append(text)
+            else:
+                skipped += 1
+                print(f"  Skipped example {i}: {tok_len} tokens > {MAX_SEQ}")
+    if skipped:
+        print(f"  Filtered {skipped} examples exceeding {MAX_SEQ} tokens")
+    return Dataset.from_dict({"text": texts})
+
+
+print(f"\nLoading dataset: {DATA}")
+ds = load_and_format(DATA, max_examples=MAX_EXAMPLES)  # rows over MAX_SEQ tokens are already dropped
+
+steps = (len(ds) * EPOCHS) // (BATCH * GRAD_ACCUM)  # display-only estimate; the real count is result.global_step
+print(f"  Loaded: {len(ds)} examples")
+print(f"  Epochs: {EPOCHS}, effective batch: {BATCH * GRAD_ACCUM}")
+print(f"  Estimated steps: {steps}")
+
+# ── Train ───────────────────────────────────────────────────────────
+print(f"\nTraining...")
+print("=" * 60)
+
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=ds,  # single "text" column of chat-template-rendered strings
+    args=SFTConfig(
+        output_dir=OUT,
+        per_device_train_batch_size=BATCH,
+        gradient_accumulation_steps=GRAD_ACCUM,
+        num_train_epochs=EPOCHS,
+        learning_rate=LR,
+        bf16=True,
+        logging_steps=1,
+        save_steps=999999,          # intended to suppress periodic checkpoints; adapter is saved explicitly after train()
+        warmup_ratio=WARMUP_RATIO,
+        optim="adamw_torch",        # no adamw_8bit — bitsandbytes cu132 issue
+        seed=42,
+        report_to="none",
+        max_seq_length=MAX_SEQ,
+        dataset_num_proc=1,
+        lr_scheduler_type="cosine",
+        weight_decay=0.01,
+    ),
+)
+
+result = trainer.train()  # the return value supplies training_loss, global_step and metrics below
+
+print("=" * 60)
+print(f"Training complete — loss: {result.training_loss:.4f}")
+print(f"  Steps: {result.global_step}")
+print(f"  Runtime: {result.metrics['train_runtime']:.0f}s")
+print(f"  Peak VRAM: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
+
+# ── Save adapter ────────────────────────────────────────────────────
+print(f"\nSaving adapter to {OUT}/")
+model.save_pretrained(OUT)
+tokenizer.save_pretrained(OUT)
+
+adapter_path = os.path.join(OUT, "adapter_model.safetensors")  # file expected from save_pretrained above
+if os.path.exists(adapter_path):
+    size_mb = os.path.getsize(adapter_path) / 1e6
+    print(f"  Adapter: {size_mb:.1f} MB")
+else:
+    print("  ERROR: adapter_model.safetensors not found")  # reported only; the script still goes on to merge
+
+# ── Merge LoRA into base weights ────────────────────────────────────
+MERGED = "/workspace/substrate-qwen36-27b-merged"
+print(f"\nMerging LoRA into base weights → {MERGED}/")
+model.save_pretrained_merged(
+    MERGED,
+    tokenizer,
+    save_method="merged_16bit",
+)
+
+# Verify merged model: count the .safetensors files in MERGED and total their size
+merged_files = [f for f in os.listdir(MERGED) if f.endswith(".safetensors")]
+merged_size_gb = sum(os.path.getsize(os.path.join(MERGED, f)) for f in merged_files) / 1e9
+print(f"  Merged: {len(merged_files)} safetensor files, {merged_size_gb:.1f} GB")
+
+print(f"\n{'=' * 60}")
+print("DONE — TRAIN + MERGE")
+print(f"{'=' * 60}")
+print(f"\n  Model:     {MODEL}")
+print(f"  Examples:  {len(ds)}")  # rows kept after the MAX_SEQ filter
+print(f"  LoRA:      r={RANK}, alpha={ALPHA}")
+print(f"  Peak VRAM: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
+print(f"  Adapter:   {OUT}/")
+print(f"  Merged:    {MERGED}/")
+print(f"\nNext: python3 quantize_awq.py")  # hint for the operator; nothing is run here