From 26e776db717246f420519d1f6038c969f9bbc08e Mon Sep 17 00:00:00 2001
From: marauder-actual
Date: Mon, 1 Jun 2026 03:52:55 +0200
Subject: [PATCH] add substrate v5 training and build scripts

---
 build_v5.py | 160 +++++++++++++++++++++++++++++++
 train_v5.py | 245 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 405 insertions(+)
 create mode 100644 build_v5.py
 create mode 100644 train_v5.py

diff --git a/build_v5.py b/build_v5.py
new file mode 100644
index 0000000..b018f72
--- /dev/null
+++ b/build_v5.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python3
+"""Build v5 training dataset from training/*.json files.
+
+Each file in training/ is a single JSON object with a "messages" array.
+This script validates, counts, and merges them into a single JSONL file.
+
+Usage:
+    python build_v5.py
+"""
+
+import json
+import os
+import sys
+from pathlib import Path
+
+TRAINING_DIR = Path(__file__).parent / "training"
+OUTPUT = Path(__file__).parent / "substrate_v5.jsonl"
+
+# Marker that opens a reasoning block in an assistant message.
+THINK_TAG = "<think>"
+
+
+def validate_example(ex: dict, filename: str) -> list[str]:
+    """Validate example structure.
+
+    Args:
+        ex: Parsed JSON content of one training file.
+        filename: Name used to prefix every warning.
+
+    Returns:
+        List of warnings; empty when nothing was flagged.
+    """
+    warnings = []
+    msgs = ex.get("messages") if isinstance(ex, dict) else None
+    if not msgs or not isinstance(msgs, list):
+        warnings.append(f"{filename} — no messages array")
+        return warnings
+
+    # .get() keeps a message without a "role" key from raising KeyError.
+    roles = [m.get("role") if isinstance(m, dict) else None for m in msgs]
+
+    if roles[0] != "system":
+        warnings.append(f"{filename} — first message is not system")
+    if "user" not in roles:
+        warnings.append(f"{filename} — no user message")
+    if "assistant" not in roles:
+        warnings.append(f"{filename} — no assistant message")
+
+    # Check for <think> blocks in assistant messages. The tag must be
+    # non-empty: "" in content is true for every string.
+    has_think = False
+    for role, m in zip(roles, msgs):
+        if role != "assistant":
+            continue
+        content = m.get("content") or ""
+        if isinstance(content, str) and THINK_TAG in content:
+            has_think = True
+            break
+
+    if not has_think:
+        warnings.append(f"{filename} — no <think> block in assistant response")
+
+    return warnings
+
+
+def main():
+    """Validate every training/*.json file and merge them into OUTPUT.
+
+    Prints a per-file summary, aggregate statistics and all validation
+    warnings. Files that are not valid JSON, or whose "messages" value is
+    not a non-empty list of objects, are reported and left out of the
+    output. Exits with status 1 when the training directory or its .json
+    files are missing.
+    """
+    if not TRAINING_DIR.exists():
+        print(f"ERROR: {TRAINING_DIR} not found")
+        sys.exit(1)
+
+    files = sorted(TRAINING_DIR.glob("*.json"))
+    if not files:
+        print(f"ERROR: no .json files in {TRAINING_DIR}")
+        sys.exit(1)
+
+    print("Building substrate v5 dataset")
+    print(f"Source: {TRAINING_DIR}/")
+    print(f"Output: {OUTPUT}")
+    print("=" * 50)
+
+    examples = []
+    all_warnings = []
+    tool_call_count = 0
+    direct_count = 0
+    think_count = 0
+    total_tool_calls = 0
+
+    for f in files:
+        try:
+            # Read as UTF-8 explicitly instead of the locale default.
+            with open(f, encoding="utf-8") as fh:
+                ex = json.load(fh)
+        except (json.JSONDecodeError, UnicodeDecodeError) as e:
+            print(f" ERROR: {f.name} — invalid JSON: {e}")
+            continue
+
+        warnings = validate_example(ex, f.name)
+        all_warnings.extend(warnings)
+
+        # The stats loop and train_v5.py both need a list of message
+        # objects; skip anything else instead of crashing on it.
+        msgs = ex.get("messages") if isinstance(ex, dict) else None
+        if not isinstance(msgs, list) or not msgs:
+            print(f" SKIP: {f.name} — no usable messages array")
+            continue
+        if not all(isinstance(m, dict) for m in msgs):
+            print(f" SKIP: {f.name} — non-object entry in messages")
+            continue
+
+        # Stats
+        has_tc = False
+        for m in msgs:
+            tool_calls = m.get("tool_calls")
+            if tool_calls:
+                has_tc = True
+                total_tool_calls += len(tool_calls)
+            content = m.get("content") or ""
+            is_assistant = m.get("role") == "assistant"
+            if is_assistant and isinstance(content, str) and THINK_TAG in content:
+                think_count += 1
+
+        if has_tc:
+            tool_call_count += 1
+        else:
+            direct_count += 1
+
+        examples.append(ex)
+        print(f" {f.name:<45} {'TC' if has_tc else 'direct':>6} {len(msgs):>2} msgs")
+
+    print(f"\n Total examples: {len(examples)}")
+    print(f" Tool-call examples: {tool_call_count}")
+    print(f" Direct examples: {direct_count}")
+    print(f" Total tool calls: {total_tool_calls}")
+    print(f" Think blocks: {think_count}")
+
+    if all_warnings:
+        print(f"\n Warnings ({len(all_warnings)}):")
+        for w in all_warnings:
+            print(f" {w}")
+
+    # Write JSONL. ensure_ascii=False emits raw non-ASCII characters, so
+    # the file encoding is pinned to UTF-8.
+    with open(OUTPUT, "w", encoding="utf-8") as out:
+        for ex in examples:
+            out.write(json.dumps(ex, ensure_ascii=False) + "\n")
+
+    size_kb = os.path.getsize(OUTPUT) / 1024
+    print(f"\n Wrote {OUTPUT.name} ({size_kb:.0f} KB, {len(examples)} examples)")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/train_v5.py b/train_v5.py
new file mode 100644
index 0000000..8031548
--- /dev/null
+++ b/train_v5.py
@@ -0,0 +1,245 @@
+"""Substrate v5 LoRA — Qwen3.6-27B bf16 on H200.
+
+Generic MADCAT OS substrate training. NOT persona-specific.
+40 curated examples: identity, tool categories, disambiguation,
+framing, EEMS deep-dive. All with <think> blocks.
+
+bf16 LoRA (NOT QLoRA) — H200 143 GB has room for full precision.
+
+Usage:
+    python3 train_v5.py
+"""
+
+import os
+# NOTE(review): presumably set before the Hugging Face imports below so
+# they pick up this cache location — confirm.
+os.environ["HF_HOME"] = "/workspace/models"
+
+from unsloth import FastLanguageModel
+from trl import SFTTrainer, SFTConfig
+import torch
+import json
+
+# ── Config ───────────────────────────────────────────────────────────
+MODEL = "Qwen/Qwen3.6-27B"
+MAX_SEQ = 8192
+RANK = 16
+ALPHA = 16
+DATA = "/workspace/substrate_v5.jsonl"
+OUT = "/workspace/substrate-qwen36-27b-lora-v5"
+EPOCHS = 3
+BATCH = 1
+GRAD_ACCUM = 4  # smaller dataset → smaller effective batch (4 vs 8)
+LR = 5e-5
+WARMUP_RATIO = 0.1  # 10% warmup — small dataset benefits from longer warmup
+MAX_EXAMPLES = None  # use all examples
+
+# ── Load model (bf16, NOT 4-bit) ────────────────────────────────────
+print(f"Loading {MODEL}...")
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=MODEL,
+    max_seq_length=MAX_SEQ,
+    load_in_4bit=False,
+    load_in_16bit=True,
+    full_finetuning=False,
+    dtype=torch.bfloat16,
+)
+
+print(f"Model loaded: {MODEL}")
+print(f" GPU: {torch.cuda.get_device_name(0)}")
+print(f" VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
+print(f" Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
+
+# ── LoRA adapter ────────────────────────────────────────────────────
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=RANK,
+    lora_alpha=ALPHA,
+    lora_dropout=0,
+    target_modules=[
+        "q_proj", "k_proj", "v_proj", "o_proj",
+        "gate_proj", "up_proj", "down_proj",
+    ],
+    bias="none",
+    use_gradient_checkpointing="unsloth",
+    random_state=42,
+    max_seq_length=MAX_SEQ,
+)
+
+trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+total = sum(p.numel() for p in model.parameters())
+print(f"LoRA: r={RANK}, alpha={ALPHA}")
+print(f" Trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")
+
+
+# ── Dataset ─────────────────────────────────────────────────────────
+def fix_tool_calls(messages):
+    """Parse tool_call arguments from JSON strings to dicts.
+
+    Works on shallow copies, so the input messages are not modified.
+    Argument strings that do not parse as JSON are wrapped as
+    {"raw": <original string>}.
+
+    Args:
+        messages: List of chat message dicts, optionally with "tool_calls".
+
+    Returns:
+        New list of message dicts with string arguments decoded.
+    """
+    fixed = []
+    for msg in messages:
+        msg = dict(msg)
+        if msg.get("tool_calls"):
+            new_tcs = []
+            for tc in msg["tool_calls"]:
+                tc = dict(tc)
+                if "function" in tc:
+                    fn = dict(tc["function"])
+                    if isinstance(fn.get("arguments"), str):
+                        try:
+                            fn["arguments"] = json.loads(fn["arguments"])
+                        except (ValueError, TypeError):
+                            fn["arguments"] = {"raw": fn["arguments"]}
+                    tc["function"] = fn
+                new_tcs.append(tc)
+            msg["tool_calls"] = new_tcs
+        fixed.append(msg)
+    return fixed
+
+
+def load_and_format(path, max_examples=None):
+    """Load JSONL and format with chat template.
+
+    Examples whose rendered text exceeds MAX_SEQ tokens are reported and
+    left out of the returned dataset.
+
+    Args:
+        path: JSONL file with one {"messages": [...]} object per line.
+        max_examples: Stop once the line index reaches this value; a
+            falsy value (None, 0) reads the whole file.
+
+    Returns:
+        datasets.Dataset with a single "text" column.
+    """
+    from datasets import Dataset
+    # Encode with the inner .tokenizer when the loaded object has one.
+    _enc = tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer
+    texts = []
+    skipped = 0
+    # build_v5.py writes UTF-8; do not depend on the locale default.
+    with open(path, encoding="utf-8") as f:
+        for i, line in enumerate(f):
+            if max_examples and i >= max_examples:
+                break
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            messages = fix_tool_calls(row["messages"])
+            try:
+                text = tokenizer.apply_chat_template(
+                    messages,
+                    tokenize=False,
+                    add_generation_prompt=False,
+                    enable_thinking=True,
+                )
+            except TypeError:
+                # Retry without enable_thinking when the call rejects it.
+                text = tokenizer.apply_chat_template(
+                    messages,
+                    tokenize=False,
+                    add_generation_prompt=False,
+                )
+            tok_len = len(_enc.encode(text))
+            if tok_len <= MAX_SEQ:
+                texts.append(text)
+            else:
+                skipped += 1
+                print(f" Skipped example {i}: {tok_len} tokens > {MAX_SEQ}")
+    if skipped:
+        print(f" Filtered {skipped} examples exceeding {MAX_SEQ} tokens")
+    return Dataset.from_dict({"text": texts})
+
+
+print(f"\nLoading dataset: {DATA}")
+ds = load_and_format(DATA, max_examples=MAX_EXAMPLES)
+if len(ds) == 0:
+    raise SystemExit(f"ERROR: no usable examples in {DATA}")
+
+steps = (len(ds) * EPOCHS) // (BATCH * GRAD_ACCUM)
+print(f" Loaded: {len(ds)} examples")
+print(f" Epochs: {EPOCHS}, effective batch: {BATCH * GRAD_ACCUM}")
+print(f" Estimated steps: {steps}")
+
+# ── Train ───────────────────────────────────────────────────────────
+print("\nTraining...")
+print("=" * 60)
+
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=ds,
+    args=SFTConfig(
+        output_dir=OUT,
+        per_device_train_batch_size=BATCH,
+        gradient_accumulation_steps=GRAD_ACCUM,
+        num_train_epochs=EPOCHS,
+        learning_rate=LR,
+        bf16=True,
+        logging_steps=1,
+        save_steps=999999,  # save only at end
+        warmup_ratio=WARMUP_RATIO,
+        optim="adamw_torch",  # no adamw_8bit — bitsandbytes cu132 issue
+        seed=42,
+        report_to="none",
+        max_seq_length=MAX_SEQ,
+        dataset_num_proc=1,
+        lr_scheduler_type="cosine",
+        weight_decay=0.01,
+    ),
+)
+
+result = trainer.train()
+
+print("=" * 60)
+print(f"Training complete — loss: {result.training_loss:.4f}")
+print(f" Steps: {result.global_step}")
+print(f" Runtime: {result.metrics['train_runtime']:.0f}s")
+print(f" Peak VRAM: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
+
+# ── Save adapter ────────────────────────────────────────────────────
+print(f"\nSaving adapter to {OUT}/")
+model.save_pretrained(OUT)
+tokenizer.save_pretrained(OUT)
+
+adapter_path = os.path.join(OUT, "adapter_model.safetensors")
+if os.path.exists(adapter_path):
+    size_mb = os.path.getsize(adapter_path) / 1e6
+    print(f" Adapter: {size_mb:.1f} MB")
+else:
+    print(" ERROR: adapter_model.safetensors not found")
+
+# ── Merge LoRA into base weights ────────────────────────────────────
+MERGED = "/workspace/substrate-qwen36-27b-merged"
+print(f"\nMerging LoRA into base weights → {MERGED}/")
+model.save_pretrained_merged(
+    MERGED,
+    tokenizer,
+    save_method="merged_16bit",
+)
+
+# Verify merged model
+merged_files = [f for f in os.listdir(MERGED) if f.endswith(".safetensors")]
+merged_size_gb = sum(os.path.getsize(os.path.join(MERGED, f)) for f in merged_files) / 1e9
+print(f" Merged: {len(merged_files)} safetensor files, {merged_size_gb:.1f} GB")
+
+print(f"\n{'=' * 60}")
+print("DONE — TRAIN + MERGE")
+print(f"{'=' * 60}")
+print(f"\n Model: {MODEL}")
+print(f" Examples: {len(ds)}")
+print(f" LoRA: r={RANK}, alpha={ALPHA}")
+print(f" Peak VRAM: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
+print(f" Adapter: {OUT}/")
+print(f" Merged: {MERGED}/")
+print("\nNext: python3 quantize_awq.py")