"""Substrate v5 LoRA — Qwen3.6-27B bf16 on H200. Generic MADCAT OS substrate training. NOT persona-specific. 40 curated examples: identity, tool categories, disambiguation, framing, EEMS deep-dive. All with blocks. bf16 LoRA (NOT QLoRA) — H200 143 GB has room for full precision. Usage: python3 train_v5.py """ import os os.environ["HF_HOME"] = "/workspace/models" from unsloth import FastLanguageModel from trl import SFTTrainer, SFTConfig import torch import json # ── Config ─────────────────────────────────────────────────────────── MODEL = "Qwen/Qwen3.6-27B" MAX_SEQ = 8192 RANK = 16 ALPHA = 16 DATA = "/workspace/substrate_v5.jsonl" OUT = "/workspace/substrate-qwen36-27b-lora-v5" EPOCHS = 3 BATCH = 1 GRAD_ACCUM = 4 # smaller dataset → smaller effective batch (4 vs 8) LR = 5e-5 WARMUP_RATIO = 0.1 # 10% warmup — small dataset benefits from longer warmup MAX_EXAMPLES = None # use all examples # ── Load model (bf16, NOT 4-bit) ──────────────────────────────────── print(f"Loading {MODEL}...") model, tokenizer = FastLanguageModel.from_pretrained( model_name=MODEL, max_seq_length=MAX_SEQ, load_in_4bit=False, load_in_16bit=True, full_finetuning=False, dtype=torch.bfloat16, ) print(f"Model loaded: {MODEL}") print(f" GPU: {torch.cuda.get_device_name(0)}") print(f" VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB") print(f" Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB") # ── LoRA adapter ──────────────────────────────────────────────────── model = FastLanguageModel.get_peft_model( model, r=RANK, lora_alpha=ALPHA, lora_dropout=0, target_modules=[ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", ], bias="none", use_gradient_checkpointing="unsloth", random_state=42, max_seq_length=MAX_SEQ, ) trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) total = sum(p.numel() for p in model.parameters()) print(f"LoRA: r={RANK}, alpha={ALPHA}") print(f" Trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)") # ── Dataset ───────────────────────────────────────────────────────── def fix_tool_calls(messages): """Parse tool_call arguments from JSON strings to dicts.""" fixed = [] for msg in messages: msg = dict(msg) if msg.get("tool_calls"): new_tcs = [] for tc in msg["tool_calls"]: tc = dict(tc) if "function" in tc: fn = dict(tc["function"]) if isinstance(fn.get("arguments"), str): try: fn["arguments"] = json.loads(fn["arguments"]) except (ValueError, TypeError): fn["arguments"] = {"raw": fn["arguments"]} tc["function"] = fn new_tcs.append(tc) msg["tool_calls"] = new_tcs fixed.append(msg) return fixed def load_and_format(path, max_examples=None): """Load JSONL and format with chat template.""" from datasets import Dataset _enc = tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer texts = [] skipped = 0 with open(path) as f: for i, line in enumerate(f): if max_examples and i >= max_examples: break line = line.strip() if not line: continue row = json.loads(line) messages = fix_tool_calls(row["messages"]) try: text = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=False, enable_thinking=True, ) except TypeError: text = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=False, ) tok_len = len(_enc.encode(text)) if tok_len <= MAX_SEQ: texts.append(text) else: skipped += 1 print(f" Skipped example {i}: {tok_len} tokens > {MAX_SEQ}") if skipped: print(f" Filtered {skipped} examples exceeding {MAX_SEQ} tokens") return Dataset.from_dict({"text": texts}) print(f"\nLoading dataset: {DATA}") ds = load_and_format(DATA, max_examples=MAX_EXAMPLES) steps = (len(ds) * EPOCHS) // (BATCH * GRAD_ACCUM) print(f" Loaded: {len(ds)} examples") print(f" Epochs: {EPOCHS}, effective batch: {BATCH * GRAD_ACCUM}") print(f" Estimated steps: {steps}") # ── Train ─────────────────────────────────────────────────────────── print(f"\nTraining...") print("=" * 60) trainer = SFTTrainer( model=model, tokenizer=tokenizer, train_dataset=ds, args=SFTConfig( output_dir=OUT, per_device_train_batch_size=BATCH, gradient_accumulation_steps=GRAD_ACCUM, num_train_epochs=EPOCHS, learning_rate=LR, bf16=True, logging_steps=1, save_steps=999999, # save only at end warmup_ratio=WARMUP_RATIO, optim="adamw_torch", # no adamw_8bit — bitsandbytes cu132 issue seed=42, report_to="none", max_seq_length=MAX_SEQ, dataset_num_proc=1, lr_scheduler_type="cosine", weight_decay=0.01, ), ) result = trainer.train() print("=" * 60) print(f"Training complete — loss: {result.training_loss:.4f}") print(f" Steps: {result.global_step}") print(f" Runtime: {result.metrics['train_runtime']:.0f}s") print(f" Peak VRAM: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB") # ── Save adapter ──────────────────────────────────────────────────── print(f"\nSaving adapter to {OUT}/") model.save_pretrained(OUT) tokenizer.save_pretrained(OUT) adapter_path = os.path.join(OUT, "adapter_model.safetensors") if os.path.exists(adapter_path): size_mb = os.path.getsize(adapter_path) / 1e6 print(f" Adapter: {size_mb:.1f} MB") else: print(" ERROR: adapter_model.safetensors not found") # ── Merge LoRA into base weights ──────────────────────────────────── MERGED = "/workspace/substrate-qwen36-27b-merged" print(f"\nMerging LoRA into base weights → {MERGED}/") model.save_pretrained_merged( MERGED, tokenizer, save_method="merged_16bit", ) # Verify merged model merged_files = [f for f in os.listdir(MERGED) if f.endswith(".safetensors")] merged_size_gb = sum(os.path.getsize(os.path.join(MERGED, f)) for f in merged_files) / 1e9 print(f" Merged: {len(merged_files)} safetensor files, {merged_size_gb:.1f} GB") print(f"\n{'=' * 60}") print("DONE — TRAIN + MERGE") print(f"{'=' * 60}") print(f"\n Model: {MODEL}") print(f" Examples: {len(ds)}") print(f" LoRA: r={RANK}, alpha={ALPHA}") print(f" Peak VRAM: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB") print(f" Adapter: {OUT}/") print(f" Merged: {MERGED}/") print(f"\nNext: python3 quantize_awq.py")