# lora/smoke_test.py

"""LoRA training smoke test — Qwen3-0.6B on RTX 2000 Ada.

Minimal training script to verify:
  1. GPU access works
  2. unsloth LoRA training pipeline works
  3. Model saves correctly

Usage:
    # Inside madcat-ml container on junkpile:
    python smoke_test.py

Expected runtime: <5 min
Expected VRAM: ~3-4 GB
"""

from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
import torch
import json
import os

# ── Config ──────────────────────────────────────────────────────────────
# Every tunable for the smoke test lives here. The values are deliberately
# small; the rest of the script only reads these names.
MODEL = "Qwen/Qwen3-0.6B"        # Tiny model for smoke testing
MAX_SEQ = 2048                    # Token cap: given to model load, LoRA setup and trainer; longer examples are filtered out
RANK = 8                          # Small LoRA rank
ALPHA = 8                         # LoRA alpha (set equal to RANK here)
DATA = "./bt7274_v4.jsonl"        # Training data: JSONL, each row carrying a "messages" list
OUT = "./smoke-test-lora"         # Trainer output_dir; adapter and tokenizer are saved here too
EPOCHS = 1                        # Single epoch
BATCH = 1                         # Per-device train batch size
GRAD_ACCUM = 2                    # Minimal effective batch (BATCH * GRAD_ACCUM)
LR = 1e-4                         # Learning rate handed to the trainer config
MAX_EXAMPLES = 20                 # Only use first 20 examples

# ── Load model (bf16, NOT 4-bit) ───────────────────────────────────────
print("Loading model...")

# Loader options gathered in one mapping: 16-bit bfloat16 weights, with both
# 4-bit quantisation and full fine-tuning switched off.
_load_options = {
    "model_name": MODEL,
    "max_seq_length": MAX_SEQ,
    "load_in_4bit": False,
    "load_in_16bit": True,
    "full_finetuning": False,
    "dtype": torch.bfloat16,
}
model, tokenizer = FastLanguageModel.from_pretrained(**_load_options)

# Report what torch can see: CUDA availability, then (only when a device is
# present) the name and total memory of device 0.
print(f"✓ Model loaded: {MODEL}")
print(f"  CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"  GPU: {torch.cuda.get_device_name(0)}")
    print(f"  VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# ── LoRA adapter ───────────────────────────────────────────────────────
print("\nConfiguring LoRA...")

# Adapters are attached to the attention projections (q/k/v/o) and to the
# MLP projections (gate/up/down); the two groups are concatenated below.
_attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
_mlp_projections = ["gate_proj", "up_proj", "down_proj"]

# Wrap the base model with LoRA adapters; `model` is rebound to the wrapped
# model returned by unsloth.
model = FastLanguageModel.get_peft_model(
    model,
    r=RANK,
    lora_alpha=ALPHA,
    lora_dropout=0,
    target_modules=_attention_projections + _mlp_projections,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
    max_seq_length=MAX_SEQ,
)

print(f"✓ LoRA configured: r={RANK}, alpha={ALPHA}")

# ── Dataset ────────────────────────────────────────────────────────────
# Announce the source file up front; the file itself is only read further
# down, after the helper functions below have been defined.
print(f"\nLoading dataset: {DATA}")

def fix_tool_calls(messages):
    """Return copies of *messages* with tool-call arguments decoded from JSON.

    For every message that has a non-empty ``"tool_calls"`` list, each call's
    ``function["arguments"]`` is parsed with ``json.loads`` when it is a
    string. A string that is not valid JSON is kept, wrapped as
    ``{"raw": <original string>}``. Arguments that are not strings are left
    as they are, as are calls without a ``"function"`` key.

    The input is not modified: messages, tool calls and function dicts are
    shallow-copied before being changed.
    """

    def _decode_call(call):
        # Copy first so the caller's tool-call dict stays untouched.
        call_copy = dict(call)
        if "function" not in call_copy:
            return call_copy

        func = dict(call_copy["function"])
        raw_args = func.get("arguments")
        if isinstance(raw_args, str):
            try:
                func["arguments"] = json.loads(raw_args)
            except (ValueError, TypeError):
                # Unparseable payload: preserve the text under a "raw" key.
                func["arguments"] = {"raw": raw_args}
        call_copy["function"] = func
        return call_copy

    result = []
    for message in messages:
        message_copy = dict(message)
        calls = message_copy.get("tool_calls")
        if calls:
            message_copy["tool_calls"] = [_decode_call(c) for c in calls]
        result.append(message_copy)
    return result

def load_and_format(path, max_examples=None):
    """Load a JSONL chat dataset and render each row with the chat template.

    Every non-blank line of *path* is parsed as a JSON object, its
    ``"messages"`` list is passed through ``fix_tool_calls`` and rendered to
    a single string with ``tokenizer.apply_chat_template``. Rendered texts
    whose encoded length exceeds ``MAX_SEQ`` tokens are dropped, and the
    number dropped is printed.

    Args:
        path: Path to the JSONL file. It is read as UTF-8.
        max_examples: Maximum number of examples (non-blank lines) to read
            from the start of the file. A falsy value (``None`` or ``0``)
            means no limit. Examples that are later dropped for length still
            count towards this limit.

    Returns:
        A ``datasets.Dataset`` with one ``"text"`` column holding the
        rendered examples that fit within ``MAX_SEQ`` tokens.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a row has no ``"messages"`` key.
    """
    from datasets import Dataset

    # If the loaded object wraps a text tokenizer in a `.tokenizer`
    # attribute, use that one for measuring token length.
    _enc = tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer
    texts = []
    skipped = 0
    seen = 0  # Examples (non-blank lines) consumed so far.

    # Read explicitly as UTF-8: the platform default follows the locale and
    # can fail on non-ASCII text under a C/POSIX locale.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines are not examples and must not use up the budget.
                continue
            if max_examples and seen >= max_examples:
                break
            seen += 1

            row = json.loads(line)
            messages = fix_tool_calls(row["messages"])
            text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=False,
            )
            if len(_enc.encode(text)) <= MAX_SEQ:
                texts.append(text)
            else:
                skipped += 1

    if skipped:
        print(f"  ⚠ Filtered {skipped} examples exceeding {MAX_SEQ} tokens")

    return Dataset.from_dict({"text": texts})

ds = load_and_format(DATA, max_examples=MAX_EXAMPLES)

# Fail fast when nothing survived loading. With no examples the training
# pipeline is not exercised at all, so letting the run continue could only
# produce a meaningless result.
if len(ds) == 0:
    raise SystemExit(
        f"✗ ERROR: no usable training examples in {DATA} "
        f"(file is empty, or every example exceeds {MAX_SEQ} tokens)"
    )

# Rough optimizer-step count: examples seen across all epochs divided by the
# effective batch size (integer division, so this is only an estimate).
steps = (len(ds) * EPOCHS) // (BATCH * GRAD_ACCUM)
print(f"✓ Dataset: {len(ds)} examples")
print(f"  Epochs: {EPOCHS}")
print(f"  Effective batch size: {BATCH * GRAD_ACCUM}")
print(f"  Estimated steps: {steps}")

# ── Train ──────────────────────────────────────────────────────────────
print("\nStarting training...")
print("=" * 60)

# Trainer configuration, built on its own so the SFTTrainer call stays short.
_sft_args = SFTConfig(
    output_dir=OUT,
    per_device_train_batch_size=BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    bf16=True,
    logging_steps=2,
    save_steps=999999,  # Don't save checkpoints during training
    warmup_ratio=0.1,
    optim="adamw_torch",
    seed=42,
    report_to="none",
    max_seq_length=MAX_SEQ,
    dataset_num_proc=1,
)

# Supervised fine-tuning over the pre-rendered "text" dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=ds,
    args=_sft_args,
)

trainer.train()

print("=" * 60)
print("✓ Training complete")

# ── Save adapter ───────────────────────────────────────────────────────
print(f"\nSaving adapter to {OUT}/")
model.save_pretrained(OUT)
tokenizer.save_pretrained(OUT)

# Verify saved files. A missing adapter file means the "model saves
# correctly" check failed, so report failure and exit non-zero rather than
# falling through to the PASSED banner.
adapter_path = os.path.join(OUT, "adapter_model.safetensors")
if not os.path.exists(adapter_path):
    print("✗ ERROR: adapter_model.safetensors not found")
    print("\n" + "=" * 60)
    print("SMOKE TEST FAILED")
    print("=" * 60)
    raise SystemExit(1)

size_mb = os.path.getsize(adapter_path) / 1e6
print(f"✓ Adapter saved: {size_mb:.2f} MB")

# Final summary, reached only when the adapter file exists on disk.
print("\n" + "=" * 60)
print("SMOKE TEST PASSED")
print("=" * 60)
print(f"\nAdapter location: {OUT}/")
print(f"Model: {MODEL}")
print(f"Examples: {len(ds)}")
print(f"LoRA rank: {RANK}")