add training scripts: memory, specialist, mining, smoke test

2026-05-31 11:38:42 +02:00
parent df0d4a6eac
commit 4678816795
9 changed files with 2256 additions and 0 deletions
@@ -0,0 +1,186 @@
+"""LoRA training smoke test — Qwen3-0.6B on RTX 2000 Ada.
+
+Minimal training script to verify:
+  1. GPU access works
+  2. unsloth LoRA training pipeline works
+  3. Model saves correctly
+
+Usage:
+    # Inside madcat-ml container on junkpile:
+    python smoke_test.py
+
+Expected runtime: <5 min
+Expected VRAM: ~3-4 GB
+"""
+
+from unsloth import FastLanguageModel
+from trl import SFTTrainer, SFTConfig
+from datasets import load_dataset
+import torch
+import json
+import os
+
+# ── Config ──────────────────────────────────────────────────────────────
+# Model and adapter
+MODEL = "Qwen/Qwen3-0.6B"  # tiny base model, enough to exercise the pipeline
+MAX_SEQ = 2048             # token cap per example; longer ones are filtered out
+RANK = ALPHA = 8           # small LoRA rank, with alpha set equal to it
+
+# Data and output locations (relative to the working directory)
+DATA = "./bt7274_v4.jsonl"
+OUT = "./smoke-test-lora"
+MAX_EXAMPLES = 20          # only the first 20 examples are used
+
+# Optimisation schedule
+EPOCHS = 1                 # single pass over the data
+BATCH = 1
+GRAD_ACCUM = 2             # effective batch size is BATCH * GRAD_ACCUM
+LR = 1e-4
+
+# ── Load model (bf16, NOT 4-bit) ───────────────────────────────────────
+print("Loading model...")
+# 16-bit bf16 weights with 4-bit quantisation switched off. Full fine-tuning
+# stays disabled because only LoRA adapters are trained further down.
+_load_kwargs = {
+    "model_name": MODEL,
+    "max_seq_length": MAX_SEQ,
+    "load_in_4bit": False,
+    "load_in_16bit": True,
+    "full_finetuning": False,
+    "dtype": torch.bfloat16,
+}
+model, tokenizer = FastLanguageModel.from_pretrained(**_load_kwargs)
+
+print(f"✓ Model loaded: {MODEL}")
+# Report the device so the log shows which GPU (if any) the run landed on.
+_cuda_ok = torch.cuda.is_available()
+print(f"  CUDA available: {_cuda_ok}")
+if _cuda_ok:
+    print(f"  GPU: {torch.cuda.get_device_name(0)}")
+    _total_bytes = torch.cuda.get_device_properties(0).total_memory
+    print(f"  VRAM: {_total_bytes / 1e9:.2f} GB")
+
+# ── LoRA adapter ───────────────────────────────────────────────────────
+print("\nConfiguring LoRA...")
+# Adapters are attached to the attention (q/k/v/o) and MLP (gate/up/down)
+# projection layers; dropout and bias training are both disabled.
+_attn_targets = ["q_proj", "k_proj", "v_proj", "o_proj"]
+_mlp_targets = ["gate_proj", "up_proj", "down_proj"]
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=RANK,
+    lora_alpha=ALPHA,
+    lora_dropout=0,
+    bias="none",
+    target_modules=_attn_targets + _mlp_targets,
+    use_gradient_checkpointing="unsloth",
+    random_state=42,  # fixed seed for reproducible adapter initialisation
+    max_seq_length=MAX_SEQ,
+)
+
+print(f"✓ LoRA configured: r={RANK}, alpha={ALPHA}")
+
+# ── Dataset ────────────────────────────────────────────────────────────
+# Announce the source file up front; the helpers defined below do the actual
+# reading and chat-template formatting.
+print(f"\nLoading dataset: {DATA}")
+
+def fix_tool_calls(messages):
+    """Return copies of *messages* with tool-call arguments decoded from JSON.
+
+    Every message is shallow-copied, as are its tool calls and their
+    ``function`` entries, so the input objects are never modified. When a
+    function's ``arguments`` value is a string it is parsed as JSON; a string
+    that fails to parse is kept under ``{"raw": <original string>}``.
+    Non-string arguments and tool calls without a ``function`` key pass
+    through unchanged.
+    """
+
+    def _decode_call(call):
+        # Copy first so the caller's tool-call dict stays untouched.
+        call = dict(call)
+        if "function" not in call:
+            return call
+        spec = dict(call["function"])
+        raw = spec.get("arguments")
+        if isinstance(raw, str):
+            try:
+                spec["arguments"] = json.loads(raw)
+            except (ValueError, TypeError):
+                # Preserve unparseable text rather than dropping it.
+                spec["arguments"] = {"raw": raw}
+        call["function"] = spec
+        return call
+
+    converted = []
+    for original in messages:
+        message = dict(original)
+        if message.get("tool_calls"):
+            message["tool_calls"] = [
+                _decode_call(call) for call in message["tool_calls"]
+            ]
+        converted.append(message)
+    return converted
+
+def load_and_format(path, max_examples=None):
+    """Load a JSONL chat dataset and render each row with the chat template.
+
+    Each non-blank line is parsed as a JSON object, its ``"messages"`` entry
+    is normalised with ``fix_tool_calls`` and rendered to plain text by the
+    tokenizer's chat template. Rendered texts longer than ``MAX_SEQ`` tokens
+    are dropped, and the number dropped is reported.
+
+    Args:
+        path: Path to the JSONL file. It is read as UTF-8 regardless of the
+            process locale.
+        max_examples: Maximum number of non-blank rows to read. A falsy
+            value (``None`` or ``0``) reads the whole file. Blank lines do
+            not count towards the limit; rows later dropped for length do.
+
+    Returns:
+        A ``datasets.Dataset`` with a single ``"text"`` column.
+
+    Raises:
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+        KeyError: If a row has no ``"messages"`` key.
+    """
+    from datasets import Dataset
+
+    # Some tokenizer objects wrap the real text tokenizer in a `.tokenizer`
+    # attribute; use the inner one for token counting when it is present.
+    encoder = tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer
+    texts = []
+    skipped = 0
+    rows_read = 0
+
+    # Explicit UTF-8: JSONL chat data routinely contains non-ASCII text, and
+    # the locale default encoding is not guaranteed to be UTF-8.
+    with open(path, encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            # Count parsed rows, not raw lines, so blank lines cannot eat
+            # into the example budget.
+            if max_examples and rows_read >= max_examples:
+                break
+            rows_read += 1
+            row = json.loads(line)
+            messages = fix_tool_calls(row["messages"])
+            text = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=False,
+            )
+            if len(encoder.encode(text)) <= MAX_SEQ:
+                texts.append(text)
+            else:
+                skipped += 1
+
+    if skipped:
+        print(f"  ⚠ Filtered {skipped} examples exceeding {MAX_SEQ} tokens")
+
+    return Dataset.from_dict({"text": texts})
+
+ds = load_and_format(DATA, max_examples=MAX_EXAMPLES)
+
+# An empty dataset (empty file, or every example over the token cap) would
+# make the run either fail obscurely inside the trainer or "pass" without
+# training on anything, so stop here with a clear message instead.
+if len(ds) == 0:
+    raise SystemExit(
+        f"✗ ERROR: no usable training examples in {DATA} "
+        f"(limit {MAX_EXAMPLES} examples, max {MAX_SEQ} tokens each)"
+    )
+
+# Rough optimizer-step count: examples seen across all epochs divided by the
+# effective batch size. The trainer's own rounding may differ slightly.
+steps = (len(ds) * EPOCHS) // (BATCH * GRAD_ACCUM)
+print(f"✓ Dataset: {len(ds)} examples")
+print(f"  Epochs: {EPOCHS}")
+print(f"  Effective batch size: {BATCH * GRAD_ACCUM}")
+print(f"  Estimated steps: {steps}")
+
+# ── Train ──────────────────────────────────────────────────────────────
+print("\nStarting training...")
+print("=" * 60)
+
+# Hyperparameters come from the Config section. Logging is frequent because
+# the run is only a handful of steps, and external reporting is turned off.
+_sft_args = SFTConfig(
+    output_dir=OUT,
+    num_train_epochs=EPOCHS,
+    per_device_train_batch_size=BATCH,
+    gradient_accumulation_steps=GRAD_ACCUM,
+    learning_rate=LR,
+    warmup_ratio=0.1,
+    optim="adamw_torch",
+    bf16=True,
+    max_seq_length=MAX_SEQ,
+    logging_steps=2,
+    save_steps=999999,  # far beyond the run length: no mid-training checkpoints
+    report_to="none",
+    seed=42,
+    dataset_num_proc=1,
+)
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=ds,
+    args=_sft_args,
+)
+
+trainer.train()
+
+print("=" * 60)
+print("✓ Training complete")
+
+# ── Save adapter ───────────────────────────────────────────────────────
+print(f"\nSaving adapter to {OUT}/")
+model.save_pretrained(OUT)
+tokenizer.save_pretrained(OUT)
+
+# Verify saved files. "Model saves correctly" is one of the three things this
+# smoke test exists to check, so a missing adapter file must fail the run
+# (non-zero exit status) rather than fall through to the PASSED banner.
+adapter_path = os.path.join(OUT, "adapter_model.safetensors")
+if not os.path.exists(adapter_path):
+    print("✗ ERROR: adapter_model.safetensors not found")
+    print("\n" + "=" * 60)
+    print("SMOKE TEST FAILED")
+    print("=" * 60)
+    raise SystemExit(1)
+
+size_mb = os.path.getsize(adapter_path) / 1e6
+print(f"✓ Adapter saved: {size_mb:.2f} MB")
+
+print("\n" + "=" * 60)
+print("SMOKE TEST PASSED")
+print("=" * 60)
+print(f"\nAdapter location: {OUT}/")
+print(f"Model: {MODEL}")
+print(f"Examples: {len(ds)}")
+print(f"LoRA rank: {RANK}")