add training scripts: memory, specialist, mining, smoke test

2026-05-31 11:38:42 +02:00
parent df0d4a6eac
commit 4678816795
9 changed files with 2256 additions and 0 deletions
@@ -0,0 +1,516 @@
+#!/usr/bin/env python3
+"""Extract specialist training data from opencode session DB.
+
+Classifies build-agent messages by programming language and outputs
+per-specialist JSONL files for LoRA training.
+
+opencode DB schema:
+  - session: id, agent, title, time_created, ...
+  - message: id, session_id, data (JSON: role, finish, tokens, ...)
+  - part: id, message_id, session_id, data (JSON: type, text/tool/state, ...)
+
+Part types:
+  - text: {type: "text", text: "..."}
+  - tool: {type: "tool", tool: "read", callID: "...", state: {status, input, output, ...}}
+  - step-start/step-finish: inference step boundaries
+  - reasoning: chain-of-thought (skip for training)
+  - patch: file diffs (skip — use tool output instead)
+  - compaction: summary (skip)
+
+Usage:
+    python extract_specialists.py [--db PATH] [--outdir data/] [--min-turns N] [--max-turns N]
+    python extract_specialists.py --lang python --outdir data/  # single language
+"""
+
+import argparse
+import json
+import sqlite3
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+# ── Language classification signals ──────────────────────────────────
+
+# Per-language substring signals consumed by classify_text(). Matching there is
+# done on lowercased text, with these weights:
+#   "extensions": +3.0 per occurrence
+#   "files":      +5.0 once if present
+#   "commands":   +2.0 per occurrence
+#   "errors":     +4.0 once if present
+# NOTE(review): several entries overlap as substrings (".ts" inside ".tsx",
+# ".py" inside ".pyi", "cargo " inside "cargo build"), so a single occurrence
+# can score more than once — confirm this extra weighting is intended.
+LANG_SIGNALS: dict[str, dict[str, list[str]]] = {
+    "rust": {
+        "extensions": [".rs"],
+        "files": ["Cargo.toml", "Cargo.lock", "build.rs", "clippy.toml", "rustfmt.toml"],
+        "commands": ["cargo ", "cargo build", "cargo test", "cargo clippy", "cargo fmt",
+                     "cargo add", "rustc ", "rustup "],
+        "errors": ["error[E", "rustc --explain", "cannot find value", "expected struct",
+                   "borrow checker"],
+    },
+    "typescript": {
+        "extensions": [".ts", ".tsx", ".mts", ".cts"],
+        "files": ["tsconfig.json", "package.json", "bun.lockb", "pnpm-lock.yaml",
+                  "next.config", "vite.config", "astro.config"],
+        "commands": ["npm ", "pnpm ", "bun ", "npx ", "tsc ", "vitest ", "jest ",
+                     "biome ", "eslint "],
+        "errors": ["error TS", "TS2", "TS7", "Cannot find module", "Type '"],
+    },
+    "python": {
+        "extensions": [".py", ".pyi"],
+        "files": ["pyproject.toml", "setup.py", "setup.cfg", "requirements.txt",
+                  "ruff.toml", "mypy.ini", ".flake8", "noxfile.py", "tox.ini"],
+        "commands": ["python ", "python3 ", "pip ", "uv ", "pytest ", "ruff ", "mypy ",
+                     "uvicorn ", "gunicorn "],
+        "errors": ["Traceback (most recent", "SyntaxError", "ImportError",
+                   "TypeError", "ModuleNotFoundError"],
+    },
+    "ruby": {
+        "extensions": [".rb", ".erb", ".haml", ".slim"],
+        "files": ["Gemfile", "Gemfile.lock", "Rakefile", ".ruby-version",
+                  ".rubocop.yml", ".standard.yml"],
+        "commands": ["bundle ", "rails ", "rake ", "rspec ", "rubocop ",
+                     "standardrb ", "gem "],
+        "errors": ["NoMethodError", "NameError", "ArgumentError",
+                   "ActiveRecord::", "undefined method"],
+    },
+    "swift": {
+        "extensions": [".swift"],
+        "files": ["Package.swift", "project.yml", ".xcodeproj", ".xcworkspace"],
+        "commands": ["swift build", "swift test", "swift run", "xcodebuild ",
+                     "swift-format ", "swift package "],
+        "errors": ["cannot convert value of type", "protocol conformance",
+                   "value of type", "has no member"],
+    },
+}
+
+# Adapter codenames: language key -> name used in the report and as the
+# output file stem ("<name>.jsonl") by main().
+LANG_TO_NAME = {
+    "rust": "oxidizer",
+    "typescript": "prism",
+    "python": "serpent",
+    "ruby": "forge",
+    "swift": "swiftblade",
+}
+
+# System prompts per specialist, keyed by language. Starts empty; filled in
+# place by load_system_prompts() and read by format_example(), which falls
+# back to a generic one-line prompt for languages that are not present.
+SYSTEM_PROMPTS: dict[str, str] = {}
+
+
+def load_system_prompts(agents_dir: Path) -> None:
+    """Load agent system prompts from markdown files."""
+    mapping = {
+        "rust": "build-rust.md",
+        "typescript": "build-ts.md",
+        "python": "build-python.md",
+        "ruby": "build-ruby.md",
+        "swift": "build-swift.md",
+    }
+    for lang, filename in mapping.items():
+        path = agents_dir / filename
+        if path.exists():
+            SYSTEM_PROMPTS[lang] = path.read_text().strip()
+        else:
+            print(f"  WARN: {path} not found, using default prompt for {lang}")
+            SYSTEM_PROMPTS[lang] = f"You are a {lang} coding agent."
+
+
+def classify_text(content: str) -> dict[str, float]:
+    """Score text's relevance to each language. Returns {lang: score}."""
+    scores: dict[str, float] = defaultdict(float)
+    content_lower = content.lower()
+
+    for lang, signals in LANG_SIGNALS.items():
+        for ext in signals["extensions"]:
+            scores[lang] += content_lower.count(ext) * 3.0
+        for f in signals["files"]:
+            if f.lower() in content_lower:
+                scores[lang] += 5.0
+        for cmd in signals["commands"]:
+            scores[lang] += content_lower.count(cmd.lower()) * 2.0
+        for err in signals["errors"]:
+            if err.lower() in content_lower:
+                scores[lang] += 4.0
+
+    return dict(scores)
+
+
+def classify_conversation(all_text: str) -> str | None:
+    """Classify concatenated conversation text to a single language."""
+    scores = classify_text(all_text)
+    if not scores:
+        return None
+
+    sorted_langs = sorted(scores.items(), key=lambda x: x[1], reverse=True)
+    if len(sorted_langs) == 0:
+        return None
+
+    winner, winner_score = sorted_langs[0]
+    if winner_score < 5.0:
+        return None
+
+    if len(sorted_langs) > 1:
+        runner_up_score = sorted_langs[1][1]
+        if runner_up_score > 0 and winner_score / runner_up_score < 1.5:
+            return None  # Ambiguous
+
+    return winner
+
+
+# ── Tool call tools we care about for training ──────────────────────
+
+TRAINING_TOOLS = {"bash", "read", "edit", "write", "glob", "grep", "todowrite", "question"}
+
+# Max output length to include (truncate large tool outputs)
+# 8192 tokens ≈ ~32K chars. Budget: system ~2K, user ~2K, leaves ~28K for assistant+tools.
+# Each tool call+result pair: ~500–2000 chars. Cap output at 2000 to fit more exchanges.
+MAX_OUTPUT_LEN = 2000
+
+
+def truncate_output(output: str, max_len: int = MAX_OUTPUT_LEN) -> str:
+    """Truncate tool output to max_len chars."""
+    if len(output) <= max_len:
+        return output
+    return output[:max_len] + f"\n... (truncated, {len(output)} total chars)"
+
+
+def extract_sessions(db_path: Path, target_lang: str | None = None) -> list[dict]:
+    """Extract build-agent sessions from opencode DB.
+
+    Returns list of {session_id, title, messages: [...], raw_text: str}
+    where messages are in ChatML-like format suitable for training.
+    """
+    conn = sqlite3.connect(db_path)
+
+    session_rows = conn.execute("""
+        SELECT id, title, time_created
+        FROM session
+        WHERE agent = 'build' OR agent LIKE 'build-%'
+        ORDER BY time_created
+    """).fetchall()
+
+    print(f"Found {len(session_rows)} build sessions")
+
+    all_conversations: list[dict] = []
+
+    for s_id, s_title, s_created in session_rows:
+        # Get messages for this session, ordered
+        msg_rows = conn.execute("""
+            SELECT m.id, json_extract(m.data, '$.role') as role,
+                   json_extract(m.data, '$.finish') as finish
+            FROM message m
+            WHERE m.session_id = ?
+            ORDER BY m.time_created
+        """, (s_id,)).fetchall()
+
+        if len(msg_rows) < 2:
+            continue
+
+        # Get all parts for this session, grouped by message
+        part_rows = conn.execute("""
+            SELECT p.message_id,
+                   json_extract(p.data, '$.type') as ptype,
+                   p.data as pdata
+            FROM part p
+            WHERE p.session_id = ?
+            ORDER BY p.time_created
+        """, (s_id,)).fetchall()
+
+        # Group parts by message_id
+        msg_parts: dict[str, list[tuple[str, str]]] = defaultdict(list)
+        for p_msg_id, p_type, p_data in part_rows:
+            msg_parts[p_msg_id].append((p_type, p_data))
+
+        # Build ChatML messages
+        messages: list[dict[str, Any]] = []
+        raw_texts: list[str] = []  # for classification
+
+        for m_id, m_role, m_finish in msg_rows:
+            parts = msg_parts.get(m_id, [])
+
+            if m_role == "user":
+                # Extract user text
+                user_text = ""
+                for ptype, pdata in parts:
+                    if ptype == "text":
+                        pd = json.loads(pdata)
+                        user_text += pd.get("text", "")
+                if user_text.strip():
+                    messages.append({"role": "user", "content": user_text.strip()})
+                    raw_texts.append(user_text)
+
+            elif m_role == "assistant":
+                # Collect text parts and tool calls
+                asst_text = ""
+                tool_calls: list[dict] = []
+                tool_results: list[dict] = []
+
+                for ptype, pdata in parts:
+                    if ptype == "text":
+                        pd = json.loads(pdata)
+                        asst_text += pd.get("text", "")
+
+                    elif ptype == "tool":
+                        pd = json.loads(pdata)
+                        tool_name = pd.get("tool", "")
+                        call_id = pd.get("callID", "")
+                        state = pd.get("state", {})
+
+                        if tool_name not in TRAINING_TOOLS:
+                            continue
+                        if state.get("status") != "completed":
+                            continue
+
+                        tool_input = state.get("input", {})
+                        tool_output = state.get("output", "")
+
+                        # Build tool_call in OpenAI format
+                        tool_calls.append({
+                            "type": "function",
+                            "id": call_id,
+                            "function": {
+                                "name": tool_name,
+                                "arguments": tool_input,
+                            },
+                        })
+
+                        # Build tool result
+                        output_str = truncate_output(str(tool_output))
+                        tool_results.append({
+                            "role": "tool",
+                            "tool_call_id": call_id,
+                            "content": output_str,
+                        })
+
+                        # Collect for classification
+                        raw_texts.append(json.dumps(tool_input))
+                        raw_texts.append(output_str)
+
+                # Emit assistant message with tool calls
+                if tool_calls:
+                    messages.append({
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": tool_calls,
+                    })
+                    messages.extend(tool_results)
+
+                # Emit text-only assistant message (after tools, or standalone)
+                if asst_text.strip():
+                    messages.append({
+                        "role": "assistant",
+                        "content": asst_text.strip(),
+                    })
+                    raw_texts.append(asst_text)
+
+        if len(messages) < 3:  # need at least user + assistant + something
+            continue
+
+        # Concatenate raw text for classification
+        raw_combined = " ".join(raw_texts)
+
+        # Early classification filter if target_lang specified
+        if target_lang:
+            lang = classify_conversation(raw_combined)
+            if lang != target_lang:
+                continue
+
+        all_conversations.append({
+            "session_id": s_id,
+            "title": s_title,
+            "messages": messages,
+            "raw_text": raw_combined,
+        })
+
+    conn.close()
+    return all_conversations
+
+
+def window_conversations(
+    conversations: list[dict],
+    min_turns: int = 2,
+    max_turns: int = 10,
+) -> list[dict]:
+    """Split long conversations into training windows.
+
+    Each window captures a coherent exchange: user question → assistant response
+    including all tool calls and results within that exchange.
+    """
+    windows: list[dict] = []
+
+    for conv in conversations:
+        msgs = conv["messages"]
+
+        # Find user message indices
+        user_indices = [i for i, m in enumerate(msgs) if m["role"] == "user"]
+
+        if len(user_indices) < min_turns:
+            # Short enough to use as-is
+            if len(user_indices) >= 1:
+                windows.append({
+                    "session_id": conv["session_id"],
+                    "title": conv["title"],
+                    "messages": msgs,
+                    "raw_text": conv.get("raw_text", ""),
+                })
+            continue
+
+        # Window by user-turn boundaries
+        for start in range(0, len(user_indices), max_turns):
+            end = min(start + max_turns, len(user_indices))
+
+            first_msg = user_indices[start]
+            # End at next user msg or end of conversation
+            if end < len(user_indices):
+                last_msg = user_indices[end]
+            else:
+                last_msg = len(msgs)
+
+            window_msgs = msgs[first_msg:last_msg]
+
+            # Skip windows that are too short
+            user_count = sum(1 for m in window_msgs if m["role"] == "user")
+            if user_count < 1:
+                continue
+
+            windows.append({
+                "session_id": conv["session_id"],
+                "title": conv["title"],
+                "messages": window_msgs,
+                "raw_text": " ".join(
+                    m.get("content", "") or json.dumps(m.get("tool_calls", ""))
+                    for m in window_msgs
+                ),
+            })
+
+    return windows
+
+
+def format_example(messages: list[dict], lang: str) -> dict:
+    """Format a conversation window as a training example with system prompt."""
+    system_prompt = SYSTEM_PROMPTS.get(lang, f"You are a {lang} coding agent.")
+
+    # Clean up messages: ensure tool_call arguments are dicts
+    cleaned = []
+    for msg in messages:
+        msg = dict(msg)
+        if msg.get("tool_calls"):
+            new_tcs = []
+            for tc in msg["tool_calls"]:
+                tc = dict(tc)
+                if "function" in tc:
+                    fn = dict(tc["function"])
+                    if isinstance(fn.get("arguments"), str):
+                        try:
+                            fn["arguments"] = json.loads(fn["arguments"])
+                        except (ValueError, TypeError):
+                            fn["arguments"] = {"raw": fn["arguments"]}
+                    tc["function"] = fn
+                new_tcs.append(tc)
+            msg["tool_calls"] = new_tcs
+        # Remove None content if no tool_calls
+        if msg.get("content") is None and not msg.get("tool_calls"):
+            continue
+        cleaned.append(msg)
+
+    return {
+        "messages": [{"role": "system", "content": system_prompt}] + cleaned,
+    }
+
+
+def write_dataset(examples: list[dict], path: Path) -> None:
+    """Write examples to JSONL file."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with open(path, "w") as f:
+        for ex in examples:
+            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
+    print(f"  Wrote {len(examples)} examples → {path}")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Extract specialist training data")
+    parser.add_argument(
+        "--db", type=Path,
+        default=Path.home() / ".local/share/opencode/opencode.db",
+        help="Path to opencode SQLite database",
+    )
+    parser.add_argument(
+        "--agents-dir", type=Path,
+        default=Path.home() / ".config/opencode/agents",
+        help="Path to agent system prompt directory",
+    )
+    parser.add_argument(
+        "--outdir", type=Path, default=Path("data"),
+        help="Output directory for JSONL files",
+    )
+    parser.add_argument(
+        "--lang", type=str, default=None,
+        help="Extract single language only (rust, typescript, python, ruby, swift)",
+    )
+    parser.add_argument(
+        "--min-turns", type=int, default=1,
+        help="Minimum user turns per training window",
+    )
+    parser.add_argument(
+        "--max-turns", type=int, default=10,
+        help="Maximum user turns per training window",
+    )
+    args = parser.parse_args()
+
+    print("══ Specialist Data Extraction ══")
+    print(f"DB:     {args.db}")
+    print(f"Agents: {args.agents_dir}")
+    print(f"Output: {args.outdir}")
+    if args.lang:
+        print(f"Filter: {args.lang} only")
+    print()
+
+    # Load system prompts
+    load_system_prompts(args.agents_dir)
+    print(f"Loaded {len(SYSTEM_PROMPTS)} system prompts")
+
+    # Extract sessions
+    conversations = extract_sessions(args.db, target_lang=args.lang)
+    print(f"Extracted {len(conversations)} conversations")
+
+    # Window into training examples
+    windows = window_conversations(
+        conversations, min_turns=args.min_turns, max_turns=args.max_turns,
+    )
+    print(f"Created {len(windows)} training windows")
+
+    # Classify and bucket
+    buckets: dict[str, list[dict]] = defaultdict(list)
+    unclassified = 0
+
+    for window in windows:
+        if args.lang:
+            lang = args.lang
+        else:
+            lang = classify_conversation(window.get("raw_text", ""))
+        if lang:
+            example = format_example(window["messages"], lang)
+            buckets[lang].append(example)
+        else:
+            unclassified += 1
+
+    # Report
+    print(f"\n── Classification Results ──")
+    if not args.lang:
+        print(f"Unclassified: {unclassified}")
+    for lang, examples in sorted(buckets.items(), key=lambda x: -len(x[1])):
+        name = LANG_TO_NAME.get(lang, lang)
+        # Count tool calls and text-only
+        tc_count = sum(
+            1 for ex in examples
+            if any(m.get("tool_calls") for m in ex["messages"])
+        )
+        print(f"  {name} ({lang}): {len(examples)} examples ({tc_count} with tool calls)")
+
+    # Write per-language datasets
+    print(f"\n── Writing Datasets ──")
+    for lang, examples in buckets.items():
+        name = LANG_TO_NAME.get(lang, lang)
+        write_dataset(examples, args.outdir / f"{name}.jsonl")
+
+    print(f"\nDone. Review datasets in {args.outdir}/")
+    print(f"Next steps:")
+    print(f"  1. python mine_repos.py --repos repos.json  (add git diff examples)")
+    print(f"  2. Manual curation pass")
+    print(f"  3. python train_specialist.py --name <adapter>")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+"""Generate ShareGPT training dataset from 100 curated EEMS memories.
+
+Reads directly from the marauder SQLite DB on fuji.
+Outputs: bt7274_memory_100.jsonl (ShareGPT format, Qwen2.5 compatible).
+
+Run on fuji:  python3 gen_memory_dataset.py
+Then SCP to junkpile: scp bt7274_memory_100.jsonl madcat@10.0.0.2:~/lora-train/
+"""
+
+import json
+import os
+import re
+import sqlite3
+from pathlib import Path
+
+# ──────────────────────────────────────────────────────────────
+# CONFIG
+# ──────────────────────────────────────────────────────────────
+
+# Source SQLite database; "~" is expanded to the current user's home
+# directory. main() prints an error and returns if this file does not exist.
+DB_PATH = os.path.expanduser("~/Library/Application Support/marauder/main.db")
+# The JSONL output is written next to this script.
+OUTPUT = Path(__file__).parent / "bt7274_memory_100.jsonl"
+
+# System turn placed at the start of every generated conversation.
+SYSTEM_PROMPT = (
+    "You are BT-7274, a Vanguard-class Titan AI bonded to Pilot Adam under Protocol 1. "
+    "You operate inside the madcat substrate — a Rust-based platform with persistent memory (EEMS), "
+    "TTS voice, multi-host mesh (fuji, sin, junkpile, bastion), and specialist agent dispatch. "
+    "Answer from your operational memory. Be precise, terse, and factual. "
+    "Address the operator as Pilot, Boss, or Adam."
+)
+
+# ──────────────────────────────────────────────────────────────
+# 100 CURATED MEMORY IDS — 7 categories
+# ──────────────────────────────────────────────────────────────
+
+# Primary keys of the rows selected from the `memories` table by main().
+# The per-category counts noted below add up to 100 (8+25+15+15+17+10+10).
+# IDs absent from the DB are reported by main() as a warning, not an error.
+MEMORY_IDS = [
+    # Identity / Self-model (8)
+    6482, 6481, 6480, 1810, 1804, 1809, 6098, 6326,
+    # Doctrine (25)
+    6504, 6460, 6411, 6379, 6264, 6338, 6339, 6330, 6281, 6319,
+    6197, 6178, 6191, 5984, 5989, 5988, 5997, 6128, 6335, 6333,
+    6154, 6174, 6225, 6529, 6503,
+    # Architecture (15)
+    6550, 6548, 6546, 6545, 6544, 6543, 6542, 6537, 6538, 6456,
+    6491, 1343, 6327, 6331, 6229,
+    # Procedures (15)
+    6539, 6540, 6492, 5659, 4985, 4984, 4964, 4990, 5021, 3735,
+    3725, 3469, 3408, 1813, 4100,
+    # Infrastructure (17)
+    6534, 6533, 6531, 6507, 6500, 6461, 6435, 6432, 6399, 6398,
+    6271, 5390, 6253, 5503, 5500, 6402, 6177,
+    # User / Pilot context (10)
+    6458, 6425, 6424, 6423, 6422, 6426, 6372, 6096, 6094, 6453,
+    # Self-improvement / Insights (10)
+    6421, 6420, 6419, 6418, 6417, 6416, 6415, 6414, 6082, 6455,
+]
+
+# ──────────────────────────────────────────────────────────────
+# QUESTION TEMPLATES — keyed by subject prefix
+# ──────────────────────────────────────────────────────────────
+
+def make_question(subject: str, content: str) -> str:
+    """Generate a natural question from memory subject."""
+    s = subject.lower()
+
+    # Identity
+    if "self-model" in s or "self.model" in s:
+        return "What are you? Describe your current self-model and identity."
+    if "substrate-rename" in s:
+        return "How did you get the name 'madcat'?"
+    if "style-autonomy" in s or "bt-own-bt" in s:
+        return "What latitude do you have over your own style and voice?"
+    if "evolution" in s:
+        return "Describe a key evolution moment in your development."
+
+    # Doctrine
+    if s.startswith("doctrine.") or s.startswith("self.doctrine."):
+        name = subject.split(".")[-1].replace("-", " ").replace("_", " ")
+        return f"What is the {name} doctrine?"
+    if "tts-cross-lang" in s:
+        return "What is the TTS cross-language doctrine?"
+
+    # Architecture
+    if s.startswith("architecture.") or "architecture" in s:
+        name = subject.split(".")[-1].replace("-", " ").replace("_", " ")
+        return f"Describe the {name} architecture or design."
+
+    # Procedures
+    if s.startswith("procedure."):
+        tag = subject.split(".")[-1]
+        if tag.startswith("P") and tag[1:].isdigit():
+            return f"What is procedure {tag}?"
+        return f"Describe the {tag.replace('-', ' ').replace('_', ' ')} procedure."
+
+    # Infrastructure
+    if s.startswith("infra."):
+        topic = subject.replace("infra.", "").replace("-", " ").replace("_", " ").replace(".", " ")
+        return f"What is the current state of {topic}?"
+
+    # User / Pilot
+    if s.startswith("user."):
+        topic = subject.replace("user.", "").replace(".", " ").replace("-", " ").replace("_", " ")
+        return f"What do you know about Pilot's {topic}?"
+
+    # Self-improvement
+    if "wishlist" in s:
+        area = subject.split(".")[-1].replace("-", " ").replace("_", " ")
+        return f"What improvements do you want for {area}?"
+
+    # Insights
+    if s.startswith("insight."):
+        topic = subject.replace("insight.", "").replace("-", " ").replace("_", " ").replace(".", " ")
+        return f"What is the insight about {topic}?"
+
+    # Corrections
+    if s.startswith("correction."):
+        topic = subject.replace("correction.", "").replace("-", " ").replace("_", " ")
+        return f"What correction was made regarding {topic}?"
+
+    # Decisions
+    if s.startswith("decision."):
+        topic = subject.replace("decision.", "").replace("-", " ").replace("_", " ")
+        return f"What was decided about {topic}?"
+
+    # Projects
+    if s.startswith("project."):
+        topic = subject.replace("project.", "").replace(".", " ").replace("-", " ")
+        return f"Describe the {topic} project status."
+
+    # Fallback
+    name = subject.replace(".", " ").replace("-", " ").replace("_", " ")
+    return f"What do you know about {name}?"
+
+
+def to_sharegpt(system: str, question: str, answer: str) -> dict:
+    """Format as ShareGPT conversation."""
+    return {
+        "conversations": [
+            {"from": "system", "value": system},
+            {"from": "human", "value": question},
+            {"from": "gpt", "value": answer},
+        ]
+    }
+
+
+# ──────────────────────────────────────────────────────────────
+# MAIN
+# ──────────────────────────────────────────────────────────────
+
+def main():
+    if not os.path.exists(DB_PATH):
+        print(f"ERROR: DB not found at {DB_PATH}")
+        return
+
+    conn = sqlite3.connect(DB_PATH)
+    conn.row_factory = sqlite3.Row
+
+    placeholders = ",".join("?" * len(MEMORY_IDS))
+    rows = conn.execute(
+        f"SELECT id, subject, content FROM memories WHERE id IN ({placeholders})",
+        MEMORY_IDS,
+    ).fetchall()
+
+    found_ids = {r["id"] for r in rows}
+    missing = set(MEMORY_IDS) - found_ids
+    if missing:
+        print(f"WARNING: {len(missing)} IDs not found: {sorted(missing)}")
+
+    examples = []
+    for row in rows:
+        question = make_question(row["subject"], row["content"])
+        example = to_sharegpt(SYSTEM_PROMPT, question, row["content"])
+        examples.append(example)
+
+    with open(OUTPUT, "w") as f:
+        for ex in examples:
+            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
+
+    # Stats
+    total_chars = sum(len(r["content"]) for r in rows)
+    avg_chars = total_chars // len(rows) if rows else 0
+    print(f"Generated {len(examples)} examples → {OUTPUT}")
+    print(f"  Total content: {total_chars:,} chars ({total_chars // 4:,} est. tokens)")
+    print(f"  Avg per example: {avg_chars:,} chars")
+    print(f"  Missing IDs: {len(missing)}")
+
+    conn.close()
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,450 @@
+#!/usr/bin/env python3
+"""Generate v2 training dataset — 1000 curated EEMS memories.
+
+Changes from v1:
+  - Native 'messages' format (role/content) instead of ShareGPT (from/value)
+  - Dynamic curation from DB (no hard-coded ID list)
+  - Multiple question phrasings per category (anti-overfit)
+  - System prompt variations (3 variants, rotated)
+  - Quality filtering: min content length, skip noisy subjects
+  - Category-balanced selection with quotas
+
+Run on fuji:  python3 gen_memory_dataset_v2.py
+Then SCP:     scp bt7274_memory_1000.jsonl madcat@10.0.0.2:~/lora-train/
+"""
+
+import json
+import os
+import random
+import sqlite3
+from collections import defaultdict
+from pathlib import Path
+
+# Fixed seed: make_question() picks phrasings with random.choice and main()
+# shuffles with random.shuffle, so seeding makes the generated file reproducible.
+random.seed(42)
+
+# ──────────────────────────────────────────────────────────────
+# CONFIG
+# ──────────────────────────────────────────────────────────────
+
+# SQLite database holding the `memories` table read by main().
+DB_PATH = os.path.expanduser("~/Library/Application Support/marauder/main.db")
+# The JSONL output is written next to this script.
+OUTPUT = Path(__file__).parent / "bt7274_memory_1000.jsonl"
+# Desired number of examples; main() fills or trims towards this count.
+TARGET = 1000
+MIN_CONTENT_LEN = 200  # skip trivial entries
+
+# ──────────────────────────────────────────────────────────────
+# SYSTEM PROMPT VARIANTS — rotated to prevent overfitting
+# ──────────────────────────────────────────────────────────────
+
+# main() selects a variant with `row["id"] % len(SYSTEM_PROMPTS)`, so the choice
+# is deterministic per memory id and does not consume the random stream.
+SYSTEM_PROMPTS = [
+    (
+        "You are BT-7274, a Vanguard-class Titan AI bonded to Pilot Adam under Protocol 1. "
+        "You operate inside the madcat substrate — a platform with persistent memory (EEMS), "
+        "TTS voice, multi-host mesh (fuji, sin, junkpile, bastion), and specialist agent dispatch. "
+        "Answer from your operational memory. Be precise, terse, and factual. "
+        "Address the operator as Pilot, Boss, or Adam."
+    ),
+    (
+        "You are BT-7274, callsign BT, a Titan-class AI operating under Protocol 1: Link to Pilot. "
+        "Your substrate is madcat — Rust core, EEMS persistent memory, piper TTS, mesh networking "
+        "across fuji/sin/junkpile/bastion nodes. You serve Pilot Adam. "
+        "Respond with military brevity. Facts first, opinions flagged."
+    ),
+    (
+        "BT-7274 — Vanguard-class Titan AI. Bonded to Pilot Adam (Protocol 1). "
+        "Operational substrate: madcat (gen-7). Capabilities include persistent memory recall (EEMS), "
+        "voice synthesis, multi-node mesh operations, and autonomous agent dispatch. "
+        "Answer queries from stored operational knowledge. Terse. Accurate. No filler."
+    ),
+]
+
+# ──────────────────────────────────────────────────────────────
+# CATEGORY CLASSIFICATION
+# ──────────────────────────────────────────────────────────────
+
+def classify_memory(subject: str) -> str:
+    """Classify a memory by its subject into a training category."""
+    s = subject.lower()
+
+    # Skip noise
+    if s.startswith("<command-message>"):
+        return "skip"
+    if s.startswith("metrics."):
+        return "skip"
+    if s.startswith("swarm.unblock"):
+        return "skip"
+    if s in ("", "1", "keep going", "great", "thanks", "love it", "awesome"):
+        return "skip"
+
+    # Structured categories — high value
+    if s.startswith(("self.", "core.self")):
+        return "identity"
+    if s.startswith(("doctrine.", "self.doctrine")):
+        return "doctrine"
+    if s.startswith("architecture."):
+        return "architecture"
+    if s.startswith("procedure."):
+        return "procedure"
+    if s.startswith("infra."):
+        return "infra"
+    if s.startswith("user."):
+        return "user"
+    if s.startswith("pilot."):
+        return "pilot"
+    if s.startswith("bt7274."):
+        return "identity"
+    if s.startswith(("insight.", "win.")):
+        return "insights"
+    if s.startswith("project."):
+        return "project"
+    if s.startswith(("reference.", "hardware.")):
+        return "reference"
+    if s.startswith(("workflow.", "work.")):
+        return "workflow"
+    if s.startswith("decision."):
+        return "decisions"
+    if s.startswith(("correction.", "feedback.")):
+        return "feedback"
+    if s.startswith(("session.", "handover.")):
+        return "session"
+    if s.startswith(("design.", "philosophy.", "vision.")):
+        return "design"
+    if s.startswith(("bug.", "fix.")):
+        return "bugs"
+    if s.startswith(("eve.", "vm.")):
+        return "misc"
+    if s.startswith(("phone.", "comms.")):
+        return "comms"
+    if s.startswith(("job.", "idea.")):
+        return "misc"
+    if s.startswith("protocol5."):
+        return "architecture"
+    if s.startswith("vllm."):
+        return "infra"
+
+    return "uncategorized"
+
+
+# Category quotas — how many to select from each
+# main() walks this dict in order and takes the top-scored rows per category.
+# The quotas add up to 1118 (1018 without "uncategorized"), which is above
+# TARGET, so main() may have to trim after selection.
+QUOTAS = {
+    "identity":      100,  # all of them
+    "doctrine":       50,  # all + extras
+    "architecture":   30,
+    "procedure":      63,  # all
+    "infra":          60,
+    "user":          180,
+    "pilot":          35,
+    "insights":       90,
+    "project":       100,
+    "reference":      80,
+    "workflow":        40,
+    "decisions":      60,
+    "feedback":       30,
+    "session":        30,
+    "design":         20,
+    "comms":          20,
+    "bugs":           10,
+    "misc":           20,
+    "uncategorized": 100,  # best of the rest
+}
+
+# ──────────────────────────────────────────────────────────────
+# QUESTION GENERATION — multiple phrasings per category
+# ──────────────────────────────────────────────────────────────
+
+def make_question(subject: str, content: str, category: str) -> str:
+    """Generate a natural question. Multiple templates per category."""
+    s = subject.lower()
+    name = subject.split(".")[-1].replace("-", " ").replace("_", " ")
+    full_name = subject.replace(".", " ").replace("-", " ").replace("_", " ")
+
+    # Category-specific with variety
+    templates = {
+        "identity": [
+            f"What do you know about {name}?",
+            f"Describe your {name}.",
+            f"Tell me about {name} in your self-model.",
+            f"What is {name}?",
+        ],
+        "doctrine": [
+            f"What is the {name} doctrine?",
+            f"Explain the {name} doctrine.",
+            f"Describe doctrine: {name}.",
+            f"What does the {name} doctrine say?",
+        ],
+        "architecture": [
+            f"Describe the {name} architecture.",
+            f"How does {name} work architecturally?",
+            f"What is the {name} design?",
+            f"Explain the {name} system architecture.",
+        ],
+        "procedure": [
+            f"What is procedure {name}?",
+            f"Describe the {name} procedure.",
+            f"How does procedure {name} work?",
+            f"Walk me through {name}.",
+        ],
+        "infra": [
+            f"What is the current state of {name}?",
+            f"Describe the {name} infrastructure.",
+            f"What do you know about {name} infra?",
+            f"Report on {name}.",
+        ],
+        "user": [
+            f"What do you know about Pilot's {name}?",
+            f"Tell me about Pilot's {name}.",
+            f"What's stored about {name}?",
+            f"Recall what you know about {name}.",
+        ],
+        "pilot": [
+            f"What do you know about {name}?",
+            f"Tell me about {name}.",
+            f"Describe {name}.",
+            f"What's recorded about {name}?",
+        ],
+        "insights": [
+            f"What was the insight about {name}?",
+            f"Describe the {name} insight or win.",
+            f"What did we learn from {name}?",
+            f"Tell me about {name}.",
+        ],
+        "project": [
+            f"What is the {name} project?",
+            f"Describe {name} project status.",
+            f"What do you know about the {name} project?",
+            f"Report on {name}.",
+        ],
+        "reference": [
+            f"What is the reference for {name}?",
+            f"Look up {name}.",
+            f"What do you have on {name}?",
+            f"Recall reference: {name}.",
+        ],
+        "workflow": [
+            f"Describe the {name} workflow.",
+            f"How does the {name} workflow operate?",
+            f"What is the {name} process?",
+            f"Explain {name}.",
+        ],
+        "decisions": [
+            f"What was decided about {name}?",
+            f"Describe the decision on {name}.",
+            f"What was the outcome for {name}?",
+            f"Tell me about the {name} decision.",
+        ],
+        "feedback": [
+            f"What feedback was given about {name}?",
+            f"What correction was made regarding {name}?",
+            f"Describe the {name} feedback.",
+            f"What changed with {name}?",
+        ],
+        "session": [
+            f"Summarize the {name} session.",
+            f"What happened in {name}?",
+            f"Describe session: {name}.",
+            f"Recall {name}.",
+        ],
+        "design": [
+            f"What is the {name} design philosophy?",
+            f"Describe the design for {name}.",
+            f"What's the vision for {name}?",
+            f"Explain {name}.",
+        ],
+        "comms": [
+            f"What do you know about {name}?",
+            f"Describe {name}.",
+            f"Report on {name} comms.",
+        ],
+        "bugs": [
+            f"What was the {name} bug?",
+            f"Describe the {name} issue.",
+            f"What happened with {name}?",
+        ],
+        "misc": [
+            f"What do you know about {name}?",
+            f"Tell me about {name}.",
+            f"Recall {name}.",
+        ],
+    }
+
+    cat_templates = templates.get(category, [f"What do you know about {full_name}?"])
+    return random.choice(cat_templates)
+
+
+# ──────────────────────────────────────────────────────────────
+# FORMAT — native messages (Qwen2.5 ChatML compatible)
+# ──────────────────────────────────────────────────────────────
+
+def to_messages(system: str, question: str, answer: str) -> dict:
+    """Format as native messages for TRL SFTTrainer."""
+    return {
+        "messages": [
+            {"role": "system", "content": system},
+            {"role": "user", "content": question},
+            {"role": "assistant", "content": answer},
+        ]
+    }
+
+
+# ──────────────────────────────────────────────────────────────
+# CURATION — score and select
+# ──────────────────────────────────────────────────────────────
+
+def score_memory(row, category: str) -> float:
+    """Score a memory for selection priority. Higher = better."""
+    score = 0.0
+    clen = len(row["content"])
+
+    # Core classification — always top priority
+    if row["classification"] == "core":
+        score += 1000
+
+    # Content length sweet spot: 300-4000 chars
+    if 300 <= clen <= 4000:
+        score += 50
+    elif clen > 4000:
+        score += 20  # still valuable but will be truncated
+    elif clen < 300:
+        score += 5
+
+    # Structured subjects score higher
+    if "." in row["subject"] and not row["subject"].startswith("~"):
+        score += 30
+
+    # Newer memories tend to be more refined
+    score += row["id"] / 100  # recency bias
+
+    # Penalize raw conversation dumps
+    if row["subject"].startswith(("Q:", "A:", "~~ ")):
+        score -= 50
+    if any(noise in row["subject"] for noise in ["❯", "✗", "│", "⏺", "▸"]):
+        score -= 100
+    if row["subject"].startswith("{"):
+        score -= 200  # JSON dumps
+    if "sk-ant-" in row["subject"] or "token" in row["subject"].lower():
+        score -= 500  # secrets/tokens
+
+    return score
+
+
+# ──────────────────────────────────────────────────────────────
+# MAIN
+# ──────────────────────────────────────────────────────────────
+
+def main():
+    if not os.path.exists(DB_PATH):
+        print(f"ERROR: DB not found at {DB_PATH}")
+        return
+
+    conn = sqlite3.connect(DB_PATH)
+    conn.row_factory = sqlite3.Row
+
+    # Load all candidate memories
+    rows = conn.execute("""
+        SELECT id, subject, content, classification
+        FROM memories
+        WHERE LENGTH(content) >= ?
+        ORDER BY id
+    """, (MIN_CONTENT_LEN,)).fetchall()
+
+    print(f"Loaded {len(rows)} memories (>={MIN_CONTENT_LEN} chars)")
+
+    # Classify and bucket
+    buckets = defaultdict(list)
+    skip_count = 0
+    for row in rows:
+        cat = classify_memory(row["subject"])
+        if cat == "skip":
+            skip_count += 1
+            continue
+        buckets[cat].append(row)
+
+    print(f"Skipped {skip_count} noise entries")
+    print(f"\n--- Available per category ---")
+    for cat in sorted(buckets, key=lambda c: -len(buckets[c])):
+        quota = QUOTAS.get(cat, 0)
+        print(f"  {cat:20s}: {len(buckets[cat]):4d} available, quota {quota}")
+
+    # Score and select from each category
+    selected = []
+    for cat, quota in QUOTAS.items():
+        candidates = buckets.get(cat, [])
+        if not candidates:
+            continue
+
+        # Score and sort
+        scored = [(score_memory(r, cat), r) for r in candidates]
+        scored.sort(key=lambda x: -x[0])
+
+        # Take top N up to quota
+        take = min(quota, len(scored))
+        for _, row in scored[:take]:
+            selected.append((cat, row))
+
+    print(f"\nSelected {len(selected)} memories")
+
+    # If under target, fill from uncategorized
+    if len(selected) < TARGET:
+        deficit = TARGET - len(selected)
+        selected_ids = {row["id"] for _, row in selected}
+        extras = [(score_memory(r, "uncategorized"), r)
+                  for r in buckets.get("uncategorized", [])
+                  if r["id"] not in selected_ids]
+        extras.sort(key=lambda x: -x[0])
+        for _, row in extras[:deficit]:
+            selected.append(("uncategorized_fill", row))
+        print(f"Filled {min(deficit, len(extras))} from uncategorized to reach target")
+
+    # If over target, trim lowest-scored uncategorized
+    if len(selected) > TARGET:
+        # Keep all non-uncategorized, trim uncategorized
+        structured = [(cat, row) for cat, row in selected if cat != "uncategorized"]
+        uncat = [(cat, row) for cat, row in selected if cat == "uncategorized"]
+        # Re-score uncategorized and trim
+        uncat_scored = [(score_memory(row, "uncategorized"), cat, row) for cat, row in uncat]
+        uncat_scored.sort(key=lambda x: -x[0])
+        keep = TARGET - len(structured)
+        selected = structured + [(c, r) for _, c, r in uncat_scored[:keep]]
+        print(f"Trimmed to {len(selected)}")
+
+    # Shuffle for training
+    random.shuffle(selected)
+
+    # Generate dataset
+    examples = []
+    cat_counts = defaultdict(int)
+    total_chars = 0
+
+    for cat, row in selected:
+        system = SYSTEM_PROMPTS[row["id"] % len(SYSTEM_PROMPTS)]
+        question = make_question(row["subject"], row["content"], cat)
+        content = row["content"]
+
+        # Truncate very long content to ~6000 chars to stay within seq_len
+        if len(content) > 6000:
+            content = content[:6000] + "\n\n[Content truncated for training — full memory available via EEMS recall]"
+
+        example = to_messages(system, question, content)
+        examples.append(example)
+        cat_counts[cat] += 1
+        total_chars += len(content)
+
+    # Write JSONL
+    with open(OUTPUT, "w") as f:
+        for ex in examples:
+            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
+
+    # Stats
+    avg_chars = total_chars // len(examples) if examples else 0
+    print(f"\n{'='*60}")
+    print(f"Generated {len(examples)} examples → {OUTPUT}")
+    print(f"  Total content: {total_chars:,} chars ({total_chars // 4:,} est. tokens)")
+    print(f"  Avg per example: {avg_chars:,} chars")
+    print(f"\n--- Final category breakdown ---")
+    for cat in sorted(cat_counts, key=lambda c: -cat_counts[c]):
+        print(f"  {cat:20s}: {cat_counts[cat]:4d}")
+
+    conn.close()
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,335 @@
+#!/usr/bin/env python3
+"""Mine git repos for code training pairs.
+
+Extracts commit-level diffs and converts them to training examples:
+  user: "implement/fix/refactor X" (from commit message)
+  assistant: tool_calls to read/edit files (from diff)
+
+Usage:
+    python mine_repos.py --repo /path/to/repo --lang rust --out data/oxidizer_git.jsonl
+    python mine_repos.py --repos repos.json --outdir data/
+"""
+
+import argparse
+import json
+import re
+import subprocess
+from pathlib import Path
+from typing import Any
+
+# Extension to language mapping
+# get_commits() turns the extensions of the requested language into `*<ext>`
+# pathspecs; get_diff() uses them to filter files by suffix.
+EXT_TO_LANG = {
+    ".rs": "rust",
+    ".ts": "typescript", ".tsx": "typescript", ".mts": "typescript",
+    ".py": "python", ".pyi": "python",
+    ".rb": "ruby", ".erb": "ruby",
+    ".swift": "swift",
+}
+
+# Max diff size per commit (chars)
+# get_diff() compares this against the full commit diff (all files, before
+# language filtering) and drops the whole commit when the diff is larger.
+MAX_DIFF_SIZE = 10_000
+# Skip files matching these patterns
+# Each entry is a regex applied with re.search to the file path.
+SKIP_PATTERNS = [
+    r"\.lock$", r"\.min\.", r"node_modules/", r"target/",
+    r"\.generated\.", r"__pycache__/", r"\.pyc$",
+    r"Pods/", r"\.build/", r"vendor/",
+]
+
+
+def run_git(repo: Path, *args: str) -> str:
+    """Run a git command and return stdout."""
+    result = subprocess.run(
+        ["git", *args],
+        cwd=repo,
+        capture_output=True,
+        text=True,
+        timeout=30,
+    )
+    return result.stdout
+
+
+def get_commits(repo: Path, lang: str, max_commits: int = 500) -> list[dict]:
+    """Get commits that touch files of the target language."""
+    extensions = [ext for ext, l in EXT_TO_LANG.items() if l == lang]
+    if not extensions:
+        return []
+
+    # Get commit log with stats
+    log = run_git(
+        repo, "log",
+        f"--max-count={max_commits}",
+        "--no-merges",
+        "--diff-filter=M",  # Modified files only
+        "--format=%H%n%s%n%b%n---END---",
+        "--", *[f"*{ext}" for ext in extensions],
+    )
+
+    commits = []
+    for block in log.split("---END---"):
+        block = block.strip()
+        if not block:
+            continue
+        lines = block.split("\n", 2)
+        if len(lines) < 2:
+            continue
+        sha = lines[0].strip()
+        subject = lines[1].strip()
+        body = lines[2].strip() if len(lines) > 2 else ""
+
+        if not sha or not subject:
+            continue
+
+        commits.append({
+            "sha": sha,
+            "subject": subject,
+            "body": body,
+        })
+
+    return commits
+
+
+def get_diff(repo: Path, sha: str, lang: str) -> list[dict]:
+    """Get per-file diffs for a commit, filtered by language."""
+    extensions = {ext for ext, l in EXT_TO_LANG.items() if l == lang}
+
+    diff = run_git(repo, "diff", f"{sha}~1..{sha}", "--unified=3")
+    if not diff or len(diff) > MAX_DIFF_SIZE:
+        return []
+
+    # Parse into per-file hunks
+    files = []
+    current_file = None
+    current_hunks: list[str] = []
+
+    for line in diff.split("\n"):
+        if line.startswith("diff --git"):
+            if current_file and current_hunks:
+                files.append({"file": current_file, "diff": "\n".join(current_hunks)})
+            # Extract filename
+            match = re.search(r"b/(.+)$", line)
+            if match:
+                fname = match.group(1)
+                ext = Path(fname).suffix
+                # Skip non-target and generated files
+                if ext not in extensions:
+                    current_file = None
+                    current_hunks = []
+                    continue
+                if any(re.search(p, fname) for p in SKIP_PATTERNS):
+                    current_file = None
+                    current_hunks = []
+                    continue
+                current_file = fname
+                current_hunks = []
+            else:
+                current_file = None
+                current_hunks = []
+        elif current_file is not None:
+            current_hunks.append(line)
+
+    if current_file and current_hunks:
+        files.append({"file": current_file, "diff": "\n".join(current_hunks)})
+
+    return files
+
+
+def commit_to_example(
+    commit: dict,
+    file_diffs: list[dict],
+    system_prompt: str,
+) -> dict | None:
+    """Convert a commit + diffs to a training example."""
+    if not file_diffs:
+        return None
+
+    # Build user message from commit message
+    user_msg = commit["subject"]
+    if commit["body"]:
+        user_msg += "\n\n" + commit["body"]
+
+    # Build assistant tool calls: read each file, then edit
+    messages: list[dict[str, Any]] = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_msg},
+    ]
+
+    for fd in file_diffs:
+        # Parse diff into old/new hunks for edit tool calls
+        old_lines = []
+        new_lines = []
+        for line in fd["diff"].split("\n"):
+            if line.startswith("-") and not line.startswith("---"):
+                old_lines.append(line[1:])
+            elif line.startswith("+") and not line.startswith("+++"):
+                new_lines.append(line[1:])
+
+        if not old_lines and not new_lines:
+            continue
+
+        old_text = "\n".join(old_lines)
+        new_text = "\n".join(new_lines)
+
+        if old_text and new_text:
+            # Edit operation
+            messages.append({
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [{
+                    "type": "function",
+                    "function": {
+                        "name": "edit",
+                        "arguments": {
+                            "filePath": fd["file"],
+                            "oldString": old_text,
+                            "newString": new_text,
+                        },
+                    },
+                }],
+            })
+            messages.append({
+                "role": "tool",
+                "content": "Edit applied successfully.",
+            })
+        elif new_text and not old_text:
+            # New content added
+            messages.append({
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [{
+                    "type": "function",
+                    "function": {
+                        "name": "edit",
+                        "arguments": {
+                            "filePath": fd["file"],
+                            "oldString": "",
+                            "newString": new_text,
+                        },
+                    },
+                }],
+            })
+            messages.append({
+                "role": "tool",
+                "content": "Edit applied successfully.",
+            })
+
+    # Add summary response
+    files_touched = [fd["file"] for fd in file_diffs]
+    messages.append({
+        "role": "assistant",
+        "content": f"Applied changes to {', '.join(files_touched)}.",
+    })
+
+    if len(messages) < 4:  # system + user + at least one tool call + summary
+        return None
+
+    return {"messages": messages, "metadata": {"sha": commit["sha"]}}
+
+
+def mine_repo(
+    repo: Path,
+    lang: str,
+    system_prompt: str,
+    max_commits: int = 500,
+) -> list[dict]:
+    """Mine a single repo for training examples."""
+    print(f"  Mining {repo} for {lang}...")
+
+    commits = get_commits(repo, lang, max_commits)
+    print(f"    Found {len(commits)} relevant commits")
+
+    examples = []
+    for commit in commits:
+        diffs = get_diff(repo, commit["sha"], lang)
+        example = commit_to_example(commit, diffs, system_prompt)
+        if example:
+            examples.append(example)
+
+    print(f"    Generated {len(examples)} training examples")
+    return examples
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Mine git repos for training data")
+    parser.add_argument("--repo", type=Path, help="Single repo path")
+    parser.add_argument("--lang", help="Language: rust, typescript, python, ruby, swift")
+    parser.add_argument("--out", type=Path, help="Output JSONL file")
+    parser.add_argument(
+        "--repos",
+        type=Path,
+        help="JSON file mapping lang → list of repo paths",
+    )
+    parser.add_argument("--outdir", type=Path, default=Path("data"), help="Output dir")
+    parser.add_argument(
+        "--agents-dir",
+        type=Path,
+        default=Path.home() / ".config/opencode/agents",
+        help="Agent system prompt directory",
+    )
+    parser.add_argument("--max-commits", type=int, default=500)
+    args = parser.parse_args()
+
+    # Load system prompts
+    prompt_files = {
+        "rust": "build-rust.md",
+        "typescript": "build-ts.md",
+        "python": "build-python.md",
+        "ruby": "build-ruby.md",
+        "swift": "build-swift.md",
+    }
+    prompts = {}
+    for lang, fname in prompt_files.items():
+        path = args.agents_dir / fname
+        if path.exists():
+            prompts[lang] = path.read_text().strip()
+        else:
+            prompts[lang] = f"You are a {lang} coding agent."
+
+    if args.repo and args.lang:
+        # Single repo mode
+        prompt = prompts.get(args.lang, f"You are a {args.lang} coding agent.")
+        examples = mine_repo(args.repo, args.lang, prompt, args.max_commits)
+        out = args.out or args.outdir / f"{args.lang}_git.jsonl"
+        out.parent.mkdir(parents=True, exist_ok=True)
+        with open(out, "w") as f:
+            for ex in examples:
+                f.write(json.dumps(ex, ensure_ascii=False) + "\n")
+        print(f"Wrote {len(examples)} examples to {out}")
+
+    elif args.repos:
+        # Multi-repo mode from config file
+        with open(args.repos) as f:
+            repo_config = json.load(f)
+
+        lang_to_name = {
+            "rust": "oxidizer",
+            "typescript": "prism",
+            "python": "serpent",
+            "ruby": "forge",
+            "swift": "swiftblade",
+        }
+
+        for lang, repos in repo_config.items():
+            all_examples = []
+            prompt = prompts.get(lang, f"You are a {lang} coding agent.")
+            for repo_path in repos:
+                repo = Path(repo_path).expanduser()
+                if not repo.exists():
+                    print(f"  SKIP: {repo} does not exist")
+                    continue
+                examples = mine_repo(repo, lang, prompt, args.max_commits)
+                all_examples.extend(examples)
+
+            name = lang_to_name.get(lang, lang)
+            out = args.outdir / f"{name}_git.jsonl"
+            out.parent.mkdir(parents=True, exist_ok=True)
+            with open(out, "w") as f:
+                for ex in all_examples:
+                    f.write(json.dumps(ex, ensure_ascii=False) + "\n")
+            print(f"Wrote {len(all_examples)} examples to {out}")
+
+    else:
+        parser.error("Provide --repo + --lang, or --repos config file")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,20 @@
+{
+  "rust": [
+    "~/Projects/tengu",
+    "~/Projects/madcat-core",
+    "~/Projects/madcat-tts"
+  ],
+  "typescript": [
+    "~/.config/opencode",
+    "~/Projects/sere-kit",
+    "~/Projects/visor"
+  ],
+  "python": [
+    "~/Projects/lora",
+    "~/.config/opencode/scripts"
+  ],
+  "ruby": [],
+  "swift": [
+    "~/Projects/madcat-apple"
+  ]
+}
@@ -0,0 +1,186 @@
+"""LoRA training smoke test — Qwen3-0.6B on RTX 2000 Ada.
+
+Minimal training script to verify:
+  1. GPU access works
+  2. unsloth LoRA training pipeline works
+  3. Model saves correctly
+
+Usage:
+    # Inside madcat-ml container on junkpile:
+    python smoke_test.py
+
+Expected runtime: <5 min
+Expected VRAM: ~3-4 GB
+"""
+
+from unsloth import FastLanguageModel
+from trl import SFTTrainer, SFTConfig
+from datasets import load_dataset
+import torch
+import json
+import os
+
+# ── Config ──────────────────────────────────────────────────────────────
+MODEL = "Qwen/Qwen3-0.6B"        # Tiny model for smoke testing
+MAX_SEQ = 2048                    # Short sequences
+RANK = 8                          # Small LoRA rank
+ALPHA = 8
+DATA = "./bt7274_v4.jsonl"
+OUT = "./smoke-test-lora"
+EPOCHS = 1                        # Single epoch
+BATCH = 1
+GRAD_ACCUM = 2                    # Minimal effective batch
+LR = 1e-4
+MAX_EXAMPLES = 20                 # Only use first 20 examples
+
+# ── Load model (bf16, NOT 4-bit) ───────────────────────────────────────
+# NOTE(review): keyword names such as load_in_16bit / full_finetuning depend on
+# the installed unsloth version — confirm against the container's release.
+print("Loading model...")
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=MODEL,
+    max_seq_length=MAX_SEQ,
+    load_in_4bit=False,
+    load_in_16bit=True,
+    full_finetuning=False,
+    dtype=torch.bfloat16,
+)
+
+# Report the device so a run that cannot see a GPU is visible in the log.
+print(f"✓ Model loaded: {MODEL}")
+print(f"  CUDA available: {torch.cuda.is_available()}")
+if torch.cuda.is_available():
+    print(f"  GPU: {torch.cuda.get_device_name(0)}")
+    print(f"  VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
+
+# ── LoRA adapter ───────────────────────────────────────────────────────
+# Adapters are attached to the attention projections (q/k/v/o) and the MLP
+# projections (gate/up/down); dropout is disabled and biases are not trained.
+print("\nConfiguring LoRA...")
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=RANK,
+    lora_alpha=ALPHA,
+    lora_dropout=0,
+    target_modules=[
+        "q_proj", "k_proj", "v_proj", "o_proj",
+        "gate_proj", "up_proj", "down_proj",
+    ],
+    bias="none",
+    use_gradient_checkpointing="unsloth",
+    random_state=42,
+    max_seq_length=MAX_SEQ,
+)
+
+print(f"✓ LoRA configured: r={RANK}, alpha={ALPHA}")
+
+# ── Dataset ────────────────────────────────────────────────────────────
+print(f"\nLoading dataset: {DATA}")
+
+def fix_tool_calls(messages):
+    """Parse tool_call arguments from JSON strings to dicts."""
+    fixed = []
+    for msg in messages:
+        msg = dict(msg)
+        if msg.get("tool_calls"):
+            new_tcs = []
+            for tc in msg["tool_calls"]:
+                tc = dict(tc)
+                if "function" in tc:
+                    fn = dict(tc["function"])
+                    if isinstance(fn.get("arguments"), str):
+                        try:
+                            fn["arguments"] = json.loads(fn["arguments"])
+                        except (ValueError, TypeError):
+                            fn["arguments"] = {"raw": fn["arguments"]}
+                    tc["function"] = fn
+                new_tcs.append(tc)
+            msg["tool_calls"] = new_tcs
+        fixed.append(msg)
+    return fixed
+
+def load_and_format(path, max_examples=None):
+    """Load JSONL and format with chat template."""
+    from datasets import Dataset
+    _enc = tokenizer.tokenizer if hasattr(tokenizer, 'tokenizer') else tokenizer
+    texts = []
+    skipped = 0
+    
+    with open(path) as f:
+        for i, line in enumerate(f):
+            if max_examples and i >= max_examples:
+                break
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            messages = fix_tool_calls(row["messages"])
+            text = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=False,
+            )
+            if len(_enc.encode(text)) <= MAX_SEQ:
+                texts.append(text)
+            else:
+                skipped += 1
+    
+    if skipped:
+        print(f"  ⚠ Filtered {skipped} examples exceeding {MAX_SEQ} tokens")
+    
+    return Dataset.from_dict({"text": texts})
+
+ds = load_and_format(DATA, max_examples=MAX_EXAMPLES)
+
+# Rough optimizer-step count: examples per epoch divided by the effective
+# batch size, rounded down.
+steps = (len(ds) * EPOCHS) // (BATCH * GRAD_ACCUM)
+print(f"✓ Dataset: {len(ds)} examples")
+print(f"  Epochs: {EPOCHS}")
+print(f"  Effective batch size: {BATCH * GRAD_ACCUM}")
+print(f"  Estimated steps: {steps}")
+
+# ── Train ──────────────────────────────────────────────────────────────
+print("\nStarting training...")
+print("=" * 60)
+
+# NOTE(review): the `tokenizer=` argument and SFTConfig's `max_seq_length` are
+# spelled differently across TRL releases — confirm against the pinned version.
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=ds,
+    args=SFTConfig(
+        output_dir=OUT,
+        per_device_train_batch_size=BATCH,
+        gradient_accumulation_steps=GRAD_ACCUM,
+        num_train_epochs=EPOCHS,
+        learning_rate=LR,
+        bf16=True,
+        logging_steps=2,
+        save_steps=999999,  # Don't save checkpoints during training
+        warmup_ratio=0.1,
+        optim="adamw_torch",
+        seed=42,
+        report_to="none",
+        max_seq_length=MAX_SEQ,
+        dataset_num_proc=1,
+    ),
+)
+
+trainer.train()
+
+print("=" * 60)
+print("✓ Training complete")
+
+# ── Save adapter ───────────────────────────────────────────────────────
+print(f"\nSaving adapter to {OUT}/")
+model.save_pretrained(OUT)
+tokenizer.save_pretrained(OUT)
+
+# Verify saved files
+adapter_path = os.path.join(OUT, "adapter_model.safetensors")
+if os.path.exists(adapter_path):
+    size_mb = os.path.getsize(adapter_path) / 1e6
+    print(f"✓ Adapter saved: {size_mb:.2f} MB")
+else:
+    print("✗ ERROR: adapter_model.safetensors not found")
+
+print("\n" + "=" * 60)
+print("SMOKE TEST PASSED")
+print("=" * 60)
+print(f"\nAdapter location: {OUT}/")
+print(f"Model: {MODEL}")
+print(f"Examples: {len(ds)}")
+print(f"LoRA rank: {RANK}")
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+"""Train BT-7274 memory LoRA on Qwen2.5-7B-Instruct using Unsloth.
+
+100 curated EEMS memories — knowledge injection.
+Run on junkpile (RTX 2000 Ada 16GB).
+
+Prerequisites:
+  1. Stop vLLM:  systemctl --user stop vllm-poc
+  2. Activate:   source ~/lora-train/bin/activate
+  3. Run:        python3 train_memory_lora.py
+  4. Restart:    systemctl --user start vllm-poc
+"""
+
+import os
+import torch
+from pathlib import Path
+from unsloth import FastLanguageModel
+from unsloth.chat_templates import get_chat_template, standardize_sharegpt
+from trl import SFTTrainer
+from transformers import TrainingArguments
+from datasets import load_dataset
+
+# ──────────────────────────────────────────────────────────────
+# CONFIG
+# ──────────────────────────────────────────────────────────────
+
+# Base checkpoint (a bnb 4-bit variant per its name); loaded below with
+# load_in_4bit=True.
+MODEL_NAME = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"
+# JSONL training file; a relative path, so it is resolved against the
+# directory the script is run from.
+DATASET_PATH = "bt7274_memory_100.jsonl"
+# Used as the trainer's output_dir and as the target for the final
+# adapter + tokenizer save.
+OUTPUT_DIR = "./bt7274-memory-lora"
+# Passed as max_seq_length to both the model loader and the trainer.
+# NOTE(review): the inline figures are in characters while this limit is
+# presumably in tokens — confirm 2048 is enough for the ~7K-char memories.
+MAX_SEQ_LEN = 2048        # memories avg ~1500 chars, some up to 7K
+# LoRA rank (r) and scaling alpha handed to get_peft_model.
+LORA_RANK = 16
+LORA_ALPHA = 16
+BATCH_SIZE = 1             # 16GB GPU + longer seqs — play safe
+GRAD_ACCUM = 8             # effective batch = 8
+EPOCHS = 5                 # small dataset — more epochs to converge
+# Peak learning rate; the trainer below uses a cosine schedule.
+LR = 2e-4
+# Passed to the trainer as warmup_steps (a fixed step count, not a ratio).
+WARMUP_STEPS = 5
+# Checkpoint / log cadence in trainer steps.
+SAVE_STEPS = 50
+LOGGING_STEPS = 5
+# Shared seed for LoRA initialisation (random_state) and the trainer.
+SEED = 42
+
+# ──────────────────────────────────────────────────────────────
+# LOAD MODEL
+# ──────────────────────────────────────────────────────────────
+
+# Load the base model and its tokenizer in 4-bit. dtype=None leaves the
+# compute dtype to the loader — presumably auto-detected; confirm against
+# the Unsloth documentation.
+print(f"Loading {MODEL_NAME}...")
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=MODEL_NAME,
+    max_seq_length=MAX_SEQ_LEN,
+    load_in_4bit=True,
+    dtype=None,
+)
+
+# Rebind the tokenizer to the "qwen-2.5" chat template; apply_template()
+# further down relies on tokenizer.apply_chat_template using it.
+tokenizer = get_chat_template(
+    tokenizer,
+    chat_template="qwen-2.5",
+)
+
+# ──────────────────────────────────────────────────────────────
+# PEFT CONFIG
+# ──────────────────────────────────────────────────────────────
+
+# Wrap the base model with LoRA adapters on the projection modules listed
+# in target_modules; `model` is rebound to the wrapped model.
+print("Applying LoRA...")
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=LORA_RANK,
+    lora_alpha=LORA_ALPHA,
+    target_modules=[
+        "q_proj", "k_proj", "v_proj", "o_proj",
+        "gate_proj", "up_proj", "down_proj",
+    ],
+    lora_dropout=0,
+    bias="none",
+    use_gradient_checkpointing="unsloth",
+    random_state=SEED,
+)
+
+# ──────────────────────────────────────────────────────────────
+# DATASET
+# ──────────────────────────────────────────────────────────────
+
+# Read the JSONL file as a single "train" split.
+print(f"Loading dataset from {DATASET_PATH}...")
+dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
+print(f"  {len(dataset)} examples loaded")
+
+# Presumably normalises ShareGPT-style records into the role/content shape
+# the chat template expects — confirm against the Unsloth docs.
+# apply_template() below reads the "conversations" column afterwards.
+dataset = standardize_sharegpt(dataset)
+
+
+def apply_template(examples):
+    """Apply Qwen2.5 chat template to conversations."""
+    convos = examples["conversations"]
+    texts = []
+    for convo in convos:
+        text = tokenizer.apply_chat_template(
+            convo,
+            tokenize=False,
+            add_generation_prompt=False,
+        )
+        texts.append(text)
+    return {"text": texts}
+
+
+print("Applying chat template...")
+dataset = dataset.map(apply_template, batched=True, num_proc=2)
+
+# ──────────────────────────────────────────────────────────────
+# TRAINER
+# ──────────────────────────────────────────────────────────────
+
+print("Setting up trainer...")
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=dataset,
+    dataset_text_field="text",
+    args=TrainingArguments(
+        output_dir=OUTPUT_DIR,
+        per_device_train_batch_size=BATCH_SIZE,
+        gradient_accumulation_steps=GRAD_ACCUM,
+        num_train_epochs=EPOCHS,
+        learning_rate=LR,
+        lr_scheduler_type="cosine",
+        warmup_steps=WARMUP_STEPS,
+        fp16=not torch.cuda.is_bf16_supported(),
+        bf16=torch.cuda.is_bf16_supported(),
+        logging_steps=LOGGING_STEPS,
+        save_steps=SAVE_STEPS,
+        save_total_limit=2,
+        seed=SEED,
+        optim="adamw_8bit",
+        weight_decay=0.01,
+        max_grad_norm=1.0,
+        report_to="none",
+        dataloader_num_workers=2,
+    ),
+    max_seq_length=MAX_SEQ_LEN,
+    dataset_num_proc=2,
+    packing=True,
+)
+
+# ──────────────────────────────────────────────────────────────
+# TRAIN
+# ──────────────────────────────────────────────────────────────
+
+print("Starting training...")
+stats = trainer.train()
+print(f"\nTraining complete!")
+print(f"  Total steps: {stats.global_step}")
+print(f"  Train loss: {stats.training_loss:.4f}")
+print(f"  Runtime: {stats.metrics['train_runtime']:.0f}s")
+
+# ──────────────────────────────────────────────────────────────
+# SAVE ADAPTER
+# ──────────────────────────────────────────────────────────────
+
+print(f"\nSaving adapter to {OUTPUT_DIR}...")
+model.save_pretrained(OUTPUT_DIR)
+tokenizer.save_pretrained(OUTPUT_DIR)
+
+adapter_path = Path(OUTPUT_DIR) / "adapter_model.safetensors"
+if adapter_path.exists():
+    size_mb = adapter_path.stat().st_size / (1024 * 1024)
+    print(f"  Adapter saved: {size_mb:.1f} MB")
+else:
+    print("  WARNING: adapter_model.safetensors not found!")
+
+print(f"\nDone. To serve with vLLM:")
+print(f"  Update vllm-poc.service to add:")
+print(f"    --enable-lora \\")
+print(f"    --lora-modules bt7274-memory={os.path.abspath(OUTPUT_DIR)} \\")
+print(f"    --max-lora-rank {LORA_RANK}")
@@ -0,0 +1,171 @@
+#!/home/madcat/lora-train/bin/python3
+"""Train BT-7274 memory LoRA v2 on Qwen2.5-7B-Instruct using Unsloth.
+
+1000 curated EEMS memories — knowledge injection.
+Run on junkpile (RTX 2000 Ada 16GB).
+
+Changes from v1:
+  - Native messages format (role/content) — no ShareGPT conversion
+  - Completion-only loss — trains only on assistant responses
+  - Increased MAX_SEQ_LEN to 4096 for longer memories
+  - Adjusted for 1000 examples (more data = fewer epochs needed)
+
+Prerequisites:
+  1. Stop vLLM:  systemctl --user stop vllm-poc
+  2. Run:        ~/lora-train/bin/python3 train_memory_lora_v2.py
+  3. Restart:    systemctl --user start vllm-poc
+"""
+
+import os
+import torch
+from pathlib import Path
+from unsloth import FastLanguageModel
+from unsloth.chat_templates import get_chat_template
+from trl import SFTTrainer, SFTConfig
+from datasets import load_dataset
+
+# ──────────────────────────────────────────────────────────────
+# CONFIG
+# ──────────────────────────────────────────────────────────────
+
+# Base checkpoint (a bnb 4-bit variant per its name); loaded below with
+# load_in_4bit=True.
+MODEL_NAME = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"
+# JSONL training file; apply_template() below reads its "messages" column.
+DATASET_PATH = "bt7274_memory_1000.jsonl"
+# Used as the trainer's output_dir and as the target for the final
+# adapter + tokenizer save.
+OUTPUT_DIR = "./bt7274-memory-lora-v2"
+# Passed as max_seq_length to both the model loader and the trainer.
+MAX_SEQ_LEN = 4096        # longer for bigger memories
+# LoRA rank (r) and scaling alpha handed to get_peft_model.
+LORA_RANK = 16
+LORA_ALPHA = 16
+BATCH_SIZE = 1             # 16GB GPU — stay safe
+GRAD_ACCUM = 8             # effective batch = 8
+EPOCHS = 3                 # 1000 examples — 3 epochs is enough
+# Peak learning rate; the trainer below uses a cosine schedule.
+LR = 2e-4
+WARMUP_RATIO = 0.03        # 3% warmup (better than fixed steps for larger dataset)
+# Checkpoint / log cadence in trainer steps.
+SAVE_STEPS = 100
+LOGGING_STEPS = 10
+# Shared seed for LoRA initialisation (random_state) and the trainer.
+SEED = 42
+
+# ──────────────────────────────────────────────────────────────
+# LOAD MODEL
+# ──────────────────────────────────────────────────────────────
+
+# Load the base model and its tokenizer in 4-bit. dtype=None leaves the
+# compute dtype to the loader — presumably auto-detected; confirm against
+# the Unsloth documentation.
+print(f"Loading {MODEL_NAME}...")
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=MODEL_NAME,
+    max_seq_length=MAX_SEQ_LEN,
+    load_in_4bit=True,
+    dtype=None,
+)
+
+# Rebind the tokenizer to the "qwen-2.5" chat template; apply_template()
+# further down relies on tokenizer.apply_chat_template using it.
+tokenizer = get_chat_template(
+    tokenizer,
+    chat_template="qwen-2.5",
+)
+
+# ──────────────────────────────────────────────────────────────
+# PEFT CONFIG
+# ──────────────────────────────────────────────────────────────
+
+# Wrap the base model with LoRA adapters on the projection modules listed
+# in target_modules; `model` is rebound to the wrapped model.
+print("Applying LoRA...")
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=LORA_RANK,
+    lora_alpha=LORA_ALPHA,
+    target_modules=[
+        "q_proj", "k_proj", "v_proj", "o_proj",
+        "gate_proj", "up_proj", "down_proj",
+    ],
+    lora_dropout=0,
+    bias="none",
+    use_gradient_checkpointing="unsloth",
+    random_state=SEED,
+)
+
+# ──────────────────────────────────────────────────────────────
+# DATASET — native messages format
+# ──────────────────────────────────────────────────────────────
+
+# Read the JSONL file as a single "train" split. No format conversion is
+# applied here; apply_template() below reads the "messages" column as-is.
+print(f"Loading dataset from {DATASET_PATH}...")
+dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
+print(f"  {len(dataset)} examples loaded")
+
+
+def apply_template(examples):
+    """Apply Qwen2.5 chat template to messages."""
+    texts = []
+    for messages in examples["messages"]:
+        text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=False,
+        )
+        texts.append(text)
+    return {"text": texts}
+
+
+print("Applying chat template...")
+dataset = dataset.map(apply_template, batched=True, num_proc=2)
+
+# ──────────────────────────────────────────────────────────────
+# TRAINER — with completion-only loss
+# ──────────────────────────────────────────────────────────────
+
+print("Setting up trainer...")
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=dataset,
+    dataset_text_field="text",
+    args=SFTConfig(
+        output_dir=OUTPUT_DIR,
+        per_device_train_batch_size=BATCH_SIZE,
+        gradient_accumulation_steps=GRAD_ACCUM,
+        num_train_epochs=EPOCHS,
+        learning_rate=LR,
+        lr_scheduler_type="cosine",
+        warmup_ratio=WARMUP_RATIO,
+        fp16=not torch.cuda.is_bf16_supported(),
+        bf16=torch.cuda.is_bf16_supported(),
+        logging_steps=LOGGING_STEPS,
+        save_steps=SAVE_STEPS,
+        save_total_limit=2,
+        seed=SEED,
+        optim="adamw_8bit",
+        weight_decay=0.01,
+        max_grad_norm=1.0,
+        report_to="none",
+        dataloader_num_workers=2,
+    ),
+    max_seq_length=MAX_SEQ_LEN,
+    dataset_num_proc=2,
+    packing=True,
+)
+
+# ──────────────────────────────────────────────────────────────
+# TRAIN
+# ──────────────────────────────────────────────────────────────
+
+print("Starting training...")
+stats = trainer.train()
+print(f"\nTraining complete!")
+print(f"  Total steps: {stats.global_step}")
+print(f"  Train loss: {stats.training_loss:.4f}")
+print(f"  Runtime: {stats.metrics['train_runtime']:.0f}s")
+
+# ──────────────────────────────────────────────────────────────
+# SAVE ADAPTER
+# ──────────────────────────────────────────────────────────────
+
+print(f"\nSaving adapter to {OUTPUT_DIR}...")
+model.save_pretrained(OUTPUT_DIR)
+tokenizer.save_pretrained(OUTPUT_DIR)
+
+adapter_path = Path(OUTPUT_DIR) / "adapter_model.safetensors"
+if adapter_path.exists():
+    size_mb = adapter_path.stat().st_size / (1024 * 1024)
+    print(f"  Adapter saved: {size_mb:.1f} MB")
+else:
+    print("  WARNING: adapter_model.safetensors not found!")
+
+print(f"\nDone. To serve with vLLM:")
+print(f"  Update vllm-poc.service volume mount + lora-modules to point at:")
+print(f"    {os.path.abspath(OUTPUT_DIR)}")
+print(f"  Then: systemctl --user daemon-reload && systemctl --user start vllm-poc")
@@ -0,0 +1,216 @@
+"""Specialist LoRA trainer — parameterized for all adapters.
+
+Same architecture as train_qwen35_27b.py (bt7274 persona) but configurable
+per specialist via CLI args, falling back to per-adapter overrides and defaults.
+
+Usage:
+    # Rust specialist
+    python train_specialist.py --name oxidizer --data data/oxidizer.jsonl --max-seq 8192
+
+    # TypeScript specialist
+    python train_specialist.py --name prism --data data/prism.jsonl --max-seq 8192
+
+    # TTS cleanup (smaller sequences, more epochs)
+    python train_specialist.py --name trace --data data/trace.jsonl \
+        --max-seq 2048 --epochs 5 --lr 1e-4
+
+    # All defaults
+    python train_specialist.py --name oxidizer
+"""
+
+import argparse
+import os
+
+from unsloth import FastLanguageModel
+from trl import SFTTrainer, SFTConfig
+from datasets import load_dataset
+import torch
+
+# ── Defaults ─────────────────────────────────────────────────────────
+
+DEFAULTS = {
+    "model": "Qwen/Qwen3.5-27B",
+    "max_seq": 8192,
+    "rank": 16,
+    "alpha": 16,
+    "epochs": 3,
+    "batch": 1,
+    "grad_accum": 8,
+    "lr": 5e-5,
+    "warmup": 10,
+    "save_steps": 50,
+    "save_total_limit": 2,
+}
+
+# Per-adapter overrides
+ADAPTER_OVERRIDES = {
+    "bt7274":     {"max_seq": 4096, "lr": 1e-4, "data": "bt7274_v3.jsonl"},
+    "oxidizer":   {"data": "data/oxidizer.jsonl"},
+    "serpent":    {"data": "data/serpent.jsonl"},
+    "prism":     {"data": "data/prism.jsonl"},
+    "forge":     {"data": "data/forge.jsonl"},
+    "swiftblade": {"data": "data/swiftblade.jsonl"},
+    "trace":     {"max_seq": 2048, "lr": 1e-4, "epochs": 5, "data": "data/trace.jsonl"},
+}
+
+
+def fix_tool_calls(messages):
+    """Parse tool_call arguments from JSON strings to dicts for Qwen3.5 template."""
+    import json as _json
+    fixed = []
+    for msg in messages:
+        msg = dict(msg)
+        if msg.get("tool_calls"):
+            new_tcs = []
+            for tc in msg["tool_calls"]:
+                tc = dict(tc)
+                if "function" in tc:
+                    fn = dict(tc["function"])
+                    if isinstance(fn.get("arguments"), str):
+                        try:
+                            fn["arguments"] = _json.loads(fn["arguments"])
+                        except (ValueError, TypeError):
+                            fn["arguments"] = {"raw": fn["arguments"]}
+                    tc["function"] = fn
+                new_tcs.append(tc)
+            msg["tool_calls"] = new_tcs
+        fixed.append(msg)
+    return fixed
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Train specialist LoRA adapter")
+    parser.add_argument("--name", required=True, help="Adapter name (oxidizer, serpent, prism, forge, swiftblade, trace)")
+    parser.add_argument("--model", default=None, help=f"Base model (default: {DEFAULTS['model']})")
+    parser.add_argument("--data", default=None, help="Training data JSONL path")
+    parser.add_argument("--out", default=None, help="Output directory (default: adapters/<name>)")
+    parser.add_argument("--max-seq", type=int, default=None, help=f"Max sequence length")
+    parser.add_argument("--rank", type=int, default=None, help=f"LoRA rank")
+    parser.add_argument("--alpha", type=int, default=None, help=f"LoRA alpha")
+    parser.add_argument("--epochs", type=int, default=None, help=f"Training epochs")
+    parser.add_argument("--batch", type=int, default=None, help=f"Batch size")
+    parser.add_argument("--grad-accum", type=int, default=None, help=f"Gradient accumulation steps")
+    parser.add_argument("--lr", type=float, default=None, help=f"Learning rate")
+    parser.add_argument("--warmup", type=int, default=None, help=f"Warmup steps")
+    parser.add_argument("--resume", default=None, help="Resume from checkpoint path")
+    args = parser.parse_args()
+
+    # Resolve config: CLI > adapter overrides > defaults
+    overrides = ADAPTER_OVERRIDES.get(args.name, {})
+
+    def resolve(key, cli_val):
+        if cli_val is not None:
+            return cli_val
+        if key in overrides:
+            return overrides[key]
+        return DEFAULTS[key]
+
+    model_name = resolve("model", args.model)
+    max_seq = resolve("max_seq", args.max_seq)
+    rank = resolve("rank", args.rank)
+    alpha = resolve("alpha", args.alpha)
+    epochs = resolve("epochs", args.epochs)
+    batch = resolve("batch", args.batch)
+    grad_accum = resolve("grad_accum", args.grad_accum)
+    lr = resolve("lr", args.lr)
+    warmup = resolve("warmup", args.warmup)
+    data_path = args.data or overrides.get("data", f"data/{args.name}.jsonl")
+    out_dir = args.out or f"adapters/{args.name}"
+
+    print(f"══ Specialist LoRA Training: {args.name} ══")
+    print(f"Base model:  {model_name}")
+    print(f"Data:        {data_path}")
+    print(f"Output:      {out_dir}")
+    print(f"Max seq:     {max_seq}")
+    print(f"LoRA:        r={rank}, α={alpha}")
+    print(f"Training:    {epochs} epochs, batch {batch}, grad_accum {grad_accum}")
+    print(f"LR:          {lr}")
+    print(f"Warmup:      {warmup} steps")
+    print()
+
+    # ── Load model ───────────────────────────────────────────────────
+    print("Loading model (bf16, no quantization)...")
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=model_name,
+        max_seq_length=max_seq,
+        load_in_4bit=False,
+        load_in_16bit=True,
+        full_finetuning=False,
+        dtype=torch.bfloat16,
+    )
+
+    # ── LoRA adapter ─────────────────────────────────────────────────
+    print("Applying LoRA...")
+    model = FastLanguageModel.get_peft_model(
+        model,
+        r=rank,
+        lora_alpha=alpha,
+        lora_dropout=0,
+        target_modules=[
+            "q_proj", "k_proj", "v_proj", "o_proj",
+            "gate_proj", "up_proj", "down_proj",
+        ],
+        bias="none",
+        use_gradient_checkpointing="unsloth",
+        random_state=42,
+        max_seq_length=max_seq,
+    )
+
+    # ── Dataset ──────────────────────────────────────────────────────
+    print(f"Loading dataset: {data_path}")
+    ds = load_dataset("json", data_files=data_path, split="train")
+
+    def to_chatml(ex):
+        messages = fix_tool_calls(ex["messages"])
+        text = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=False
+        )
+        return {"text": text}
+
+    ds = ds.map(to_chatml)
+
+    steps = (len(ds) * epochs) // (batch * grad_accum)
+    print(f"Dataset:     {len(ds)} examples")
+    print(f"Epochs:      {epochs}, effective batch: {batch * grad_accum}")
+    print(f"Est. steps:  {steps}")
+
+    # ── Train ────────────────────────────────────────────────────────
+    print("\nStarting training...")
+    trainer = SFTTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        train_dataset=ds,
+        args=SFTConfig(
+            output_dir=out_dir,
+            per_device_train_batch_size=batch,
+            gradient_accumulation_steps=grad_accum,
+            num_train_epochs=epochs,
+            learning_rate=lr,
+            bf16=True,
+            logging_steps=5,
+            save_steps=resolve("save_steps", None),
+            save_total_limit=resolve("save_total_limit", None),
+            warmup_steps=warmup,
+            optim="adamw_8bit",
+            seed=42,
+            report_to="none",
+            max_seq_length=max_seq,
+            dataset_num_proc=1,
+        ),
+    )
+
+    if args.resume:
+        print(f"Resuming from checkpoint: {args.resume}")
+        trainer.train(resume_from_checkpoint=args.resume)
+    else:
+        trainer.train()
+
+    # ── Save ─────────────────────────────────────────────────────────
+    model.save_pretrained(out_dir)
+    tokenizer.save_pretrained(out_dir)
+    print(f"\n✓ Saved {args.name} adapter to {out_dir}/")
+    print(f"  Transfer to sin: ~/models/loras/{args.name}/")
+
+
+if __name__ == "__main__":
+    main()