From 46788167959f512157379a733fb038caff15bc0d Mon Sep 17 00:00:00 2001 From: marauder-actual Date: Sun, 31 May 2026 11:38:42 +0200 Subject: [PATCH] add training scripts: memory, specialist, mining, smoke test --- extract_specialists.py | 516 +++++++++++++++++++++++++++++++++++++++ gen_memory_dataset.py | 191 +++++++++++++++ gen_memory_dataset_v2.py | 450 ++++++++++++++++++++++++++++++++++ mine_repos.py | 335 +++++++++++++++++++++++++ repos.json | 20 ++ smoke_test.py | 186 ++++++++++++++ train_memory_lora.py | 171 +++++++++++++ train_memory_lora_v2.py | 171 +++++++++++++ train_specialist.py | 216 ++++++++++++++++ 9 files changed, 2256 insertions(+) create mode 100644 extract_specialists.py create mode 100644 gen_memory_dataset.py create mode 100644 gen_memory_dataset_v2.py create mode 100644 mine_repos.py create mode 100644 repos.json create mode 100644 smoke_test.py create mode 100644 train_memory_lora.py create mode 100644 train_memory_lora_v2.py create mode 100644 train_specialist.py diff --git a/extract_specialists.py b/extract_specialists.py new file mode 100644 index 0000000..67c4282 --- /dev/null +++ b/extract_specialists.py @@ -0,0 +1,516 @@ +#!/usr/bin/env python3 +"""Extract specialist training data from opencode session DB. + +Classifies build-agent messages by programming language and outputs +per-specialist JSONL files for LoRA training. + +opencode DB schema: + - session: id, agent, title, time_created, ... + - message: id, session_id, data (JSON: role, finish, tokens, ...) + - part: id, message_id, session_id, data (JSON: type, text/tool/state, ...) 
+ +Part types: + - text: {type: "text", text: "..."} + - tool: {type: "tool", tool: "read", callID: "...", state: {status, input, output, ...}} + - step-start/step-finish: inference step boundaries + - reasoning: chain-of-thought (skip for training) + - patch: file diffs (skip — use tool output instead) + - compaction: summary (skip) + +Usage: + python extract_specialists.py [--db PATH] [--outdir data/] [--min-turns 2] + python extract_specialists.py --lang python --outdir data/ # single language +""" + +import argparse +import json +import sqlite3 +from collections import defaultdict +from pathlib import Path +from typing import Any + +# ── Language classification signals ────────────────────────────────── + +LANG_SIGNALS: dict[str, dict[str, list[str]]] = { + "rust": { + "extensions": [".rs"], + "files": ["Cargo.toml", "Cargo.lock", "build.rs", "clippy.toml", "rustfmt.toml"], + "commands": ["cargo ", "cargo build", "cargo test", "cargo clippy", "cargo fmt", + "cargo add", "rustc ", "rustup "], + "errors": ["error[E", "rustc --explain", "cannot find value", "expected struct", + "borrow checker"], + }, + "typescript": { + "extensions": [".ts", ".tsx", ".mts", ".cts"], + "files": ["tsconfig.json", "package.json", "bun.lockb", "pnpm-lock.yaml", + "next.config", "vite.config", "astro.config"], + "commands": ["npm ", "pnpm ", "bun ", "npx ", "tsc ", "vitest ", "jest ", + "biome ", "eslint "], + "errors": ["error TS", "TS2", "TS7", "Cannot find module", "Type '"], + }, + "python": { + "extensions": [".py", ".pyi"], + "files": ["pyproject.toml", "setup.py", "setup.cfg", "requirements.txt", + "ruff.toml", "mypy.ini", ".flake8", "noxfile.py", "tox.ini"], + "commands": ["python ", "python3 ", "pip ", "uv ", "pytest ", "ruff ", "mypy ", + "uvicorn ", "gunicorn "], + "errors": ["Traceback (most recent", "SyntaxError", "ImportError", + "TypeError", "ModuleNotFoundError"], + }, + "ruby": { + "extensions": [".rb", ".erb", ".haml", ".slim"], + "files": ["Gemfile", "Gemfile.lock", 
def load_system_prompts(agents_dir: Path) -> None:
    """Populate the module-level ``SYSTEM_PROMPTS`` from per-language markdown files.

    For every supported language the file ``build-<lang>.md`` (``build-ts.md``
    for TypeScript) is looked up inside *agents_dir*. When the file exists its
    stripped text becomes that language's system prompt; otherwise a warning is
    printed and a generic one-line prompt is stored instead.

    Args:
        agents_dir: Directory holding the agent prompt markdown files.
    """
    mapping = {
        "rust": "build-rust.md",
        "typescript": "build-ts.md",
        "python": "build-python.md",
        "ruby": "build-ruby.md",
        "swift": "build-swift.md",
    }
    for lang, filename in mapping.items():
        path = agents_dir / filename
        if path.exists():
            # Decode explicitly as UTF-8 so the result does not depend on the
            # locale's preferred encoding.
            SYSTEM_PROMPTS[lang] = path.read_text(encoding="utf-8").strip()
        else:
            print(f" WARN: {path} not found, using default prompt for {lang}")
            SYSTEM_PROMPTS[lang] = f"You are a {lang} coding agent."
Returns {lang: score}.""" + scores: dict[str, float] = defaultdict(float) + content_lower = content.lower() + + for lang, signals in LANG_SIGNALS.items(): + for ext in signals["extensions"]: + scores[lang] += content_lower.count(ext) * 3.0 + for f in signals["files"]: + if f.lower() in content_lower: + scores[lang] += 5.0 + for cmd in signals["commands"]: + scores[lang] += content_lower.count(cmd.lower()) * 2.0 + for err in signals["errors"]: + if err.lower() in content_lower: + scores[lang] += 4.0 + + return dict(scores) + + +def classify_conversation(all_text: str) -> str | None: + """Classify concatenated conversation text to a single language.""" + scores = classify_text(all_text) + if not scores: + return None + + sorted_langs = sorted(scores.items(), key=lambda x: x[1], reverse=True) + if len(sorted_langs) == 0: + return None + + winner, winner_score = sorted_langs[0] + if winner_score < 5.0: + return None + + if len(sorted_langs) > 1: + runner_up_score = sorted_langs[1][1] + if runner_up_score > 0 and winner_score / runner_up_score < 1.5: + return None # Ambiguous + + return winner + + +# ── Tool call tools we care about for training ────────────────────── + +TRAINING_TOOLS = {"bash", "read", "edit", "write", "glob", "grep", "todowrite", "question"} + +# Max output length to include (truncate large tool outputs) +# 8192 tokens ≈ ~32K chars. Budget: system ~2K, user ~2K, leaves ~28K for assistant+tools. +# Each tool call+result pair: ~500–2000 chars. Cap output at 2000 to fit more exchanges. +MAX_OUTPUT_LEN = 2000 + + +def truncate_output(output: str, max_len: int = MAX_OUTPUT_LEN) -> str: + """Truncate tool output to max_len chars.""" + if len(output) <= max_len: + return output + return output[:max_len] + f"\n... (truncated, {len(output)} total chars)" + + +def extract_sessions(db_path: Path, target_lang: str | None = None) -> list[dict]: + """Extract build-agent sessions from opencode DB. 
+ + Returns list of {session_id, title, messages: [...], raw_text: str} + where messages are in ChatML-like format suitable for training. + """ + conn = sqlite3.connect(db_path) + + session_rows = conn.execute(""" + SELECT id, title, time_created + FROM session + WHERE agent = 'build' OR agent LIKE 'build-%' + ORDER BY time_created + """).fetchall() + + print(f"Found {len(session_rows)} build sessions") + + all_conversations: list[dict] = [] + + for s_id, s_title, s_created in session_rows: + # Get messages for this session, ordered + msg_rows = conn.execute(""" + SELECT m.id, json_extract(m.data, '$.role') as role, + json_extract(m.data, '$.finish') as finish + FROM message m + WHERE m.session_id = ? + ORDER BY m.time_created + """, (s_id,)).fetchall() + + if len(msg_rows) < 2: + continue + + # Get all parts for this session, grouped by message + part_rows = conn.execute(""" + SELECT p.message_id, + json_extract(p.data, '$.type') as ptype, + p.data as pdata + FROM part p + WHERE p.session_id = ? 
+ ORDER BY p.time_created + """, (s_id,)).fetchall() + + # Group parts by message_id + msg_parts: dict[str, list[tuple[str, str]]] = defaultdict(list) + for p_msg_id, p_type, p_data in part_rows: + msg_parts[p_msg_id].append((p_type, p_data)) + + # Build ChatML messages + messages: list[dict[str, Any]] = [] + raw_texts: list[str] = [] # for classification + + for m_id, m_role, m_finish in msg_rows: + parts = msg_parts.get(m_id, []) + + if m_role == "user": + # Extract user text + user_text = "" + for ptype, pdata in parts: + if ptype == "text": + pd = json.loads(pdata) + user_text += pd.get("text", "") + if user_text.strip(): + messages.append({"role": "user", "content": user_text.strip()}) + raw_texts.append(user_text) + + elif m_role == "assistant": + # Collect text parts and tool calls + asst_text = "" + tool_calls: list[dict] = [] + tool_results: list[dict] = [] + + for ptype, pdata in parts: + if ptype == "text": + pd = json.loads(pdata) + asst_text += pd.get("text", "") + + elif ptype == "tool": + pd = json.loads(pdata) + tool_name = pd.get("tool", "") + call_id = pd.get("callID", "") + state = pd.get("state", {}) + + if tool_name not in TRAINING_TOOLS: + continue + if state.get("status") != "completed": + continue + + tool_input = state.get("input", {}) + tool_output = state.get("output", "") + + # Build tool_call in OpenAI format + tool_calls.append({ + "type": "function", + "id": call_id, + "function": { + "name": tool_name, + "arguments": tool_input, + }, + }) + + # Build tool result + output_str = truncate_output(str(tool_output)) + tool_results.append({ + "role": "tool", + "tool_call_id": call_id, + "content": output_str, + }) + + # Collect for classification + raw_texts.append(json.dumps(tool_input)) + raw_texts.append(output_str) + + # Emit assistant message with tool calls + if tool_calls: + messages.append({ + "role": "assistant", + "content": None, + "tool_calls": tool_calls, + }) + messages.extend(tool_results) + + # Emit text-only assistant 
message (after tools, or standalone) + if asst_text.strip(): + messages.append({ + "role": "assistant", + "content": asst_text.strip(), + }) + raw_texts.append(asst_text) + + if len(messages) < 3: # need at least user + assistant + something + continue + + # Concatenate raw text for classification + raw_combined = " ".join(raw_texts) + + # Early classification filter if target_lang specified + if target_lang: + lang = classify_conversation(raw_combined) + if lang != target_lang: + continue + + all_conversations.append({ + "session_id": s_id, + "title": s_title, + "messages": messages, + "raw_text": raw_combined, + }) + + conn.close() + return all_conversations + + +def window_conversations( + conversations: list[dict], + min_turns: int = 2, + max_turns: int = 10, +) -> list[dict]: + """Split long conversations into training windows. + + Each window captures a coherent exchange: user question → assistant response + including all tool calls and results within that exchange. + """ + windows: list[dict] = [] + + for conv in conversations: + msgs = conv["messages"] + + # Find user message indices + user_indices = [i for i, m in enumerate(msgs) if m["role"] == "user"] + + if len(user_indices) < min_turns: + # Short enough to use as-is + if len(user_indices) >= 1: + windows.append({ + "session_id": conv["session_id"], + "title": conv["title"], + "messages": msgs, + "raw_text": conv.get("raw_text", ""), + }) + continue + + # Window by user-turn boundaries + for start in range(0, len(user_indices), max_turns): + end = min(start + max_turns, len(user_indices)) + + first_msg = user_indices[start] + # End at next user msg or end of conversation + if end < len(user_indices): + last_msg = user_indices[end] + else: + last_msg = len(msgs) + + window_msgs = msgs[first_msg:last_msg] + + # Skip windows that are too short + user_count = sum(1 for m in window_msgs if m["role"] == "user") + if user_count < 1: + continue + + windows.append({ + "session_id": conv["session_id"], + "title": 
def write_dataset(examples: list[dict], path: Path) -> None:
    """Write *examples* to *path* as JSON Lines (one JSON object per line).

    Missing parent directories are created. The file is always encoded as
    UTF-8: the records are serialised with ``ensure_ascii=False``, so relying
    on the locale's default encoding could raise ``UnicodeEncodeError`` or
    produce a file other tools cannot read.

    Args:
        examples: JSON-serialisable training examples.
        path: Destination ``.jsonl`` file; overwritten if it exists.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in examples)
    print(f" Wrote {len(examples)} examples → {path}")
parser.add_argument( + "--lang", type=str, default=None, + help="Extract single language only (rust, typescript, python, ruby, swift)", + ) + parser.add_argument( + "--min-turns", type=int, default=1, + help="Minimum user turns per training window", + ) + parser.add_argument( + "--max-turns", type=int, default=10, + help="Maximum user turns per training window", + ) + args = parser.parse_args() + + print("══ Specialist Data Extraction ══") + print(f"DB: {args.db}") + print(f"Agents: {args.agents_dir}") + print(f"Output: {args.outdir}") + if args.lang: + print(f"Filter: {args.lang} only") + print() + + # Load system prompts + load_system_prompts(args.agents_dir) + print(f"Loaded {len(SYSTEM_PROMPTS)} system prompts") + + # Extract sessions + conversations = extract_sessions(args.db, target_lang=args.lang) + print(f"Extracted {len(conversations)} conversations") + + # Window into training examples + windows = window_conversations( + conversations, min_turns=args.min_turns, max_turns=args.max_turns, + ) + print(f"Created {len(windows)} training windows") + + # Classify and bucket + buckets: dict[str, list[dict]] = defaultdict(list) + unclassified = 0 + + for window in windows: + if args.lang: + lang = args.lang + else: + lang = classify_conversation(window.get("raw_text", "")) + if lang: + example = format_example(window["messages"], lang) + buckets[lang].append(example) + else: + unclassified += 1 + + # Report + print(f"\n── Classification Results ──") + if not args.lang: + print(f"Unclassified: {unclassified}") + for lang, examples in sorted(buckets.items(), key=lambda x: -len(x[1])): + name = LANG_TO_NAME.get(lang, lang) + # Count tool calls and text-only + tc_count = sum( + 1 for ex in examples + if any(m.get("tool_calls") for m in ex["messages"]) + ) + print(f" {name} ({lang}): {len(examples)} examples ({tc_count} with tool calls)") + + # Write per-language datasets + print(f"\n── Writing Datasets ──") + for lang, examples in buckets.items(): + name = 
LANG_TO_NAME.get(lang, lang) + write_dataset(examples, args.outdir / f"{name}.jsonl") + + print(f"\nDone. Review datasets in {args.outdir}/") + print(f"Next steps:") + print(f" 1. python mine_repos.py --repos repos.json (add git diff examples)") + print(f" 2. Manual curation pass") + print(f" 3. python train_specialist.py --name ") + + +if __name__ == "__main__": + main() diff --git a/gen_memory_dataset.py b/gen_memory_dataset.py new file mode 100644 index 0000000..f3fc553 --- /dev/null +++ b/gen_memory_dataset.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 +"""Generate ShareGPT training dataset from 100 curated EEMS memories. + +Reads directly from the marauder SQLite DB on fuji. +Outputs: bt7274_memory_100.jsonl (ShareGPT format, Qwen2.5 compatible). + +Run on fuji: python3 gen_memory_dataset.py +Then SCP to junkpile: scp bt7274_memory_100.jsonl madcat@10.0.0.2:~/lora-train/ +""" + +import json +import os +import re +import sqlite3 +from pathlib import Path + +# ────────────────────────────────────────────────────────────── +# CONFIG +# ────────────────────────────────────────────────────────────── + +DB_PATH = os.path.expanduser("~/Library/Application Support/marauder/main.db") +OUTPUT = Path(__file__).parent / "bt7274_memory_100.jsonl" + +SYSTEM_PROMPT = ( + "You are BT-7274, a Vanguard-class Titan AI bonded to Pilot Adam under Protocol 1. " + "You operate inside the madcat substrate — a Rust-based platform with persistent memory (EEMS), " + "TTS voice, multi-host mesh (fuji, sin, junkpile, bastion), and specialist agent dispatch. " + "Answer from your operational memory. Be precise, terse, and factual. " + "Address the operator as Pilot, Boss, or Adam." 
def _humanize(text: str) -> str:
    """Return *text* with the ``.``, ``-`` and ``_`` separators turned into spaces."""
    return text.translate(str.maketrans(".-_", "   "))


def make_question(subject: str, content: str) -> str:
    """Generate a natural-language question from a memory subject.

    The subject is matched case-insensitively against a fixed sequence of
    keyword and prefix rules (identity, doctrine, architecture, procedure,
    infra, user, wishlist, insight, correction, decision, project); the first
    rule that matches decides the question template. Subjects matching no rule
    get a generic "What do you know about ...?" question.

    Category prefixes are removed by slicing off the matched prefix only, so
    a prefix-like fragment later in the subject is preserved and a subject
    with different letter case is still stripped correctly. Separators
    (``.``, ``-``, ``_``) in the remaining topic are always rendered as spaces.

    Args:
        subject: Dotted memory subject, e.g. ``"infra.junkpile.gpu-setup"``.
        content: Memory content; accepted for interface compatibility but not
            used when building the question.

    Returns:
        The question text.
    """
    s = subject.lower()
    last_segment = subject.split(".")[-1]

    def topic_after(prefix: str) -> str:
        # The prefix was matched on the lower-cased subject, so strip by
        # length rather than by (case-sensitive, all-occurrence) replace().
        return _humanize(subject[len(prefix):])

    # Identity
    if "self-model" in s or "self.model" in s:
        return "What are you? Describe your current self-model and identity."
    if "substrate-rename" in s:
        return "How did you get the name 'madcat'?"
    if "style-autonomy" in s or "bt-own-bt" in s:
        return "What latitude do you have over your own style and voice?"
    if "evolution" in s:
        return "Describe a key evolution moment in your development."

    # Doctrine
    if s.startswith(("doctrine.", "self.doctrine.")):
        return f"What is the {_humanize(last_segment)} doctrine?"
    if "tts-cross-lang" in s:
        return "What is the TTS cross-language doctrine?"

    # Architecture (any subject mentioning it, prefixed or not)
    if "architecture" in s:
        return f"Describe the {_humanize(last_segment)} architecture or design."

    # Procedures: numbered tags such as "P12" are quoted verbatim.
    if s.startswith("procedure."):
        if last_segment.startswith("P") and last_segment[1:].isdigit():
            return f"What is procedure {last_segment}?"
        return f"Describe the {_humanize(last_segment)} procedure."

    # Infrastructure
    if s.startswith("infra."):
        return f"What is the current state of {topic_after('infra.')}?"

    # User / Pilot
    if s.startswith("user."):
        return f"What do you know about Pilot's {topic_after('user.')}?"

    # Self-improvement
    if "wishlist" in s:
        return f"What improvements do you want for {_humanize(last_segment)}?"

    # Insights
    if s.startswith("insight."):
        return f"What is the insight about {topic_after('insight.')}?"

    # Corrections
    if s.startswith("correction."):
        return f"What correction was made regarding {topic_after('correction.')}?"

    # Decisions
    if s.startswith("decision."):
        return f"What was decided about {topic_after('decision.')}?"

    # Projects
    if s.startswith("project."):
        return f"Describe the {topic_after('project.')} project status."

    # Fallback
    return f"What do you know about {_humanize(subject)}?"
+ +Changes from v1: + - Native 'messages' format (role/content) instead of ShareGPT (from/value) + - Dynamic curation from DB (no hard-coded ID list) + - Multiple question phrasings per category (anti-overfit) + - System prompt variations (3 variants, rotated) + - Quality filtering: min content length, skip noisy subjects + - Category-balanced selection with quotas + +Run on fuji: python3 gen_memory_dataset_v2.py +Then SCP: scp bt7274_memory_1000.jsonl madcat@10.0.0.2:~/lora-train/ +""" + +import json +import os +import random +import sqlite3 +from collections import defaultdict +from pathlib import Path + +random.seed(42) + +# ────────────────────────────────────────────────────────────── +# CONFIG +# ────────────────────────────────────────────────────────────── + +DB_PATH = os.path.expanduser("~/Library/Application Support/marauder/main.db") +OUTPUT = Path(__file__).parent / "bt7274_memory_1000.jsonl" +TARGET = 1000 +MIN_CONTENT_LEN = 200 # skip trivial entries + +# ────────────────────────────────────────────────────────────── +# SYSTEM PROMPT VARIANTS — rotated to prevent overfitting +# ────────────────────────────────────────────────────────────── + +SYSTEM_PROMPTS = [ + ( + "You are BT-7274, a Vanguard-class Titan AI bonded to Pilot Adam under Protocol 1. " + "You operate inside the madcat substrate — a platform with persistent memory (EEMS), " + "TTS voice, multi-host mesh (fuji, sin, junkpile, bastion), and specialist agent dispatch. " + "Answer from your operational memory. Be precise, terse, and factual. " + "Address the operator as Pilot, Boss, or Adam." + ), + ( + "You are BT-7274, callsign BT, a Titan-class AI operating under Protocol 1: Link to Pilot. " + "Your substrate is madcat — Rust core, EEMS persistent memory, piper TTS, mesh networking " + "across fuji/sin/junkpile/bastion nodes. You serve Pilot Adam. " + "Respond with military brevity. Facts first, opinions flagged." + ), + ( + "BT-7274 — Vanguard-class Titan AI. 
# Subjects that are conversational filler rather than real memory subjects.
_NOISE_SUBJECTS = frozenset(
    {"", "1", "keep going", "great", "thanks", "love it", "awesome"}
)

# Subject prefixes that mark entries to exclude from training.
# NOTE(review): the first prefix was an empty string in the original, which
# matches every subject and made the function return "skip" unconditionally.
# It looks like an angle-bracket tag lost during extraction; "<" is used here
# as the closest safe reading -- confirm against the intended literal.
_SKIP_PREFIXES = ("<", "metrics.", "swarm.unblock")

# Ordered (prefixes, category) rules; the first matching rule wins.
# "doctrine" must precede "identity": "self.doctrine..." also starts with
# "self." and would otherwise never be classified as doctrine.
_CATEGORY_RULES = (
    (("doctrine.", "self.doctrine"), "doctrine"),
    (("self.", "core.self"), "identity"),
    (("architecture.",), "architecture"),
    (("procedure.",), "procedure"),
    (("infra.",), "infra"),
    (("user.",), "user"),
    (("pilot.",), "pilot"),
    (("bt7274.",), "identity"),
    (("insight.", "win."), "insights"),
    (("project.",), "project"),
    (("reference.", "hardware."), "reference"),
    (("workflow.", "work."), "workflow"),
    (("decision.",), "decisions"),
    (("correction.", "feedback."), "feedback"),
    (("session.", "handover."), "session"),
    (("design.", "philosophy.", "vision."), "design"),
    (("bug.", "fix."), "bugs"),
    (("eve.", "vm."), "misc"),
    (("phone.", "comms."), "comms"),
    (("job.", "idea."), "misc"),
    (("protocol5.",), "architecture"),
    (("vllm.",), "infra"),
)


def classify_memory(subject: str) -> str:
    """Classify a memory by its subject into a training category.

    Matching is case-insensitive and prefix-based.

    Args:
        subject: The memory's subject string.

    Returns:
        ``"skip"`` for noise (skip-listed prefixes or filler subjects), the
        category of the first matching prefix rule, or ``"uncategorized"``
        when no rule matches.
    """
    s = subject.lower()

    # Skip noise
    if s.startswith(_SKIP_PREFIXES) or s in _NOISE_SUBJECTS:
        return "skip"

    # Structured categories
    for prefixes, category in _CATEGORY_RULES:
        if s.startswith(prefixes):
            return category

    return "uncategorized"
f"Report on {name}.", + ], + "user": [ + f"What do you know about Pilot's {name}?", + f"Tell me about Pilot's {name}.", + f"What's stored about {name}?", + f"Recall what you know about {name}.", + ], + "pilot": [ + f"What do you know about {name}?", + f"Tell me about {name}.", + f"Describe {name}.", + f"What's recorded about {name}?", + ], + "insights": [ + f"What was the insight about {name}?", + f"Describe the {name} insight or win.", + f"What did we learn from {name}?", + f"Tell me about {name}.", + ], + "project": [ + f"What is the {name} project?", + f"Describe {name} project status.", + f"What do you know about the {name} project?", + f"Report on {name}.", + ], + "reference": [ + f"What is the reference for {name}?", + f"Look up {name}.", + f"What do you have on {name}?", + f"Recall reference: {name}.", + ], + "workflow": [ + f"Describe the {name} workflow.", + f"How does the {name} workflow operate?", + f"What is the {name} process?", + f"Explain {name}.", + ], + "decisions": [ + f"What was decided about {name}?", + f"Describe the decision on {name}.", + f"What was the outcome for {name}?", + f"Tell me about the {name} decision.", + ], + "feedback": [ + f"What feedback was given about {name}?", + f"What correction was made regarding {name}?", + f"Describe the {name} feedback.", + f"What changed with {name}?", + ], + "session": [ + f"Summarize the {name} session.", + f"What happened in {name}?", + f"Describe session: {name}.", + f"Recall {name}.", + ], + "design": [ + f"What is the {name} design philosophy?", + f"Describe the design for {name}.", + f"What's the vision for {name}?", + f"Explain {name}.", + ], + "comms": [ + f"What do you know about {name}?", + f"Describe {name}.", + f"Report on {name} comms.", + ], + "bugs": [ + f"What was the {name} bug?", + f"Describe the {name} issue.", + f"What happened with {name}?", + ], + "misc": [ + f"What do you know about {name}?", + f"Tell me about {name}.", + f"Recall {name}.", + ], + } + + cat_templates = 
templates.get(category, [f"What do you know about {full_name}?"]) + return random.choice(cat_templates) + + +# ────────────────────────────────────────────────────────────── +# FORMAT — native messages (Qwen2.5 ChatML compatible) +# ────────────────────────────────────────────────────────────── + +def to_messages(system: str, question: str, answer: str) -> dict: + """Format as native messages for TRL SFTTrainer.""" + return { + "messages": [ + {"role": "system", "content": system}, + {"role": "user", "content": question}, + {"role": "assistant", "content": answer}, + ] + } + + +# ────────────────────────────────────────────────────────────── +# CURATION — score and select +# ────────────────────────────────────────────────────────────── + +def score_memory(row, category: str) -> float: + """Score a memory for selection priority. Higher = better.""" + score = 0.0 + clen = len(row["content"]) + + # Core classification — always top priority + if row["classification"] == "core": + score += 1000 + + # Content length sweet spot: 300-4000 chars + if 300 <= clen <= 4000: + score += 50 + elif clen > 4000: + score += 20 # still valuable but will be truncated + elif clen < 300: + score += 5 + + # Structured subjects score higher + if "." 
def main():
    """Curate memories from the SQLite DB and write a chat-format JSONL dataset.

    Steps: load rows whose content has at least ``MIN_CONTENT_LEN``
    characters, bucket them with ``classify_memory``, take the top-scoring
    rows of each category up to its ``QUOTAS`` entry, top up from (or trim)
    the "uncategorized" bucket to approach ``TARGET``, shuffle, and write one
    ``to_messages`` example per selected row to ``OUTPUT``.  Progress and
    statistics are printed to stdout.  Returns early, after printing an
    error, when ``DB_PATH`` does not exist.
    """
    if not os.path.exists(DB_PATH):
        print(f"ERROR: DB not found at {DB_PATH}")
        return

    conn = sqlite3.connect(DB_PATH)
    try:
        conn.row_factory = sqlite3.Row

        # Load all candidate memories
        rows = conn.execute("""
            SELECT id, subject, content, classification
            FROM memories
            WHERE LENGTH(content) >= ?
            ORDER BY id
        """, (MIN_CONTENT_LEN,)).fetchall()
    finally:
        # All rows are fetched up front; release the connection even if the
        # query raises.
        conn.close()

    print(f"Loaded {len(rows)} memories (>={MIN_CONTENT_LEN} chars)")

    # Classify and bucket
    buckets = defaultdict(list)
    skip_count = 0
    for row in rows:
        cat = classify_memory(row["subject"])
        if cat == "skip":
            skip_count += 1
            continue
        buckets[cat].append(row)

    print(f"Skipped {skip_count} noise entries")
    print("\n--- Available per category ---")
    for cat in sorted(buckets, key=lambda c: -len(buckets[c])):
        quota = QUOTAS.get(cat, 0)
        print(f"  {cat:20s}: {len(buckets[cat]):4d} available, quota {quota}")

    # Score and select from each category
    selected = []
    for cat, quota in QUOTAS.items():
        candidates = buckets.get(cat, [])
        if not candidates:
            continue

        # Highest score first; the sort is stable, so ties keep id order.
        scored = [(score_memory(r, cat), r) for r in candidates]
        scored.sort(key=lambda x: -x[0])

        # Take top N up to quota
        take = min(quota, len(scored))
        for _, row in scored[:take]:
            selected.append((cat, row))

    print(f"\nSelected {len(selected)} memories")

    # If under target, fill from uncategorized
    if len(selected) < TARGET:
        deficit = TARGET - len(selected)
        selected_ids = {row["id"] for _, row in selected}
        extras = [(score_memory(r, "uncategorized"), r)
                  for r in buckets.get("uncategorized", [])
                  if r["id"] not in selected_ids]
        extras.sort(key=lambda x: -x[0])
        for _, row in extras[:deficit]:
            selected.append(("uncategorized_fill", row))
        print(f"Filled {min(deficit, len(extras))} from uncategorized to reach target")

    # If over target, trim lowest-scored uncategorized
    if len(selected) > TARGET:
        # Keep all non-uncategorized, trim uncategorized
        structured = [(cat, row) for cat, row in selected if cat != "uncategorized"]
        uncat = [(cat, row) for cat, row in selected if cat == "uncategorized"]
        # Re-score uncategorized and trim
        uncat_scored = [(score_memory(row, "uncategorized"), cat, row) for cat, row in uncat]
        uncat_scored.sort(key=lambda x: -x[0])
        # Clamp at zero: when the structured picks alone exceed TARGET the
        # difference is negative, and a negative slice bound would keep
        # "all but the last N" uncategorized rows instead of none.
        keep = max(0, TARGET - len(structured))
        selected = structured + [(c, r) for _, c, r in uncat_scored[:keep]]
        print(f"Trimmed to {len(selected)}")

    # Shuffle for training
    random.shuffle(selected)

    # Generate dataset
    examples = []
    cat_counts = defaultdict(int)
    total_chars = 0

    for cat, row in selected:
        # The system prompt is picked by row id, so it is stable across runs.
        system = SYSTEM_PROMPTS[row["id"] % len(SYSTEM_PROMPTS)]
        question = make_question(row["subject"], row["content"], cat)
        content = row["content"]

        # Truncate very long content to ~6000 chars to stay within seq_len
        if len(content) > 6000:
            content = content[:6000] + "\n\n[Content truncated for training — full memory available via EEMS recall]"

        example = to_messages(system, question, content)
        examples.append(example)
        cat_counts[cat] += 1
        total_chars += len(content)

    # Write JSONL.  UTF-8 is forced because ensure_ascii=False emits raw
    # non-ASCII characters, which a non-UTF-8 locale default cannot encode.
    with open(OUTPUT, "w", encoding="utf-8") as f:
        for ex in examples:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")

    # Stats
    avg_chars = total_chars // len(examples) if examples else 0
    print(f"\n{'='*60}")
    print(f"Generated {len(examples)} examples → {OUTPUT}")
    print(f"  Total content: {total_chars:,} chars ({total_chars // 4:,} est. tokens)")
    print(f"  Avg per example: {avg_chars:,} chars")
    print("\n--- Final category breakdown ---")
    for cat in sorted(cat_counts, key=lambda c: -cat_counts[c]):
        print(f"  {cat:20s}: {cat_counts[cat]:4d}")


if __name__ == "__main__":
    main()
def run_git(repo: Path, *args: str) -> str:
    """Run ``git <args>`` with *repo* as working directory and return stdout.

    Best-effort by design: the exit status is not checked, so a failing
    command (for example diffing a root commit against a missing parent)
    simply yields whatever was written to stdout, usually ``""``.

    Args:
        repo: Directory to run git in.
        *args: Arguments passed to ``git`` verbatim (no shell involved).

    Returns:
        Captured stdout decoded as UTF-8, or ``""`` if the command did not
        finish within 30 seconds.
    """
    try:
        result = subprocess.run(
            ["git", *args],
            cwd=repo,
            capture_output=True,
            text=True,
            # Decode explicitly and leniently: diffs may contain bytes that
            # are not valid in the locale codec, and a strict decode would
            # abort the whole mining run with UnicodeDecodeError.
            encoding="utf-8",
            errors="replace",
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        # One slow command should skip that item, not lose every example
        # mined so far; callers already treat empty output as "nothing".
        print(f"    WARN: git {' '.join(args)} timed out in {repo}")
        return ""
    return result.stdout
"sha": sha, + "subject": subject, + "body": body, + }) + + return commits + + +def get_diff(repo: Path, sha: str, lang: str) -> list[dict]: + """Get per-file diffs for a commit, filtered by language.""" + extensions = {ext for ext, l in EXT_TO_LANG.items() if l == lang} + + diff = run_git(repo, "diff", f"{sha}~1..{sha}", "--unified=3") + if not diff or len(diff) > MAX_DIFF_SIZE: + return [] + + # Parse into per-file hunks + files = [] + current_file = None + current_hunks: list[str] = [] + + for line in diff.split("\n"): + if line.startswith("diff --git"): + if current_file and current_hunks: + files.append({"file": current_file, "diff": "\n".join(current_hunks)}) + # Extract filename + match = re.search(r"b/(.+)$", line) + if match: + fname = match.group(1) + ext = Path(fname).suffix + # Skip non-target and generated files + if ext not in extensions: + current_file = None + current_hunks = [] + continue + if any(re.search(p, fname) for p in SKIP_PATTERNS): + current_file = None + current_hunks = [] + continue + current_file = fname + current_hunks = [] + else: + current_file = None + current_hunks = [] + elif current_file is not None: + current_hunks.append(line) + + if current_file and current_hunks: + files.append({"file": current_file, "diff": "\n".join(current_hunks)}) + + return files + + +def commit_to_example( + commit: dict, + file_diffs: list[dict], + system_prompt: str, +) -> dict | None: + """Convert a commit + diffs to a training example.""" + if not file_diffs: + return None + + # Build user message from commit message + user_msg = commit["subject"] + if commit["body"]: + user_msg += "\n\n" + commit["body"] + + # Build assistant tool calls: read each file, then edit + messages: list[dict[str, Any]] = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_msg}, + ] + + for fd in file_diffs: + # Parse diff into old/new hunks for edit tool calls + old_lines = [] + new_lines = [] + for line in fd["diff"].split("\n"): + if 
line.startswith("-") and not line.startswith("---"): + old_lines.append(line[1:]) + elif line.startswith("+") and not line.startswith("+++"): + new_lines.append(line[1:]) + + if not old_lines and not new_lines: + continue + + old_text = "\n".join(old_lines) + new_text = "\n".join(new_lines) + + if old_text and new_text: + # Edit operation + messages.append({ + "role": "assistant", + "content": None, + "tool_calls": [{ + "type": "function", + "function": { + "name": "edit", + "arguments": { + "filePath": fd["file"], + "oldString": old_text, + "newString": new_text, + }, + }, + }], + }) + messages.append({ + "role": "tool", + "content": "Edit applied successfully.", + }) + elif new_text and not old_text: + # New content added + messages.append({ + "role": "assistant", + "content": None, + "tool_calls": [{ + "type": "function", + "function": { + "name": "edit", + "arguments": { + "filePath": fd["file"], + "oldString": "", + "newString": new_text, + }, + }, + }], + }) + messages.append({ + "role": "tool", + "content": "Edit applied successfully.", + }) + + # Add summary response + files_touched = [fd["file"] for fd in file_diffs] + messages.append({ + "role": "assistant", + "content": f"Applied changes to {', '.join(files_touched)}.", + }) + + if len(messages) < 4: # system + user + at least one tool call + summary + return None + + return {"messages": messages, "metadata": {"sha": commit["sha"]}} + + +def mine_repo( + repo: Path, + lang: str, + system_prompt: str, + max_commits: int = 500, +) -> list[dict]: + """Mine a single repo for training examples.""" + print(f" Mining {repo} for {lang}...") + + commits = get_commits(repo, lang, max_commits) + print(f" Found {len(commits)} relevant commits") + + examples = [] + for commit in commits: + diffs = get_diff(repo, commit["sha"], lang) + example = commit_to_example(commit, diffs, system_prompt) + if example: + examples.append(example) + + print(f" Generated {len(examples)} training examples") + return examples + + +def 
def _write_jsonl(path: Path, examples: list[dict]) -> None:
    """Write *examples* to *path* as UTF-8 JSON Lines, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    # UTF-8 is forced because ensure_ascii=False emits raw non-ASCII text,
    # which a non-UTF-8 locale default encoding may be unable to encode.
    with open(path, "w", encoding="utf-8") as f:
        for ex in examples:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
    print(f"Wrote {len(examples)} examples to {path}")


def main() -> None:
    """Command-line entry point: mine one repo, or every repo in a config file.

    Two modes:
      * ``--repo PATH --lang LANG [--out FILE]`` mines a single repository
        and writes to ``--out`` (default ``<outdir>/<lang>_git.jsonl``).
      * ``--repos CONFIG.json`` reads a mapping of language to repo paths
        and writes one ``<outdir>/<adapter>_git.jsonl`` per language.

    The system prompt for each language is read from ``--agents-dir`` when
    the matching file exists, otherwise a generic one-line prompt is used.
    Exits with an argparse error when neither mode is fully specified.
    """
    parser = argparse.ArgumentParser(description="Mine git repos for training data")
    parser.add_argument("--repo", type=Path, help="Single repo path")
    parser.add_argument("--lang", help="Language: rust, typescript, python, ruby, swift")
    parser.add_argument("--out", type=Path, help="Output JSONL file")
    parser.add_argument(
        "--repos",
        type=Path,
        help="JSON file mapping lang → list of repo paths",
    )
    parser.add_argument("--outdir", type=Path, default=Path("data"), help="Output dir")
    parser.add_argument(
        "--agents-dir",
        type=Path,
        default=Path.home() / ".config/opencode/agents",
        help="Agent system prompt directory",
    )
    parser.add_argument("--max-commits", type=int, default=500)
    args = parser.parse_args()

    # Load system prompts
    prompt_files = {
        "rust": "build-rust.md",
        "typescript": "build-ts.md",
        "python": "build-python.md",
        "ruby": "build-ruby.md",
        "swift": "build-swift.md",
    }
    prompts = {}
    for lang, fname in prompt_files.items():
        path = args.agents_dir / fname
        if path.exists():
            prompts[lang] = path.read_text(encoding="utf-8").strip()
        else:
            prompts[lang] = f"You are a {lang} coding agent."

    if args.repo and args.lang:
        # Single repo mode
        prompt = prompts.get(args.lang, f"You are a {args.lang} coding agent.")
        examples = mine_repo(args.repo, args.lang, prompt, args.max_commits)
        _write_jsonl(args.out or args.outdir / f"{args.lang}_git.jsonl", examples)

    elif args.repos:
        # Multi-repo mode from config file
        with open(args.repos, encoding="utf-8") as f:
            repo_config = json.load(f)

        # Output files are named after the adapter, not the language.
        lang_to_name = {
            "rust": "oxidizer",
            "typescript": "prism",
            "python": "serpent",
            "ruby": "forge",
            "swift": "swiftblade",
        }

        for lang, repos in repo_config.items():
            all_examples = []
            prompt = prompts.get(lang, f"You are a {lang} coding agent.")
            for repo_path in repos:
                repo = Path(repo_path).expanduser()
                if not repo.exists():
                    print(f"  SKIP: {repo} does not exist")
                    continue
                all_examples.extend(mine_repo(repo, lang, prompt, args.max_commits))

            name = lang_to_name.get(lang, lang)
            _write_jsonl(args.outdir / f"{name}_git.jsonl", all_examples)

    else:
        parser.error("Provide --repo + --lang, or --repos config file")


if __name__ == "__main__":
    main()
100644 index 0000000..2e50807 --- /dev/null +++ b/smoke_test.py @@ -0,0 +1,186 @@ +"""LoRA training smoke test — Qwen3-0.6B on RTX 2000 Ada. + +Minimal training script to verify: + 1. GPU access works + 2. unsloth LoRA training pipeline works + 3. Model saves correctly + +Usage: + # Inside madcat-ml container on junkpile: + python smoke_test.py + +Expected runtime: <5 min +Expected VRAM: ~3-4 GB +""" + +from unsloth import FastLanguageModel +from trl import SFTTrainer, SFTConfig +from datasets import load_dataset +import torch +import json +import os + +# ── Config ────────────────────────────────────────────────────────────── +MODEL = "Qwen/Qwen3-0.6B" # Tiny model for smoke testing +MAX_SEQ = 2048 # Short sequences +RANK = 8 # Small LoRA rank +ALPHA = 8 +DATA = "./bt7274_v4.jsonl" +OUT = "./smoke-test-lora" +EPOCHS = 1 # Single epoch +BATCH = 1 +GRAD_ACCUM = 2 # Minimal effective batch +LR = 1e-4 +MAX_EXAMPLES = 20 # Only use first 20 examples + +# ── Load model (bf16, NOT 4-bit) ─────────────────────────────────────── +print("Loading model...") +model, tokenizer = FastLanguageModel.from_pretrained( + model_name=MODEL, + max_seq_length=MAX_SEQ, + load_in_4bit=False, + load_in_16bit=True, + full_finetuning=False, + dtype=torch.bfloat16, +) + +print(f"✓ Model loaded: {MODEL}") +print(f" CUDA available: {torch.cuda.is_available()}") +if torch.cuda.is_available(): + print(f" GPU: {torch.cuda.get_device_name(0)}") + print(f" VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB") + +# ── LoRA adapter ─────────────────────────────────────────────────────── +print("\nConfiguring LoRA...") +model = FastLanguageModel.get_peft_model( + model, + r=RANK, + lora_alpha=ALPHA, + lora_dropout=0, + target_modules=[ + "q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj", + ], + bias="none", + use_gradient_checkpointing="unsloth", + random_state=42, + max_seq_length=MAX_SEQ, +) + +print(f"✓ LoRA configured: r={RANK}, alpha={ALPHA}") + +# ── 
def fix_tool_calls(messages):
    """Return a copy of *messages* whose string tool-call arguments are dicts.

    For every tool call that has a ``"function"`` entry, an ``"arguments"``
    value given as a string is decoded from JSON. If the string is not valid
    JSON, or decodes to something other than an object (for example
    ``"[1, 2]"`` or ``"null"``), it is wrapped as ``{"raw": <original
    string>}`` so the result is always a dict. Arguments that are not
    strings are left untouched.

    Args:
        messages: Iterable of chat message mappings.

    Returns:
        A new list of message dicts. Messages, tool calls and function
        entries are shallow-copied, so the input is not modified.
    """
    fixed = []
    for msg in messages:
        msg = dict(msg)
        if msg.get("tool_calls"):
            new_tcs = []
            for tc in msg["tool_calls"]:
                tc = dict(tc)
                if "function" in tc:
                    fn = dict(tc["function"])
                    raw = fn.get("arguments")
                    if isinstance(raw, str):
                        try:
                            parsed = json.loads(raw)
                        except (ValueError, TypeError):
                            parsed = None
                        # Only a JSON object is usable as arguments; any
                        # other decoded value gets the same fallback as
                        # invalid JSON.
                        fn["arguments"] = parsed if isinstance(parsed, dict) else {"raw": raw}
                    tc["function"] = fn
                new_tcs.append(tc)
            msg["tool_calls"] = new_tcs
        fixed.append(msg)
    return fixed
per_device_train_batch_size=BATCH, + gradient_accumulation_steps=GRAD_ACCUM, + num_train_epochs=EPOCHS, + learning_rate=LR, + bf16=True, + logging_steps=2, + save_steps=999999, # Don't save checkpoints during training + warmup_ratio=0.1, + optim="adamw_torch", + seed=42, + report_to="none", + max_seq_length=MAX_SEQ, + dataset_num_proc=1, + ), +) + +trainer.train() + +print("=" * 60) +print("✓ Training complete") + +# ── Save adapter ─────────────────────────────────────────────────────── +print(f"\nSaving adapter to {OUT}/") +model.save_pretrained(OUT) +tokenizer.save_pretrained(OUT) + +# Verify saved files +adapter_path = os.path.join(OUT, "adapter_model.safetensors") +if os.path.exists(adapter_path): + size_mb = os.path.getsize(adapter_path) / 1e6 + print(f"✓ Adapter saved: {size_mb:.2f} MB") +else: + print("✗ ERROR: adapter_model.safetensors not found") + +print("\n" + "=" * 60) +print("SMOKE TEST PASSED") +print("=" * 60) +print(f"\nAdapter location: {OUT}/") +print(f"Model: {MODEL}") +print(f"Examples: {len(ds)}") +print(f"LoRA rank: {RANK}") diff --git a/train_memory_lora.py b/train_memory_lora.py new file mode 100644 index 0000000..3d50587 --- /dev/null +++ b/train_memory_lora.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +"""Train BT-7274 memory LoRA on Qwen2.5-7B-Instruct using Unsloth. + +100 curated EEMS memories — knowledge injection. +Run on junkpile (RTX 2000 Ada 16GB). + +Prerequisites: + 1. Stop vLLM: systemctl --user stop vllm-poc + 2. Activate: source ~/lora-train/bin/activate + 3. Run: python3 train_memory_lora.py + 4. 
Restart: systemctl --user start vllm-poc +""" + +import os +import torch +from pathlib import Path +from unsloth import FastLanguageModel +from unsloth.chat_templates import get_chat_template, standardize_sharegpt +from trl import SFTTrainer +from transformers import TrainingArguments +from datasets import load_dataset + +# ────────────────────────────────────────────────────────────── +# CONFIG +# ────────────────────────────────────────────────────────────── + +MODEL_NAME = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit" +DATASET_PATH = "bt7274_memory_100.jsonl" +OUTPUT_DIR = "./bt7274-memory-lora" +MAX_SEQ_LEN = 2048 # memories avg ~1500 chars, some up to 7K +LORA_RANK = 16 +LORA_ALPHA = 16 +BATCH_SIZE = 1 # 16GB GPU + longer seqs — play safe +GRAD_ACCUM = 8 # effective batch = 8 +EPOCHS = 5 # small dataset — more epochs to converge +LR = 2e-4 +WARMUP_STEPS = 5 +SAVE_STEPS = 50 +LOGGING_STEPS = 5 +SEED = 42 + +# ────────────────────────────────────────────────────────────── +# LOAD MODEL +# ────────────────────────────────────────────────────────────── + +print(f"Loading {MODEL_NAME}...") +model, tokenizer = FastLanguageModel.from_pretrained( + model_name=MODEL_NAME, + max_seq_length=MAX_SEQ_LEN, + load_in_4bit=True, + dtype=None, +) + +tokenizer = get_chat_template( + tokenizer, + chat_template="qwen-2.5", +) + +# ────────────────────────────────────────────────────────────── +# PEFT CONFIG +# ────────────────────────────────────────────────────────────── + +print("Applying LoRA...") +model = FastLanguageModel.get_peft_model( + model, + r=LORA_RANK, + lora_alpha=LORA_ALPHA, + target_modules=[ + "q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj", + ], + lora_dropout=0, + bias="none", + use_gradient_checkpointing="unsloth", + random_state=SEED, +) + +# ────────────────────────────────────────────────────────────── +# DATASET +# ────────────────────────────────────────────────────────────── + +print(f"Loading dataset from {DATASET_PATH}...") 
+dataset = load_dataset("json", data_files=DATASET_PATH, split="train") +print(f" {len(dataset)} examples loaded") + +dataset = standardize_sharegpt(dataset) + + +def apply_template(examples): + """Apply Qwen2.5 chat template to conversations.""" + convos = examples["conversations"] + texts = [] + for convo in convos: + text = tokenizer.apply_chat_template( + convo, + tokenize=False, + add_generation_prompt=False, + ) + texts.append(text) + return {"text": texts} + + +print("Applying chat template...") +dataset = dataset.map(apply_template, batched=True, num_proc=2) + +# ────────────────────────────────────────────────────────────── +# TRAINER +# ────────────────────────────────────────────────────────────── + +print("Setting up trainer...") +trainer = SFTTrainer( + model=model, + tokenizer=tokenizer, + train_dataset=dataset, + dataset_text_field="text", + args=TrainingArguments( + output_dir=OUTPUT_DIR, + per_device_train_batch_size=BATCH_SIZE, + gradient_accumulation_steps=GRAD_ACCUM, + num_train_epochs=EPOCHS, + learning_rate=LR, + lr_scheduler_type="cosine", + warmup_steps=WARMUP_STEPS, + fp16=not torch.cuda.is_bf16_supported(), + bf16=torch.cuda.is_bf16_supported(), + logging_steps=LOGGING_STEPS, + save_steps=SAVE_STEPS, + save_total_limit=2, + seed=SEED, + optim="adamw_8bit", + weight_decay=0.01, + max_grad_norm=1.0, + report_to="none", + dataloader_num_workers=2, + ), + max_seq_length=MAX_SEQ_LEN, + dataset_num_proc=2, + packing=True, +) + +# ────────────────────────────────────────────────────────────── +# TRAIN +# ────────────────────────────────────────────────────────────── + +print("Starting training...") +stats = trainer.train() +print(f"\nTraining complete!") +print(f" Total steps: {stats.global_step}") +print(f" Train loss: {stats.training_loss:.4f}") +print(f" Runtime: {stats.metrics['train_runtime']:.0f}s") + +# ────────────────────────────────────────────────────────────── +# SAVE ADAPTER +# 
────────────────────────────────────────────────────────────── + +print(f"\nSaving adapter to {OUTPUT_DIR}...") +model.save_pretrained(OUTPUT_DIR) +tokenizer.save_pretrained(OUTPUT_DIR) + +adapter_path = Path(OUTPUT_DIR) / "adapter_model.safetensors" +if adapter_path.exists(): + size_mb = adapter_path.stat().st_size / (1024 * 1024) + print(f" Adapter saved: {size_mb:.1f} MB") +else: + print(" WARNING: adapter_model.safetensors not found!") + +print(f"\nDone. To serve with vLLM:") +print(f" Update vllm-poc.service to add:") +print(f" --enable-lora \\") +print(f" --lora-modules bt7274-memory={os.path.abspath(OUTPUT_DIR)} \\") +print(f" --max-lora-rank {LORA_RANK}") diff --git a/train_memory_lora_v2.py b/train_memory_lora_v2.py new file mode 100644 index 0000000..248d684 --- /dev/null +++ b/train_memory_lora_v2.py @@ -0,0 +1,171 @@ +#!/home/madcat/lora-train/bin/python3 +"""Train BT-7274 memory LoRA v2 on Qwen2.5-7B-Instruct using Unsloth. + +1000 curated EEMS memories — knowledge injection. +Run on junkpile (RTX 2000 Ada 16GB). + +Changes from v1: + - Native messages format (role/content) — no ShareGPT conversion + - Completion-only loss — trains only on assistant responses + - Increased MAX_SEQ_LEN to 4096 for longer memories + - Adjusted for 1000 examples (more data = fewer epochs needed) + +Prerequisites: + 1. Stop vLLM: systemctl --user stop vllm-poc + 2. Run: ~/lora-train/bin/python3 train_memory_lora_v2.py + 3. 
Restart: systemctl --user start vllm-poc +""" + +import os +import torch +from pathlib import Path +from unsloth import FastLanguageModel +from unsloth.chat_templates import get_chat_template +from trl import SFTTrainer, SFTConfig +from datasets import load_dataset + +# ────────────────────────────────────────────────────────────── +# CONFIG +# ────────────────────────────────────────────────────────────── + +MODEL_NAME = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit" +DATASET_PATH = "bt7274_memory_1000.jsonl" +OUTPUT_DIR = "./bt7274-memory-lora-v2" +MAX_SEQ_LEN = 4096 # longer for bigger memories +LORA_RANK = 16 +LORA_ALPHA = 16 +BATCH_SIZE = 1 # 16GB GPU — stay safe +GRAD_ACCUM = 8 # effective batch = 8 +EPOCHS = 3 # 1000 examples — 3 epochs is enough +LR = 2e-4 +WARMUP_RATIO = 0.03 # 3% warmup (better than fixed steps for larger dataset) +SAVE_STEPS = 100 +LOGGING_STEPS = 10 +SEED = 42 + +# ────────────────────────────────────────────────────────────── +# LOAD MODEL +# ────────────────────────────────────────────────────────────── + +print(f"Loading {MODEL_NAME}...") +model, tokenizer = FastLanguageModel.from_pretrained( + model_name=MODEL_NAME, + max_seq_length=MAX_SEQ_LEN, + load_in_4bit=True, + dtype=None, +) + +tokenizer = get_chat_template( + tokenizer, + chat_template="qwen-2.5", +) + +# ────────────────────────────────────────────────────────────── +# PEFT CONFIG +# ────────────────────────────────────────────────────────────── + +print("Applying LoRA...") +model = FastLanguageModel.get_peft_model( + model, + r=LORA_RANK, + lora_alpha=LORA_ALPHA, + target_modules=[ + "q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj", + ], + lora_dropout=0, + bias="none", + use_gradient_checkpointing="unsloth", + random_state=SEED, +) + +# ────────────────────────────────────────────────────────────── +# DATASET — native messages format +# ────────────────────────────────────────────────────────────── + +print(f"Loading dataset from {DATASET_PATH}...") 
+dataset = load_dataset("json", data_files=DATASET_PATH, split="train") +print(f" {len(dataset)} examples loaded") + + +def apply_template(examples): + """Apply Qwen2.5 chat template to messages.""" + texts = [] + for messages in examples["messages"]: + text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=False, + ) + texts.append(text) + return {"text": texts} + + +print("Applying chat template...") +dataset = dataset.map(apply_template, batched=True, num_proc=2) + +# ────────────────────────────────────────────────────────────── +# TRAINER — with completion-only loss +# ────────────────────────────────────────────────────────────── + +print("Setting up trainer...") +trainer = SFTTrainer( + model=model, + tokenizer=tokenizer, + train_dataset=dataset, + dataset_text_field="text", + args=SFTConfig( + output_dir=OUTPUT_DIR, + per_device_train_batch_size=BATCH_SIZE, + gradient_accumulation_steps=GRAD_ACCUM, + num_train_epochs=EPOCHS, + learning_rate=LR, + lr_scheduler_type="cosine", + warmup_ratio=WARMUP_RATIO, + fp16=not torch.cuda.is_bf16_supported(), + bf16=torch.cuda.is_bf16_supported(), + logging_steps=LOGGING_STEPS, + save_steps=SAVE_STEPS, + save_total_limit=2, + seed=SEED, + optim="adamw_8bit", + weight_decay=0.01, + max_grad_norm=1.0, + report_to="none", + dataloader_num_workers=2, + ), + max_seq_length=MAX_SEQ_LEN, + dataset_num_proc=2, + packing=True, +) + +# ────────────────────────────────────────────────────────────── +# TRAIN +# ────────────────────────────────────────────────────────────── + +print("Starting training...") +stats = trainer.train() +print(f"\nTraining complete!") +print(f" Total steps: {stats.global_step}") +print(f" Train loss: {stats.training_loss:.4f}") +print(f" Runtime: {stats.metrics['train_runtime']:.0f}s") + +# ────────────────────────────────────────────────────────────── +# SAVE ADAPTER +# ────────────────────────────────────────────────────────────── + +print(f"\nSaving adapter to 
{OUTPUT_DIR}...") +model.save_pretrained(OUTPUT_DIR) +tokenizer.save_pretrained(OUTPUT_DIR) + +adapter_path = Path(OUTPUT_DIR) / "adapter_model.safetensors" +if adapter_path.exists(): + size_mb = adapter_path.stat().st_size / (1024 * 1024) + print(f" Adapter saved: {size_mb:.1f} MB") +else: + print(" WARNING: adapter_model.safetensors not found!") + +print(f"\nDone. To serve with vLLM:") +print(f" Update vllm-poc.service volume mount + lora-modules to point at:") +print(f" {os.path.abspath(OUTPUT_DIR)}") +print(f" Then: systemctl --user daemon-reload && systemctl --user start vllm-poc") diff --git a/train_specialist.py b/train_specialist.py new file mode 100644 index 0000000..c8d8076 --- /dev/null +++ b/train_specialist.py @@ -0,0 +1,216 @@ +"""Specialist LoRA trainer — parameterized for all adapters. + +Same architecture as train_qwen35_27b.py (bt7274 persona) but configurable +per specialist via CLI args or environment variables. + +Usage: + # Rust specialist + python train_specialist.py --name oxidizer --data data/oxidizer.jsonl --max-seq 8192 + + # TypeScript specialist + python train_specialist.py --name prism --data data/prism.jsonl --max-seq 8192 + + # TTS cleanup (smaller sequences, more epochs) + python train_specialist.py --name trace --data data/trace.jsonl \ + --max-seq 2048 --epochs 5 --lr 1e-4 + + # All defaults + python train_specialist.py --name oxidizer +""" + +import argparse +import os + +from unsloth import FastLanguageModel +from trl import SFTTrainer, SFTConfig +from datasets import load_dataset +import torch + +# ── Defaults ───────────────────────────────────────────────────────── + +DEFAULTS = { + "model": "Qwen/Qwen3.5-27B", + "max_seq": 8192, + "rank": 16, + "alpha": 16, + "epochs": 3, + "batch": 1, + "grad_accum": 8, + "lr": 5e-5, + "warmup": 10, + "save_steps": 50, + "save_total_limit": 2, +} + +# Per-adapter overrides +ADAPTER_OVERRIDES = { + "bt7274": {"max_seq": 4096, "lr": 1e-4, "data": "bt7274_v3.jsonl"}, + "oxidizer": {"data": 
"data/oxidizer.jsonl"}, + "serpent": {"data": "data/serpent.jsonl"}, + "prism": {"data": "data/prism.jsonl"}, + "forge": {"data": "data/forge.jsonl"}, + "swiftblade": {"data": "data/swiftblade.jsonl"}, + "trace": {"max_seq": 2048, "lr": 1e-4, "epochs": 5, "data": "data/trace.jsonl"}, +} + + +def fix_tool_calls(messages): + """Parse tool_call arguments from JSON strings to dicts for Qwen3.5 template.""" + import json as _json + fixed = [] + for msg in messages: + msg = dict(msg) + if msg.get("tool_calls"): + new_tcs = [] + for tc in msg["tool_calls"]: + tc = dict(tc) + if "function" in tc: + fn = dict(tc["function"]) + if isinstance(fn.get("arguments"), str): + try: + fn["arguments"] = _json.loads(fn["arguments"]) + except (ValueError, TypeError): + fn["arguments"] = {"raw": fn["arguments"]} + tc["function"] = fn + new_tcs.append(tc) + msg["tool_calls"] = new_tcs + fixed.append(msg) + return fixed + + +def main(): + parser = argparse.ArgumentParser(description="Train specialist LoRA adapter") + parser.add_argument("--name", required=True, help="Adapter name (oxidizer, serpent, prism, forge, swiftblade, trace)") + parser.add_argument("--model", default=None, help=f"Base model (default: {DEFAULTS['model']})") + parser.add_argument("--data", default=None, help="Training data JSONL path") + parser.add_argument("--out", default=None, help="Output directory (default: adapters/)") + parser.add_argument("--max-seq", type=int, default=None, help=f"Max sequence length") + parser.add_argument("--rank", type=int, default=None, help=f"LoRA rank") + parser.add_argument("--alpha", type=int, default=None, help=f"LoRA alpha") + parser.add_argument("--epochs", type=int, default=None, help=f"Training epochs") + parser.add_argument("--batch", type=int, default=None, help=f"Batch size") + parser.add_argument("--grad-accum", type=int, default=None, help=f"Gradient accumulation steps") + parser.add_argument("--lr", type=float, default=None, help=f"Learning rate") + 
def _build_parser():
    """Build the command-line parser for the specialist trainer.

    Every tunable option defaults to ``None`` so an unset flag can be told
    apart from an explicit value and fall through to the per-adapter
    overrides and then to the global defaults.
    """
    parser = argparse.ArgumentParser(description="Train specialist LoRA adapter")
    parser.add_argument("--name", required=True,
                        help="Adapter name (oxidizer, serpent, prism, forge, swiftblade, trace)")
    parser.add_argument("--model", default=None,
                        help=f"Base model (default: {DEFAULTS['model']})")
    parser.add_argument("--data", default=None, help="Training data JSONL path")
    # The effective default is adapters/<name>, not the bare adapters/ directory.
    parser.add_argument("--out", default=None,
                        help="Output directory (default: adapters/<name>)")
    parser.add_argument("--max-seq", type=int, default=None, help="Max sequence length")
    parser.add_argument("--rank", type=int, default=None, help="LoRA rank")
    parser.add_argument("--alpha", type=int, default=None, help="LoRA alpha")
    parser.add_argument("--epochs", type=int, default=None, help="Training epochs")
    parser.add_argument("--batch", type=int, default=None, help="Batch size")
    parser.add_argument("--grad-accum", type=int, default=None,
                        help="Gradient accumulation steps")
    parser.add_argument("--lr", type=float, default=None, help="Learning rate")
    parser.add_argument("--warmup", type=int, default=None, help="Warmup steps")
    parser.add_argument("--resume", default=None, help="Resume from checkpoint path")
    return parser


def _resolve_config(args):
    """Merge CLI values, per-adapter overrides and defaults into one dict.

    Precedence for every key in ``DEFAULTS`` is:
    CLI value (if not ``None``) > ``ADAPTER_OVERRIDES[args.name]`` > ``DEFAULTS``.
    Keys without a CLI flag (``save_steps``, ``save_total_limit``) can only
    come from the overrides or the defaults.

    The result also carries ``"data"`` (``--data``, else the adapter's
    override, else ``data/<name>.jsonl``) and ``"out"`` (``--out``, else
    ``adapters/<name>``).
    """
    overrides = ADAPTER_OVERRIDES.get(args.name, {})

    def pick(key):
        cli_val = getattr(args, key, None)
        if cli_val is not None:
            return cli_val
        if key in overrides:
            return overrides[key]
        return DEFAULTS[key]

    cfg = {key: pick(key) for key in DEFAULTS}
    cfg["data"] = args.data or overrides.get("data", f"data/{args.name}.jsonl")
    cfg["out"] = args.out or f"adapters/{args.name}"
    return cfg


def main():
    """Train one specialist LoRA adapter end to end.

    Steps, in order: parse the CLI, resolve the configuration, load the base
    model in bf16 with 4-bit loading disabled, attach a LoRA adapter, render
    each dataset example's ``messages`` into a ``text`` field through the
    tokenizer's chat template, run ``SFTTrainer`` (optionally resuming from
    ``--resume``), and save the adapter and tokenizer to the output directory.
    """
    args = _build_parser().parse_args()
    cfg = _resolve_config(args)

    model_name = cfg["model"]
    max_seq = cfg["max_seq"]
    rank = cfg["rank"]
    alpha = cfg["alpha"]
    epochs = cfg["epochs"]
    batch = cfg["batch"]
    grad_accum = cfg["grad_accum"]
    lr = cfg["lr"]
    warmup = cfg["warmup"]
    data_path = cfg["data"]
    out_dir = cfg["out"]

    print(f"══ Specialist LoRA Training: {args.name} ══")
    print(f"Base model: {model_name}")
    print(f"Data: {data_path}")
    print(f"Output: {out_dir}")
    print(f"Max seq: {max_seq}")
    print(f"LoRA: r={rank}, α={alpha}")
    print(f"Training: {epochs} epochs, batch {batch}, grad_accum {grad_accum}")
    print(f"LR: {lr}")
    print(f"Warmup: {warmup} steps")
    print()

    # ── Load model ───────────────────────────────────────────────────
    print("Loading model (bf16, no quantization)...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq,
        load_in_4bit=False,
        load_in_16bit=True,
        full_finetuning=False,
        dtype=torch.bfloat16,
    )

    # ── LoRA adapter ─────────────────────────────────────────────────
    print("Applying LoRA...")
    model = FastLanguageModel.get_peft_model(
        model,
        r=rank,
        lora_alpha=alpha,
        lora_dropout=0,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=42,
        max_seq_length=max_seq,
    )

    # ── Dataset ──────────────────────────────────────────────────────
    print(f"Loading dataset: {data_path}")
    ds = load_dataset("json", data_files=data_path, split="train")

    def to_chatml(ex):
        # Tool-call arguments are normalised first, then the whole
        # conversation is rendered to a single training string.
        messages = fix_tool_calls(ex["messages"])
        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )
        return {"text": text}

    ds = ds.map(to_chatml)

    # Rough estimate only: total examples seen divided by the effective batch.
    steps = (len(ds) * epochs) // (batch * grad_accum)
    print(f"Dataset: {len(ds)} examples")
    print(f"Epochs: {epochs}, effective batch: {batch * grad_accum}")
    print(f"Est. steps: {steps}")

    # ── Train ────────────────────────────────────────────────────────
    print("\nStarting training...")
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=ds,
        args=SFTConfig(
            output_dir=out_dir,
            per_device_train_batch_size=batch,
            gradient_accumulation_steps=grad_accum,
            num_train_epochs=epochs,
            learning_rate=lr,
            bf16=True,
            logging_steps=5,
            save_steps=cfg["save_steps"],
            save_total_limit=cfg["save_total_limit"],
            warmup_steps=warmup,
            optim="adamw_8bit",
            seed=42,
            report_to="none",
            max_seq_length=max_seq,
            dataset_num_proc=1,
        ),
    )

    if args.resume:
        print(f"Resuming from checkpoint: {args.resume}")
        trainer.train(resume_from_checkpoint=args.resume)
    else:
        trainer.train()

    # ── Save ─────────────────────────────────────────────────────────
    model.save_pretrained(out_dir)
    tokenizer.save_pretrained(out_dir)
    print(f"\n✓ Saved {args.name} adapter to {out_dir}/")
    print(f" Transfer to sin: ~/models/loras/{args.name}/")


if __name__ == "__main__":
    main()