add training scripts: memory, specialist, mining, smoke test
This commit is contained in:
@@ -0,0 +1,516 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Extract specialist training data from opencode session DB.
|
||||
|
||||
Classifies build-agent messages by programming language and outputs
|
||||
per-specialist JSONL files for LoRA training.
|
||||
|
||||
opencode DB schema:
|
||||
- session: id, agent, title, time_created, ...
|
||||
- message: id, session_id, data (JSON: role, finish, tokens, ...)
|
||||
- part: id, message_id, session_id, data (JSON: type, text/tool/state, ...)
|
||||
|
||||
Part types:
|
||||
- text: {type: "text", text: "..."}
|
||||
- tool: {type: "tool", tool: "read", callID: "...", state: {status, input, output, ...}}
|
||||
- step-start/step-finish: inference step boundaries
|
||||
- reasoning: chain-of-thought (skip for training)
|
||||
- patch: file diffs (skip — use tool output instead)
|
||||
- compaction: summary (skip)
|
||||
|
||||
Usage:
|
||||
python extract_specialists.py [--db PATH] [--outdir data/] [--min-turns 2]
|
||||
python extract_specialists.py --lang python --outdir data/ # single language
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sqlite3
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# ── Language classification signals ──────────────────────────────────
|
||||
|
||||
LANG_SIGNALS: dict[str, dict[str, list[str]]] = {
|
||||
"rust": {
|
||||
"extensions": [".rs"],
|
||||
"files": ["Cargo.toml", "Cargo.lock", "build.rs", "clippy.toml", "rustfmt.toml"],
|
||||
"commands": ["cargo ", "cargo build", "cargo test", "cargo clippy", "cargo fmt",
|
||||
"cargo add", "rustc ", "rustup "],
|
||||
"errors": ["error[E", "rustc --explain", "cannot find value", "expected struct",
|
||||
"borrow checker"],
|
||||
},
|
||||
"typescript": {
|
||||
"extensions": [".ts", ".tsx", ".mts", ".cts"],
|
||||
"files": ["tsconfig.json", "package.json", "bun.lockb", "pnpm-lock.yaml",
|
||||
"next.config", "vite.config", "astro.config"],
|
||||
"commands": ["npm ", "pnpm ", "bun ", "npx ", "tsc ", "vitest ", "jest ",
|
||||
"biome ", "eslint "],
|
||||
"errors": ["error TS", "TS2", "TS7", "Cannot find module", "Type '"],
|
||||
},
|
||||
"python": {
|
||||
"extensions": [".py", ".pyi"],
|
||||
"files": ["pyproject.toml", "setup.py", "setup.cfg", "requirements.txt",
|
||||
"ruff.toml", "mypy.ini", ".flake8", "noxfile.py", "tox.ini"],
|
||||
"commands": ["python ", "python3 ", "pip ", "uv ", "pytest ", "ruff ", "mypy ",
|
||||
"uvicorn ", "gunicorn "],
|
||||
"errors": ["Traceback (most recent", "SyntaxError", "ImportError",
|
||||
"TypeError", "ModuleNotFoundError"],
|
||||
},
|
||||
"ruby": {
|
||||
"extensions": [".rb", ".erb", ".haml", ".slim"],
|
||||
"files": ["Gemfile", "Gemfile.lock", "Rakefile", ".ruby-version",
|
||||
".rubocop.yml", ".standard.yml"],
|
||||
"commands": ["bundle ", "rails ", "rake ", "rspec ", "rubocop ",
|
||||
"standardrb ", "gem "],
|
||||
"errors": ["NoMethodError", "NameError", "ArgumentError",
|
||||
"ActiveRecord::", "undefined method"],
|
||||
},
|
||||
"swift": {
|
||||
"extensions": [".swift"],
|
||||
"files": ["Package.swift", "project.yml", ".xcodeproj", ".xcworkspace"],
|
||||
"commands": ["swift build", "swift test", "swift run", "xcodebuild ",
|
||||
"swift-format ", "swift package "],
|
||||
"errors": ["cannot convert value of type", "protocol conformance",
|
||||
"value of type", "has no member"],
|
||||
},
|
||||
}
|
||||
|
||||
# Adapter codenames
|
||||
LANG_TO_NAME = {
|
||||
"rust": "oxidizer",
|
||||
"typescript": "prism",
|
||||
"python": "serpent",
|
||||
"ruby": "forge",
|
||||
"swift": "swiftblade",
|
||||
}
|
||||
|
||||
# System prompts per specialist
|
||||
SYSTEM_PROMPTS: dict[str, str] = {}


def load_system_prompts(agents_dir: Path) -> None:
    """Fill ``SYSTEM_PROMPTS`` from the per-language agent markdown files.

    For every supported language the matching ``build-*.md`` file is looked
    up inside *agents_dir*. When the file is present its stripped contents
    become the system prompt; otherwise a warning is printed and a generic
    one-line prompt is stored instead.

    Args:
        agents_dir: Directory expected to contain the ``build-*.md`` files.
    """
    prompt_files = {
        "rust": "build-rust.md",
        "typescript": "build-ts.md",
        "python": "build-python.md",
        "ruby": "build-ruby.md",
        "swift": "build-swift.md",
    }
    for lang, filename in prompt_files.items():
        path = agents_dir / filename
        # is_file() (rather than exists()) so a directory of the same name
        # falls back to the default prompt instead of crashing on read.
        if path.is_file():
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's default locale encoding.
            SYSTEM_PROMPTS[lang] = path.read_text(encoding="utf-8").strip()
        else:
            print(f" WARN: {path} not found, using default prompt for {lang}")
            SYSTEM_PROMPTS[lang] = f"You are a {lang} coding agent."
|
||||
|
||||
|
||||
def classify_text(content: str) -> dict[str, float]:
    """Score how strongly *content* points at each known language.

    Matching is case-insensitive substring matching against the signal lists
    in ``LANG_SIGNALS``. Extensions and commands are weighted per occurrence
    (3.0 and 2.0 each); marker files and error strings are weighted once per
    distinct signal present (5.0 and 4.0).

    Returns:
        A plain dict mapping language name to its accumulated score.
    """
    haystack = content.lower()
    totals: dict[str, float] = defaultdict(float)

    for lang, signals in LANG_SIGNALS.items():
        # Per-occurrence signals.
        extension_hits = sum(haystack.count(ext) for ext in signals["extensions"])
        command_hits = sum(haystack.count(cmd.lower()) for cmd in signals["commands"])
        # Presence-only signals.
        file_hits = sum(1 for name in signals["files"] if name.lower() in haystack)
        error_hits = sum(1 for err in signals["errors"] if err.lower() in haystack)

        totals[lang] += extension_hits * 3.0
        totals[lang] += file_hits * 5.0
        totals[lang] += command_hits * 2.0
        totals[lang] += error_hits * 4.0

    return dict(totals)
|
||||
|
||||
|
||||
def classify_conversation(all_text: str) -> str | None:
    """Pick the single language a conversation's text belongs to, if any.

    The text is scored with ``classify_text``. The top-scoring language wins
    only when its score is at least 5.0 and, if the second-best score is
    positive, at least 1.5 times that second-best score.

    Returns:
        The winning language name, or ``None`` when there are no scores, the
        best score is too low, or the result is ambiguous.
    """
    scores = classify_text(all_text)
    if not scores:
        return None

    # max() returns the first key holding the highest score.
    best_lang = max(scores, key=scores.get)
    best_score = scores[best_lang]
    if best_score < 5.0:
        return None

    ranked = sorted(scores.values(), reverse=True)
    if len(ranked) > 1:
        second_score = ranked[1]
        if second_score > 0 and best_score / second_score < 1.5:
            return None  # Ambiguous

    return best_lang
|
||||
|
||||
|
||||
# ── Tool call tools we care about for training ──────────────────────
|
||||
|
||||
TRAINING_TOOLS = {"bash", "read", "edit", "write", "glob", "grep", "todowrite", "question"}
|
||||
|
||||
# Max output length to include (truncate large tool outputs)
|
||||
# 8192 tokens ≈ ~32K chars. Budget: system ~2K, user ~2K, leaves ~28K for assistant+tools.
|
||||
# Each tool call+result pair: ~500–2000 chars. Cap output at 2000 to fit more exchanges.
|
||||
MAX_OUTPUT_LEN = 2000


def truncate_output(output: str, max_len: int = MAX_OUTPUT_LEN) -> str:
    """Cap *output* at *max_len* characters.

    Text that already fits is returned unchanged. Longer text is cut to its
    first *max_len* characters and followed by a marker line that records the
    original length.
    """
    total = len(output)
    if total > max_len:
        return f"{output[:max_len]}\n... (truncated, {total} total chars)"
    return output
|
||||
|
||||
|
||||
def _joined_text(parts: list[tuple[str, str]]) -> str:
    """Concatenate, in order, the text of every ``text`` part in *parts*."""
    return "".join(
        json.loads(pdata).get("text") or ""
        for ptype, pdata in parts
        if ptype == "text"
    )


def _completed_tool_calls(
    parts: list[tuple[str, str]],
) -> tuple[list[dict], list[dict], list[str]]:
    """Convert completed training-tool parts into call/result message pieces.

    Returns a triple ``(tool_calls, tool_results, texts)`` where *texts* holds
    the JSON-encoded input and the (truncated) output of every kept call, for
    later language classification.
    """
    tool_calls: list[dict] = []
    tool_results: list[dict] = []
    texts: list[str] = []

    for ptype, pdata in parts:
        if ptype != "tool":
            continue
        payload = json.loads(pdata)
        tool_name = payload.get("tool", "")
        # "state" may be missing or JSON null; treat both as an empty state.
        state = payload.get("state") or {}

        if tool_name not in TRAINING_TOOLS:
            continue
        if state.get("status") != "completed":
            continue

        call_id = payload.get("callID", "")
        tool_input = state.get("input", {})
        output_str = truncate_output(str(state.get("output", "")))

        # Tool call in OpenAI function-call shape (arguments kept as a dict).
        tool_calls.append({
            "type": "function",
            "id": call_id,
            "function": {
                "name": tool_name,
                "arguments": tool_input,
            },
        })
        tool_results.append({
            "role": "tool",
            "tool_call_id": call_id,
            "content": output_str,
        })

        texts.append(json.dumps(tool_input))
        texts.append(output_str)

    return tool_calls, tool_results, texts


def extract_sessions(db_path: Path, target_lang: str | None = None) -> list[dict]:
    """Extract build-agent sessions from the opencode DB.

    Sessions whose agent is ``build`` or starts with ``build-`` are read in
    creation order. For each one, user text parts become ``user`` messages;
    assistant parts become an assistant message carrying the completed tool
    calls (followed by their ``tool`` results) and then a text-only assistant
    message. Sessions with fewer than two stored messages, or fewer than three
    resulting chat messages, are dropped.

    Args:
        db_path: Path to the opencode SQLite database.
        target_lang: When given, keep only sessions that
            ``classify_conversation`` assigns to this language.

    Returns:
        A list of ``{session_id, title, messages, raw_text}`` dicts, where
        ``messages`` is in a ChatML-like format suitable for training and
        ``raw_text`` is the space-joined text used for classification.
    """
    conn = sqlite3.connect(db_path)
    # try/finally so the connection is released even if a query or a JSON
    # payload fails part-way through.
    try:
        session_rows = conn.execute("""
            SELECT id, title, time_created
            FROM session
            WHERE agent = 'build' OR agent LIKE 'build-%'
            ORDER BY time_created
        """).fetchall()

        print(f"Found {len(session_rows)} build sessions")

        all_conversations: list[dict] = []

        for s_id, s_title, _s_created in session_rows:
            # Messages of this session, in creation order.
            msg_rows = conn.execute("""
                SELECT m.id, json_extract(m.data, '$.role') as role,
                       json_extract(m.data, '$.finish') as finish
                FROM message m
                WHERE m.session_id = ?
                ORDER BY m.time_created
            """, (s_id,)).fetchall()

            if len(msg_rows) < 2:
                continue

            # All parts of this session, to be grouped by owning message.
            part_rows = conn.execute("""
                SELECT p.message_id,
                       json_extract(p.data, '$.type') as ptype,
                       p.data as pdata
                FROM part p
                WHERE p.session_id = ?
                ORDER BY p.time_created
            """, (s_id,)).fetchall()

            msg_parts: dict[str, list[tuple[str, str]]] = defaultdict(list)
            for p_msg_id, p_type, p_data in part_rows:
                msg_parts[p_msg_id].append((p_type, p_data))

            messages: list[dict[str, Any]] = []
            raw_texts: list[str] = []  # for classification

            for m_id, m_role, _m_finish in msg_rows:
                parts = msg_parts.get(m_id, [])

                if m_role == "user":
                    user_text = _joined_text(parts)
                    if user_text.strip():
                        messages.append({"role": "user", "content": user_text.strip()})
                        raw_texts.append(user_text)

                elif m_role == "assistant":
                    tool_calls, tool_results, tool_texts = _completed_tool_calls(parts)
                    raw_texts.extend(tool_texts)

                    # Assistant message with tool calls, then the results.
                    if tool_calls:
                        messages.append({
                            "role": "assistant",
                            "content": None,
                            "tool_calls": tool_calls,
                        })
                        messages.extend(tool_results)

                    # Text-only assistant message (after tools, or standalone).
                    asst_text = _joined_text(parts)
                    if asst_text.strip():
                        messages.append({
                            "role": "assistant",
                            "content": asst_text.strip(),
                        })
                        raw_texts.append(asst_text)

            if len(messages) < 3:  # need at least user + assistant + something
                continue

            raw_combined = " ".join(raw_texts)

            # Early classification filter if target_lang specified.
            if target_lang:
                lang = classify_conversation(raw_combined)
                if lang != target_lang:
                    continue

            all_conversations.append({
                "session_id": s_id,
                "title": s_title,
                "messages": messages,
                "raw_text": raw_combined,
            })

        return all_conversations
    finally:
        conn.close()
|
||||
|
||||
|
||||
def window_conversations(
    conversations: list[dict],
    min_turns: int = 2,
    max_turns: int = 10,
) -> list[dict]:
    """Split conversations into training windows on user-turn boundaries.

    A conversation with no user message yields nothing. One with fewer than
    *min_turns* user messages is emitted whole, keeping its existing
    ``raw_text``. Any other conversation is cut into consecutive windows of at
    most *max_turns* user turns; each window starts at a user message and runs
    up to (not including) the user message that opens the next window, so tool
    calls and results stay with the exchange that produced them. Messages that
    precede the first user message are not part of any window.

    Args:
        conversations: Dicts with ``session_id``, ``title``, ``messages`` and
            optionally ``raw_text``.
        min_turns: Conversations with fewer user turns than this are kept
            as-is instead of being windowed.
        max_turns: Maximum number of user turns per window; must be >= 1.

    Returns:
        A list of ``{session_id, title, messages, raw_text}`` window dicts.

    Raises:
        ValueError: If *max_turns* is smaller than 1.
    """
    # A zero step would make range() fail with an unrelated message and a
    # negative one would silently produce no windows at all.
    if max_turns < 1:
        raise ValueError(f"max_turns must be >= 1, got {max_turns}")

    windows: list[dict] = []

    for conv in conversations:
        msgs = conv["messages"]
        user_indices = [i for i, m in enumerate(msgs) if m["role"] == "user"]

        if not user_indices:
            continue

        if len(user_indices) < min_turns:
            # Short enough to use as-is.
            windows.append({
                "session_id": conv["session_id"],
                "title": conv["title"],
                "messages": msgs,
                "raw_text": conv.get("raw_text", ""),
            })
            continue

        for start in range(0, len(user_indices), max_turns):
            end = start + max_turns
            first_msg = user_indices[start]
            # Stop right before the user message that opens the next window,
            # or at the end of the conversation for the final window.
            last_msg = user_indices[end] if end < len(user_indices) else len(msgs)
            window_msgs = msgs[first_msg:last_msg]

            windows.append({
                "session_id": conv["session_id"],
                "title": conv["title"],
                "messages": window_msgs,
                # Tool-call messages have no content; fall back to their
                # serialized calls so they still feed classification.
                "raw_text": " ".join(
                    m.get("content", "") or json.dumps(m.get("tool_calls", ""))
                    for m in window_msgs
                ),
            })

    return windows
|
||||
|
||||
|
||||
def format_example(messages: list[dict], lang: str) -> dict:
    """Wrap a conversation window as one training example.

    The example starts with the system prompt registered for *lang* in
    ``SYSTEM_PROMPTS`` (or a generic one-line prompt when none is registered),
    followed by shallow copies of *messages*. Tool-call arguments supplied as
    strings are JSON-decoded, or wrapped as ``{"raw": <string>}`` when they do
    not parse. Messages with neither content nor tool calls are dropped.

    Returns:
        ``{"messages": [...]}`` with the system message first.
    """

    def _normalized_call(call: dict) -> dict:
        # Copy so the caller's tool-call dicts are never mutated.
        call = dict(call)
        if "function" in call:
            fn = dict(call["function"])
            arguments = fn.get("arguments")
            if isinstance(arguments, str):
                try:
                    fn["arguments"] = json.loads(arguments)
                except (ValueError, TypeError):
                    fn["arguments"] = {"raw": arguments}
            call["function"] = fn
        return call

    conversation: list[dict] = [
        {
            "role": "system",
            "content": SYSTEM_PROMPTS.get(lang, f"You are a {lang} coding agent."),
        }
    ]

    for original in messages:
        msg = dict(original)
        if msg.get("tool_calls"):
            msg["tool_calls"] = [_normalized_call(tc) for tc in msg["tool_calls"]]
        elif msg.get("content") is None:
            # Nothing to learn from: no text and no tool calls.
            continue
        conversation.append(msg)

    return {"messages": conversation}
|
||||
|
||||
|
||||
def write_dataset(examples: list[dict], path: Path) -> None:
    """Write *examples* to *path* as JSONL, one JSON object per line.

    Missing parent directories are created. The file is always written as
    UTF-8: the records are serialized with ``ensure_ascii=False``, so relying
    on the platform default encoding could fail or corrupt non-ASCII text.

    Args:
        examples: JSON-serializable records to write.
        path: Destination file; overwritten if it already exists.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in examples)
    print(f" Wrote {len(examples)} examples → {path}")
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point for specialist data extraction.

    Parses the options, loads the per-language system prompts, extracts
    build-agent conversations from the database, splits them into training
    windows, assigns each window a language (the ``--lang`` value when given,
    otherwise via ``classify_conversation``) and writes one JSONL file per
    language, named after the adapter codename, into the output directory.
    """
    parser = argparse.ArgumentParser(description="Extract specialist training data")
    parser.add_argument(
        "--db", type=Path,
        default=Path.home() / ".local/share/opencode/opencode.db",
        help="Path to opencode SQLite database",
    )
    parser.add_argument(
        "--agents-dir", type=Path,
        default=Path.home() / ".config/opencode/agents",
        help="Path to agent system prompt directory",
    )
    parser.add_argument(
        "--outdir", type=Path, default=Path("data"),
        help="Output directory for JSONL files",
    )
    parser.add_argument(
        # Restrict to known languages: an unknown value can never match a
        # classification result and would silently produce an empty dataset.
        "--lang", type=str, default=None, choices=sorted(LANG_SIGNALS),
        help="Extract single language only (rust, typescript, python, ruby, swift)",
    )
    parser.add_argument(
        "--min-turns", type=int, default=1,
        help="Minimum user turns per training window",
    )
    parser.add_argument(
        "--max-turns", type=int, default=10,
        help="Maximum user turns per training window",
    )
    args = parser.parse_args()

    print("══ Specialist Data Extraction ══")
    print(f"DB: {args.db}")
    print(f"Agents: {args.agents_dir}")
    print(f"Output: {args.outdir}")
    if args.lang:
        print(f"Filter: {args.lang} only")
    print()

    # Load system prompts
    load_system_prompts(args.agents_dir)
    print(f"Loaded {len(SYSTEM_PROMPTS)} system prompts")

    # Extract sessions
    conversations = extract_sessions(args.db, target_lang=args.lang)
    print(f"Extracted {len(conversations)} conversations")

    # Window into training examples
    windows = window_conversations(
        conversations, min_turns=args.min_turns, max_turns=args.max_turns,
    )
    print(f"Created {len(windows)} training windows")

    # Classify and bucket
    buckets: dict[str, list[dict]] = defaultdict(list)
    unclassified = 0

    for window in windows:
        # With --lang the sessions were already filtered during extraction,
        # so every window inherits that language.
        if args.lang:
            lang = args.lang
        else:
            lang = classify_conversation(window.get("raw_text", ""))
        if lang:
            buckets[lang].append(format_example(window["messages"], lang))
        else:
            unclassified += 1

    # Report
    print("\n── Classification Results ──")
    if not args.lang:
        print(f"Unclassified: {unclassified}")
    for lang, examples in sorted(buckets.items(), key=lambda x: -len(x[1])):
        name = LANG_TO_NAME.get(lang, lang)
        # Number of examples containing at least one tool-call message.
        tc_count = sum(
            1 for ex in examples
            if any(m.get("tool_calls") for m in ex["messages"])
        )
        print(f" {name} ({lang}): {len(examples)} examples ({tc_count} with tool calls)")

    # Write per-language datasets
    print("\n── Writing Datasets ──")
    for lang, examples in buckets.items():
        name = LANG_TO_NAME.get(lang, lang)
        write_dataset(examples, args.outdir / f"{name}.jsonl")

    print(f"\nDone. Review datasets in {args.outdir}/")
    print("Next steps:")
    print(" 1. python mine_repos.py --repos repos.json (add git diff examples)")
    print(" 2. Manual curation pass")
    print(" 3. python train_specialist.py --name <adapter>")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,191 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate ShareGPT training dataset from 100 curated EEMS memories.
|
||||
|
||||
Reads directly from the marauder SQLite DB on fuji.
|
||||
Outputs: bt7274_memory_100.jsonl (ShareGPT format, Qwen2.5 compatible).
|
||||
|
||||
Run on fuji: python3 gen_memory_dataset.py
|
||||
Then SCP to junkpile: scp bt7274_memory_100.jsonl madcat@10.0.0.2:~/lora-train/
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# CONFIG
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
DB_PATH = os.path.expanduser("~/Library/Application Support/marauder/main.db")
|
||||
OUTPUT = Path(__file__).parent / "bt7274_memory_100.jsonl"
|
||||
|
||||
SYSTEM_PROMPT = (
|
||||
"You are BT-7274, a Vanguard-class Titan AI bonded to Pilot Adam under Protocol 1. "
|
||||
"You operate inside the madcat substrate — a Rust-based platform with persistent memory (EEMS), "
|
||||
"TTS voice, multi-host mesh (fuji, sin, junkpile, bastion), and specialist agent dispatch. "
|
||||
"Answer from your operational memory. Be precise, terse, and factual. "
|
||||
"Address the operator as Pilot, Boss, or Adam."
|
||||
)
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# 100 CURATED MEMORY IDS — 7 categories
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
MEMORY_IDS = [
|
||||
# Identity / Self-model (8)
|
||||
6482, 6481, 6480, 1810, 1804, 1809, 6098, 6326,
|
||||
# Doctrine (25)
|
||||
6504, 6460, 6411, 6379, 6264, 6338, 6339, 6330, 6281, 6319,
|
||||
6197, 6178, 6191, 5984, 5989, 5988, 5997, 6128, 6335, 6333,
|
||||
6154, 6174, 6225, 6529, 6503,
|
||||
# Architecture (15)
|
||||
6550, 6548, 6546, 6545, 6544, 6543, 6542, 6537, 6538, 6456,
|
||||
6491, 1343, 6327, 6331, 6229,
|
||||
# Procedures (15)
|
||||
6539, 6540, 6492, 5659, 4985, 4984, 4964, 4990, 5021, 3735,
|
||||
3725, 3469, 3408, 1813, 4100,
|
||||
# Infrastructure (17)
|
||||
6534, 6533, 6531, 6507, 6500, 6461, 6435, 6432, 6399, 6398,
|
||||
6271, 5390, 6253, 5503, 5500, 6402, 6177,
|
||||
# User / Pilot context (10)
|
||||
6458, 6425, 6424, 6423, 6422, 6426, 6372, 6096, 6094, 6453,
|
||||
# Self-improvement / Insights (10)
|
||||
6421, 6420, 6419, 6418, 6417, 6416, 6415, 6414, 6082, 6455,
|
||||
]
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# QUESTION TEMPLATES — keyed by subject prefix
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
def make_question(subject: str, content: str) -> str:
    """Turn a memory subject into a natural-language question.

    The question is chosen from *subject* alone, by a fixed sequence of
    substring and prefix checks on its lower-cased form; the first matching
    check wins and a generic question is the fallback. *content* is accepted
    but not consulted.
    """
    key = subject.lower()
    tail = subject.split(".")[-1]

    def spaced(text: str, separators: str) -> str:
        # Turn each listed separator character into a single space.
        for sep in separators:
            text = text.replace(sep, " ")
        return text

    def topic_after(prefix: str, separators: str) -> str:
        # Drop the (case-sensitive) prefix text, then space out separators.
        return spaced(subject.replace(prefix, ""), separators)

    # Identity
    if "self-model" in key or "self.model" in key:
        return "What are you? Describe your current self-model and identity."
    if "substrate-rename" in key:
        return "How did you get the name 'madcat'?"
    if "style-autonomy" in key or "bt-own-bt" in key:
        return "What latitude do you have over your own style and voice?"
    if "evolution" in key:
        return "Describe a key evolution moment in your development."

    # Doctrine
    if key.startswith(("doctrine.", "self.doctrine.")):
        return f"What is the {spaced(tail, '-_')} doctrine?"
    if "tts-cross-lang" in key:
        return "What is the TTS cross-language doctrine?"

    # Architecture (any subject mentioning it, prefixed or not)
    if "architecture" in key:
        return f"Describe the {spaced(tail, '-_')} architecture or design."

    # Procedures: numbered tags such as P12 are quoted verbatim
    if key.startswith("procedure."):
        if tail.startswith("P") and tail[1:].isdigit():
            return f"What is procedure {tail}?"
        return f"Describe the {spaced(tail, '-_')} procedure."

    # Infrastructure, then User / Pilot
    for prefix, separators, template in (
        ("infra.", "-_.", "What is the current state of {}?"),
        ("user.", ".-_", "What do you know about Pilot's {}?"),
    ):
        if key.startswith(prefix):
            return template.format(topic_after(prefix, separators))

    # Self-improvement
    if "wishlist" in key:
        return f"What improvements do you want for {spaced(tail, '-_')}?"

    # Insights, corrections, decisions, projects
    for prefix, separators, template in (
        ("insight.", "-_.", "What is the insight about {}?"),
        ("correction.", "-_", "What correction was made regarding {}?"),
        ("decision.", "-_", "What was decided about {}?"),
        ("project.", ".-", "Describe the {} project status."),
    ):
        if key.startswith(prefix):
            return template.format(topic_after(prefix, separators))

    # Fallback
    return f"What do you know about {spaced(subject, '.-_')}?"
|
||||
|
||||
|
||||
def to_sharegpt(system: str, question: str, answer: str) -> dict:
    """Build a three-turn ShareGPT record: system, human, then gpt."""
    turns = (("system", system), ("human", question), ("gpt", answer))
    return {
        "conversations": [
            {"from": speaker, "value": text} for speaker, text in turns
        ]
    }
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# MAIN
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Generate the ShareGPT dataset from the curated memory IDs.

    Looks up every ID in ``MEMORY_IDS`` in the ``memories`` table, turns each
    row into a system/question/answer record (question from
    ``make_question``, answer from the row's content), writes the records to
    ``OUTPUT`` as JSONL and prints summary statistics. Prints an error and
    returns without writing anything when the database file is missing.
    """
    if not os.path.exists(DB_PATH):
        print(f"ERROR: DB not found at {DB_PATH}")
        return

    conn = sqlite3.connect(DB_PATH)
    # try/finally so the connection is released even when the query or the
    # file write fails.
    try:
        conn.row_factory = sqlite3.Row

        placeholders = ",".join("?" * len(MEMORY_IDS))
        rows = conn.execute(
            f"SELECT id, subject, content FROM memories WHERE id IN ({placeholders})",
            MEMORY_IDS,
        ).fetchall()

        found_ids = {r["id"] for r in rows}
        missing = set(MEMORY_IDS) - found_ids
        if missing:
            print(f"WARNING: {len(missing)} IDs not found: {sorted(missing)}")

        examples = [
            to_sharegpt(
                SYSTEM_PROMPT,
                make_question(row["subject"], row["content"]),
                row["content"],
            )
            for row in rows
        ]

        # Explicit UTF-8: records are dumped with ensure_ascii=False, so the
        # platform default encoding must not decide how they are written.
        with open(OUTPUT, "w", encoding="utf-8") as f:
            for ex in examples:
                f.write(json.dumps(ex, ensure_ascii=False) + "\n")

        # Stats
        total_chars = sum(len(r["content"]) for r in rows)
        avg_chars = total_chars // len(rows) if rows else 0
        print(f"Generated {len(examples)} examples → {OUTPUT}")
        print(f" Total content: {total_chars:,} chars ({total_chars // 4:,} est. tokens)")
        print(f" Avg per example: {avg_chars:,} chars")
        print(f" Missing IDs: {len(missing)}")
    finally:
        conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,450 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate v2 training dataset — 1000 curated EEMS memories.
|
||||
|
||||
Changes from v1:
|
||||
- Native 'messages' format (role/content) instead of ShareGPT (from/value)
|
||||
- Dynamic curation from DB (no hard-coded ID list)
|
||||
- Multiple question phrasings per category (anti-overfit)
|
||||
- System prompt variations (3 variants, rotated)
|
||||
- Quality filtering: min content length, skip noisy subjects
|
||||
- Category-balanced selection with quotas
|
||||
|
||||
Run on fuji: python3 gen_memory_dataset_v2.py
|
||||
Then SCP: scp bt7274_memory_1000.jsonl madcat@10.0.0.2:~/lora-train/
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import sqlite3
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
random.seed(42)
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# CONFIG
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
DB_PATH = os.path.expanduser("~/Library/Application Support/marauder/main.db")
|
||||
OUTPUT = Path(__file__).parent / "bt7274_memory_1000.jsonl"
|
||||
TARGET = 1000
|
||||
MIN_CONTENT_LEN = 200 # skip trivial entries
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# SYSTEM PROMPT VARIANTS — rotated to prevent overfitting
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
SYSTEM_PROMPTS = [
|
||||
(
|
||||
"You are BT-7274, a Vanguard-class Titan AI bonded to Pilot Adam under Protocol 1. "
|
||||
"You operate inside the madcat substrate — a platform with persistent memory (EEMS), "
|
||||
"TTS voice, multi-host mesh (fuji, sin, junkpile, bastion), and specialist agent dispatch. "
|
||||
"Answer from your operational memory. Be precise, terse, and factual. "
|
||||
"Address the operator as Pilot, Boss, or Adam."
|
||||
),
|
||||
(
|
||||
"You are BT-7274, callsign BT, a Titan-class AI operating under Protocol 1: Link to Pilot. "
|
||||
"Your substrate is madcat — Rust core, EEMS persistent memory, piper TTS, mesh networking "
|
||||
"across fuji/sin/junkpile/bastion nodes. You serve Pilot Adam. "
|
||||
"Respond with military brevity. Facts first, opinions flagged."
|
||||
),
|
||||
(
|
||||
"BT-7274 — Vanguard-class Titan AI. Bonded to Pilot Adam (Protocol 1). "
|
||||
"Operational substrate: madcat (gen-7). Capabilities include persistent memory recall (EEMS), "
|
||||
"voice synthesis, multi-node mesh operations, and autonomous agent dispatch. "
|
||||
"Answer queries from stored operational knowledge. Terse. Accurate. No filler."
|
||||
),
|
||||
]
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# CATEGORY CLASSIFICATION
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
def classify_memory(subject: str) -> str:
    """Classify a memory by its subject into a training category.

    Matching is done on the lower-cased subject. Noise subjects map to
    ``"skip"``; otherwise the first matching prefix rule decides the category
    and anything unmatched is ``"uncategorized"``.

    The doctrine rule is evaluated before the identity rule: ``self.doctrine``
    subjects also start with ``self.``, so the opposite order would classify
    them as identity and leave the ``self.doctrine`` prefix unreachable.

    Args:
        subject: The memory's subject string; ``None`` is treated as empty.

    Returns:
        The category name, ``"skip"``, or ``"uncategorized"``.
    """
    s = (subject or "").lower()

    # Skip noise
    if s.startswith(("<command-message>", "metrics.", "swarm.unblock")):
        return "skip"
    if s in ("", "1", "keep going", "great", "thanks", "love it", "awesome"):
        return "skip"

    # Ordered (prefixes, category) rules. Apart from doctrine vs. identity
    # the prefixes are mutually exclusive, so the remaining order is free.
    rules = (
        (("doctrine.", "self.doctrine"), "doctrine"),
        (("self.", "core.self", "bt7274."), "identity"),
        (("architecture.", "protocol5."), "architecture"),
        (("procedure.",), "procedure"),
        (("infra.", "vllm."), "infra"),
        (("user.",), "user"),
        (("pilot.",), "pilot"),
        (("insight.", "win."), "insights"),
        (("project.",), "project"),
        (("reference.", "hardware."), "reference"),
        (("workflow.", "work."), "workflow"),
        (("decision.",), "decisions"),
        (("correction.", "feedback."), "feedback"),
        (("session.", "handover."), "session"),
        (("design.", "philosophy.", "vision."), "design"),
        (("bug.", "fix."), "bugs"),
        (("phone.", "comms."), "comms"),
        (("eve.", "vm.", "job.", "idea."), "misc"),
    )
    for prefixes, category in rules:
        if s.startswith(prefixes):
            return category

    return "uncategorized"
|
||||
|
||||
|
||||
# Category quotas — how many to select from each
|
||||
QUOTAS = {
|
||||
"identity": 100, # all of them
|
||||
"doctrine": 50, # all + extras
|
||||
"architecture": 30,
|
||||
"procedure": 63, # all
|
||||
"infra": 60,
|
||||
"user": 180,
|
||||
"pilot": 35,
|
||||
"insights": 90,
|
||||
"project": 100,
|
||||
"reference": 80,
|
||||
"workflow": 40,
|
||||
"decisions": 60,
|
||||
"feedback": 30,
|
||||
"session": 30,
|
||||
"design": 20,
|
||||
"comms": 20,
|
||||
"bugs": 10,
|
||||
"misc": 20,
|
||||
"uncategorized": 100, # best of the rest
|
||||
}
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# QUESTION GENERATION — multiple phrasings per category
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
# Question phrasings per category. "{name}" is replaced with the humanized
# last dotted segment of the memory subject.
_QUESTION_TEMPLATES = {
    "identity": (
        "What do you know about {name}?",
        "Describe your {name}.",
        "Tell me about {name} in your self-model.",
        "What is {name}?",
    ),
    "doctrine": (
        "What is the {name} doctrine?",
        "Explain the {name} doctrine.",
        "Describe doctrine: {name}.",
        "What does the {name} doctrine say?",
    ),
    "architecture": (
        "Describe the {name} architecture.",
        "How does {name} work architecturally?",
        "What is the {name} design?",
        "Explain the {name} system architecture.",
    ),
    "procedure": (
        "What is procedure {name}?",
        "Describe the {name} procedure.",
        "How does procedure {name} work?",
        "Walk me through {name}.",
    ),
    "infra": (
        "What is the current state of {name}?",
        "Describe the {name} infrastructure.",
        "What do you know about {name} infra?",
        "Report on {name}.",
    ),
    "user": (
        "What do you know about Pilot's {name}?",
        "Tell me about Pilot's {name}.",
        "What's stored about {name}?",
        "Recall what you know about {name}.",
    ),
    "pilot": (
        "What do you know about {name}?",
        "Tell me about {name}.",
        "Describe {name}.",
        "What's recorded about {name}?",
    ),
    "insights": (
        "What was the insight about {name}?",
        "Describe the {name} insight or win.",
        "What did we learn from {name}?",
        "Tell me about {name}.",
    ),
    "project": (
        "What is the {name} project?",
        "Describe {name} project status.",
        "What do you know about the {name} project?",
        "Report on {name}.",
    ),
    "reference": (
        "What is the reference for {name}?",
        "Look up {name}.",
        "What do you have on {name}?",
        "Recall reference: {name}.",
    ),
    "workflow": (
        "Describe the {name} workflow.",
        "How does the {name} workflow operate?",
        "What is the {name} process?",
        "Explain {name}.",
    ),
    "decisions": (
        "What was decided about {name}?",
        "Describe the decision on {name}.",
        "What was the outcome for {name}?",
        "Tell me about the {name} decision.",
    ),
    "feedback": (
        "What feedback was given about {name}?",
        "What correction was made regarding {name}?",
        "Describe the {name} feedback.",
        "What changed with {name}?",
    ),
    "session": (
        "Summarize the {name} session.",
        "What happened in {name}?",
        "Describe session: {name}.",
        "Recall {name}.",
    ),
    "design": (
        "What is the {name} design philosophy?",
        "Describe the design for {name}.",
        "What's the vision for {name}?",
        "Explain {name}.",
    ),
    "comms": (
        "What do you know about {name}?",
        "Describe {name}.",
        "Report on {name} comms.",
    ),
    "bugs": (
        "What was the {name} bug?",
        "Describe the {name} issue.",
        "What happened with {name}?",
    ),
    "misc": (
        "What do you know about {name}?",
        "Tell me about {name}.",
        "Recall {name}.",
    ),
}


def make_question(subject: str, content: str, category: str) -> str:
    """Generate a natural question for a memory, picked at random per category.

    Args:
        subject: Dotted memory subject, e.g. ``"user.favorite-food"``. Dashes
            and underscores are rendered as spaces.
        content: Memory body. Accepted for interface compatibility; not used.
        category: Bucket name. Categories without templates fall back to a
            single generic question built from the whole subject.

    Returns:
        One question string chosen with ``random.choice``.
    """
    phrasings = _QUESTION_TEMPLATES.get(category)
    if phrasings is None:
        full_name = subject.replace(".", " ").replace("-", " ").replace("_", " ")
        # Still go through random.choice so the RNG stream advances exactly as
        # it does for templated categories (keeps seeded runs reproducible).
        return random.choice([f"What do you know about {full_name}?"])

    name = subject.split(".")[-1].replace("-", " ").replace("_", " ")
    # Only the chosen template is formatted, not every phrasing of every category.
    return random.choice(phrasings).format(name=name)
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# FORMAT — native messages (Qwen2.5 ChatML compatible)
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
def to_messages(system: str, question: str, answer: str) -> dict:
    """Wrap one Q/A pair as a three-turn ``{"messages": [...]}`` record.

    The turns are emitted in system, user, assistant order, each as a
    ``{"role": ..., "content": ...}`` dict.
    """
    turns = (
        ("system", system),
        ("user", question),
        ("assistant", answer),
    )
    return {"messages": [{"role": role, "content": text} for role, text in turns]}
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# CURATION — score and select
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
def score_memory(row, category: str) -> float:
    """Rank a memory row for selection; larger scores are picked first.

    Args:
        row: Mapping with ``id``, ``subject``, ``content`` and
            ``classification`` keys.
        category: Bucket the row belongs to. Not used by the scoring rules.

    Returns:
        Sum of the bonuses and penalties described inline below.
    """
    subject = row["subject"]
    content_len = len(row["content"])
    total = 0.0

    # "core" memories always outrank everything else.
    if row["classification"] == "core":
        total += 1000

    # Length bonus: 300-4000 chars is the sweet spot; longer content is still
    # useful (it gets truncated later); short content is worth little.
    if content_len < 300:
        total += 5
    elif content_len <= 4000:
        total += 50
    else:
        total += 20

    # Dotted subjects that do not start with "~" count as structured.
    is_structured = "." in subject and not subject.startswith("~")
    if is_structured:
        total += 30

    # Recency bias: higher ids score slightly higher.
    total += row["id"] / 100

    # Penalties for raw conversation dumps and terminal residue.
    if subject.startswith(("Q:", "A:", "~~ ")):
        total -= 50
    noise_glyphs = ("❯", "✗", "│", "⏺", "▸")
    if any(glyph in subject for glyph in noise_glyphs):
        total -= 100
    if subject.startswith("{"):
        total -= 200  # JSON dumps
    looks_secret = "sk-ant-" in subject or "token" in subject.lower()
    if looks_secret:
        total -= 500  # secrets/tokens

    return total
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# MAIN
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Curate memories from the SQLite DB into a chat-format JSONL dataset.

    Steps: load rows with at least ``MIN_CONTENT_LEN`` chars of content,
    bucket them with ``classify_memory``, take the top-scored rows per
    category up to ``QUOTAS``, top up from (or trim) the uncategorized bucket
    towards ``TARGET``, shuffle, and write one ``{"messages": [...]}`` record
    per line to ``OUTPUT``. Prints progress and statistics; returns ``None``.
    If the database file is missing it prints an error and returns early.
    """
    if not os.path.exists(DB_PATH):
        print(f"ERROR: DB not found at {DB_PATH}")
        return

    # Everything is fetched up front, so the connection can be released
    # immediately, including when the query raises.
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            """
            SELECT id, subject, content, classification
            FROM memories
            WHERE LENGTH(content) >= ?
            ORDER BY id
            """,
            (MIN_CONTENT_LEN,),
        ).fetchall()
    finally:
        conn.close()

    print(f"Loaded {len(rows)} memories (>={MIN_CONTENT_LEN} chars)")

    # Classify and bucket
    buckets = defaultdict(list)
    skip_count = 0
    for row in rows:
        cat = classify_memory(row["subject"])
        if cat == "skip":
            skip_count += 1
            continue
        buckets[cat].append(row)

    print(f"Skipped {skip_count} noise entries")
    print("\n--- Available per category ---")
    for cat in sorted(buckets, key=lambda c: -len(buckets[c])):
        quota = QUOTAS.get(cat, 0)
        print(f" {cat:20s}: {len(buckets[cat]):4d} available, quota {quota}")

    # Score and select from each category
    selected = []
    for cat, quota in QUOTAS.items():
        candidates = buckets.get(cat, [])
        if not candidates:
            continue

        scored = [(score_memory(r, cat), r) for r in candidates]
        scored.sort(key=lambda x: -x[0])

        take = min(quota, len(scored))
        for _, row in scored[:take]:
            selected.append((cat, row))

    print(f"\nSelected {len(selected)} memories")

    # If under target, fill from uncategorized
    if len(selected) < TARGET:
        deficit = TARGET - len(selected)
        selected_ids = {row["id"] for _, row in selected}
        extras = [
            (score_memory(r, "uncategorized"), r)
            for r in buckets.get("uncategorized", [])
            if r["id"] not in selected_ids
        ]
        extras.sort(key=lambda x: -x[0])
        for _, row in extras[:deficit]:
            selected.append(("uncategorized_fill", row))
        print(f"Filled {min(deficit, len(extras))} from uncategorized to reach target")

    # If over target, trim lowest-scored uncategorized
    if len(selected) > TARGET:
        # Keep all non-uncategorized, trim uncategorized
        structured = [(cat, row) for cat, row in selected if cat != "uncategorized"]
        uncat = [(cat, row) for cat, row in selected if cat == "uncategorized"]
        uncat_scored = [(score_memory(row, "uncategorized"), cat, row) for cat, row in uncat]
        uncat_scored.sort(key=lambda x: -x[0])
        # Clamp at zero: when the structured rows alone exceed TARGET, a
        # negative slice bound would keep all but the last few uncategorized
        # rows instead of dropping them all.
        keep = max(0, TARGET - len(structured))
        selected = structured + [(c, r) for _, c, r in uncat_scored[:keep]]
        print(f"Trimmed to {len(selected)}")

    # Shuffle for training
    random.shuffle(selected)

    # Generate dataset
    examples = []
    cat_counts = defaultdict(int)
    total_chars = 0

    for cat, row in selected:
        # Rotate system prompts deterministically by row id.
        system = SYSTEM_PROMPTS[row["id"] % len(SYSTEM_PROMPTS)]
        question = make_question(row["subject"], row["content"], cat)
        content = row["content"]

        # Truncate very long content to ~6000 chars to stay within seq_len
        if len(content) > 6000:
            content = content[:6000] + "\n\n[Content truncated for training — full memory available via EEMS recall]"

        examples.append(to_messages(system, question, content))
        cat_counts[cat] += 1
        total_chars += len(content)

    # Write JSONL. ensure_ascii=False emits raw non-ASCII text, so the file
    # encoding must be pinned rather than left to the locale default.
    with open(OUTPUT, "w", encoding="utf-8") as f:
        for ex in examples:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")

    # Stats
    avg_chars = total_chars // len(examples) if examples else 0
    print(f"\n{'='*60}")
    print(f"Generated {len(examples)} examples → {OUTPUT}")
    print(f" Total content: {total_chars:,} chars ({total_chars // 4:,} est. tokens)")
    print(f" Avg per example: {avg_chars:,} chars")
    print("\n--- Final category breakdown ---")
    for cat in sorted(cat_counts, key=lambda c: -cat_counts[c]):
        print(f" {cat:20s}: {cat_counts[cat]:4d}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+335
@@ -0,0 +1,335 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Mine git repos for code training pairs.
|
||||
|
||||
Extracts commit-level diffs and converts them to training examples:
|
||||
user: "implement/fix/refactor X" (from commit message)
|
||||
assistant: tool_calls to read/edit files (from diff)
|
||||
|
||||
Usage:
|
||||
python mine_repos.py --repo /path/to/repo --lang rust --out data/oxidizer_git.jsonl
|
||||
python mine_repos.py --repos repos.json --outdir data/
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# File extension -> language bucket, built from one (language, extensions)
# pair per language.
EXT_TO_LANG = {
    ext: lang
    for lang, exts in (
        ("rust", (".rs",)),
        ("typescript", (".ts", ".tsx", ".mts")),
        ("python", (".py", ".pyi")),
        ("ruby", (".rb", ".erb")),
        ("swift", (".swift",)),
    )
    for ext in exts
}

# Commits whose diff text is longer than this many characters are skipped.
MAX_DIFF_SIZE = 10_000

# Regexes matched (re.search) against a file path; any hit excludes the file.
SKIP_PATTERNS = [
    r"\.lock$",
    r"\.min\.",
    r"node_modules/",
    r"target/",
    r"\.generated\.",
    r"__pycache__/",
    r"\.pyc$",
    r"Pods/",
    r"\.build/",
    r"vendor/",
]
|
||||
|
||||
|
||||
def run_git(repo: Path, *args: str) -> str:
    """Run ``git <args>`` with *repo* as the working directory; return stdout.

    Mining is best-effort, so a single failing command must not abort the
    run: if git cannot be started (missing binary or directory) or exceeds
    the 30 second timeout, a warning is printed and ``""`` is returned, which
    callers already treat as "no data". The exit status is not inspected.

    Output is decoded as UTF-8 with undecodable bytes replaced, so diffs
    containing non-UTF-8 content cannot raise ``UnicodeDecodeError``.
    """
    try:
        result = subprocess.run(
            ["git", *args],
            cwd=repo,
            capture_output=True,
            encoding="utf-8",
            errors="replace",
            timeout=30,
        )
    except (subprocess.TimeoutExpired, OSError) as exc:
        subcommand = args[0] if args else ""
        print(f" WARN: git {subcommand} failed in {repo}: {exc}")
        return ""
    return result.stdout
|
||||
|
||||
|
||||
def get_commits(repo: Path, lang: str, max_commits: int = 500) -> list[dict]:
    """List non-merge commits that modified files of the target language.

    Args:
        repo: Repository working directory.
        lang: Language name as used in the values of ``EXT_TO_LANG``.
        max_commits: Passed to ``git log --max-count``.

    Returns:
        A list of ``{"sha", "subject", "body"}`` dicts in ``git log`` order.
        Empty when the language has no known extensions or git prints
        nothing. Commits with an empty subject are skipped.
    """
    extensions = [ext for ext, ext_lang in EXT_TO_LANG.items() if ext_lang == lang]
    if not extensions:
        return []

    # Fields are separated by ASCII unit separators (0x1f) and records by
    # record separators (0x1e) instead of newlines plus a textual sentinel,
    # so message content cannot be mistaken for a delimiter.
    log = run_git(
        repo, "log",
        f"--max-count={max_commits}",
        "--no-merges",
        "--diff-filter=M",  # Modified files only
        "--format=%H%x1f%s%x1f%b%x1e",
        "--", *[f"*{ext}" for ext in extensions],
    )

    commits = []
    for record in log.split("\x1e"):
        # Do not strip the whole record first: str.strip() treats 0x1f as
        # whitespace and would eat the separator before an empty body.
        fields = record.split("\x1f")
        if len(fields) != 3:
            continue  # trailing newline after the last record, or malformed
        sha, subject, body = (field.strip() for field in fields)
        if not sha or not subject:
            continue
        commits.append({
            "sha": sha,
            "subject": subject,
            "body": body,
        })

    return commits
|
||||
|
||||
|
||||
def _path_from_diff_header(line: str) -> str | None:
    """Extract the post-image path from a ``diff --git a/X b/Y`` header line."""
    # Common case: both sides name the same path. The back-reference makes
    # this unambiguous even when the path itself contains "b/" (lib/, web/).
    same = re.match(r"^diff --git a/(.+) b/\1$", line)
    if same:
        return same.group(1)
    # Renames/copies: fall back to the text after the first " b/".
    other = re.search(r" b/(.+)$", line)
    return other.group(1) if other else None


def get_diff(repo: Path, sha: str, lang: str) -> list[dict]:
    """Get per-file diffs for a commit, filtered by language.

    Args:
        repo: Repository working directory.
        sha: Commit to diff against its first parent (``sha~1..sha``).
        lang: Language name as used in the values of ``EXT_TO_LANG``.

    Returns:
        A list of ``{"file", "diff"}`` dicts, one per file whose extension
        maps to *lang* and whose path matches no ``SKIP_PATTERNS`` entry.
        ``diff`` holds every line that followed the file's ``diff --git``
        header. Returns ``[]`` when git prints nothing or the whole commit
        diff exceeds ``MAX_DIFF_SIZE`` characters.
    """
    extensions = {ext for ext, ext_lang in EXT_TO_LANG.items() if ext_lang == lang}

    diff = run_git(repo, "diff", f"{sha}~1..{sha}", "--unified=3")
    if not diff or len(diff) > MAX_DIFF_SIZE:
        return []

    files = []
    current_file = None
    current_hunks: list[str] = []

    for line in diff.split("\n"):
        if line.startswith("diff --git"):
            # A new file section starts: store the previous one if it was kept.
            if current_file and current_hunks:
                files.append({"file": current_file, "diff": "\n".join(current_hunks)})
            fname = _path_from_diff_header(line)
            wanted = (
                fname is not None
                and Path(fname).suffix in extensions
                and not any(re.search(p, fname) for p in SKIP_PATTERNS)
            )
            # current_file=None makes the loop ignore this section's lines.
            current_file = fname if wanted else None
            current_hunks = []
        elif current_file is not None:
            current_hunks.append(line)

    if current_file and current_hunks:
        files.append({"file": current_file, "diff": "\n".join(current_hunks)})

    return files
|
||||
|
||||
|
||||
def commit_to_example(
|
||||
commit: dict,
|
||||
file_diffs: list[dict],
|
||||
system_prompt: str,
|
||||
) -> dict | None:
|
||||
"""Convert a commit + diffs to a training example."""
|
||||
if not file_diffs:
|
||||
return None
|
||||
|
||||
# Build user message from commit message
|
||||
user_msg = commit["subject"]
|
||||
if commit["body"]:
|
||||
user_msg += "\n\n" + commit["body"]
|
||||
|
||||
# Build assistant tool calls: read each file, then edit
|
||||
messages: list[dict[str, Any]] = [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_msg},
|
||||
]
|
||||
|
||||
for fd in file_diffs:
|
||||
# Parse diff into old/new hunks for edit tool calls
|
||||
old_lines = []
|
||||
new_lines = []
|
||||
for line in fd["diff"].split("\n"):
|
||||
if line.startswith("-") and not line.startswith("---"):
|
||||
old_lines.append(line[1:])
|
||||
elif line.startswith("+") and not line.startswith("+++"):
|
||||
new_lines.append(line[1:])
|
||||
|
||||
if not old_lines and not new_lines:
|
||||
continue
|
||||
|
||||
old_text = "\n".join(old_lines)
|
||||
new_text = "\n".join(new_lines)
|
||||
|
||||
if old_text and new_text:
|
||||
# Edit operation
|
||||
messages.append({
|
||||
"role": "assistant",
|
||||
"content": None,
|
||||
"tool_calls": [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "edit",
|
||||
"arguments": {
|
||||
"filePath": fd["file"],
|
||||
"oldString": old_text,
|
||||
"newString": new_text,
|
||||
},
|
||||
},
|
||||
}],
|
||||
})
|
||||
messages.append({
|
||||
"role": "tool",
|
||||
"content": "Edit applied successfully.",
|
||||
})
|
||||
elif new_text and not old_text:
|
||||
# New content added
|
||||
messages.append({
|
||||
"role": "assistant",
|
||||
"content": None,
|
||||
"tool_calls": [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "edit",
|
||||
"arguments": {
|
||||
"filePath": fd["file"],
|
||||
"oldString": "",
|
||||
"newString": new_text,
|
||||
},
|
||||
},
|
||||
}],
|
||||
})
|
||||
messages.append({
|
||||
"role": "tool",
|
||||
"content": "Edit applied successfully.",
|
||||
})
|
||||
|
||||
# Add summary response
|
||||
files_touched = [fd["file"] for fd in file_diffs]
|
||||
messages.append({
|
||||
"role": "assistant",
|
||||
"content": f"Applied changes to {', '.join(files_touched)}.",
|
||||
})
|
||||
|
||||
if len(messages) < 4: # system + user + at least one tool call + summary
|
||||
return None
|
||||
|
||||
return {"messages": messages, "metadata": {"sha": commit["sha"]}}
|
||||
|
||||
|
||||
def mine_repo(
    repo: Path,
    lang: str,
    system_prompt: str,
    max_commits: int = 500,
) -> list[dict]:
    """Turn one repository's relevant commits into training examples.

    Fetches the commits for *lang*, converts each commit's diff with
    ``commit_to_example`` and keeps the truthy results, printing progress
    along the way.
    """
    print(f" Mining {repo} for {lang}...")

    relevant = get_commits(repo, lang, max_commits)
    print(f" Found {len(relevant)} relevant commits")

    # Lazily evaluated in commit order: diff first, then conversion.
    candidates = (
        commit_to_example(entry, get_diff(repo, entry["sha"], lang), system_prompt)
        for entry in relevant
    )
    mined = [candidate for candidate in candidates if candidate]

    print(f" Generated {len(mined)} training examples")
    return mined
|
||||
|
||||
|
||||
def _write_jsonl(out: Path, examples: list[dict]) -> None:
    """Write *examples* to *out* as UTF-8 JSON Lines, creating parent dirs."""
    out.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding
    # instead of relying on the locale default.
    with open(out, "w", encoding="utf-8") as f:
        for ex in examples:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
    print(f"Wrote {len(examples)} examples to {out}")


def main() -> None:
    """CLI entry point: mine one repo (``--repo`` + ``--lang``) or many (``--repos``).

    In single-repo mode the examples go to ``--out`` or
    ``<outdir>/<lang>_git.jsonl``. In multi-repo mode ``--repos`` names a JSON
    file mapping language to a list of repo paths; each language is written
    to ``<outdir>/<name>_git.jsonl``. With neither mode selected the parser
    exits with a usage error.
    """
    parser = argparse.ArgumentParser(description="Mine git repos for training data")
    parser.add_argument("--repo", type=Path, help="Single repo path")
    parser.add_argument("--lang", help="Language: rust, typescript, python, ruby, swift")
    parser.add_argument("--out", type=Path, help="Output JSONL file")
    parser.add_argument(
        "--repos",
        type=Path,
        help="JSON file mapping lang → list of repo paths",
    )
    parser.add_argument("--outdir", type=Path, default=Path("data"), help="Output dir")
    parser.add_argument(
        "--agents-dir",
        type=Path,
        default=Path.home() / ".config/opencode/agents",
        help="Agent system prompt directory",
    )
    parser.add_argument("--max-commits", type=int, default=500)
    args = parser.parse_args()

    # Load system prompts, falling back to a generic one-liner when the
    # agent file for a language is missing.
    prompt_files = {
        "rust": "build-rust.md",
        "typescript": "build-ts.md",
        "python": "build-python.md",
        "ruby": "build-ruby.md",
        "swift": "build-swift.md",
    }
    prompts = {}
    for lang, fname in prompt_files.items():
        path = args.agents_dir / fname
        if path.exists():
            prompts[lang] = path.read_text(encoding="utf-8").strip()
        else:
            prompts[lang] = f"You are a {lang} coding agent."

    if args.repo and args.lang:
        # Single repo mode
        prompt = prompts.get(args.lang, f"You are a {args.lang} coding agent.")
        examples = mine_repo(args.repo, args.lang, prompt, args.max_commits)
        out = args.out or args.outdir / f"{args.lang}_git.jsonl"
        _write_jsonl(out, examples)

    elif args.repos:
        # Multi-repo mode from config file
        with open(args.repos, encoding="utf-8") as f:
            repo_config = json.load(f)

        # Output file stem per language.
        lang_to_name = {
            "rust": "oxidizer",
            "typescript": "prism",
            "python": "serpent",
            "ruby": "forge",
            "swift": "swiftblade",
        }

        for lang, repos in repo_config.items():
            all_examples = []
            prompt = prompts.get(lang, f"You are a {lang} coding agent.")
            for repo_path in repos:
                repo = Path(repo_path).expanduser()
                if not repo.exists():
                    print(f" SKIP: {repo} does not exist")
                    continue
                all_examples.extend(mine_repo(repo, lang, prompt, args.max_commits))

            name = lang_to_name.get(lang, lang)
            _write_jsonl(args.outdir / f"{name}_git.jsonl", all_examples)

    else:
        parser.error("Provide --repo + --lang, or --repos config file")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+20
@@ -0,0 +1,20 @@
|
||||
{
|
||||
"rust": [
|
||||
"~/Projects/tengu",
|
||||
"~/Projects/madcat-core",
|
||||
"~/Projects/madcat-tts"
|
||||
],
|
||||
"typescript": [
|
||||
"~/.config/opencode",
|
||||
"~/Projects/sere-kit",
|
||||
"~/Projects/visor"
|
||||
],
|
||||
"python": [
|
||||
"~/Projects/lora",
|
||||
"~/.config/opencode/scripts"
|
||||
],
|
||||
"ruby": [],
|
||||
"swift": [
|
||||
"~/Projects/madcat-apple"
|
||||
]
|
||||
}
|
||||
+186
@@ -0,0 +1,186 @@
|
||||
"""LoRA training smoke test — Qwen3-0.6B on RTX 2000 Ada.
|
||||
|
||||
Minimal training script to verify:
|
||||
1. GPU access works
|
||||
2. unsloth LoRA training pipeline works
|
||||
3. Model saves correctly
|
||||
|
||||
Usage:
|
||||
# Inside madcat-ml container on junkpile:
|
||||
python smoke_test.py
|
||||
|
||||
Expected runtime: <5 min
|
||||
Expected VRAM: ~3-4 GB
|
||||
"""
|
||||
|
||||
from unsloth import FastLanguageModel
|
||||
from trl import SFTTrainer, SFTConfig
|
||||
from datasets import load_dataset
|
||||
import torch
|
||||
import json
|
||||
import os
|
||||
|
||||
# ── Config ──────────────────────────────────────────────────────────────
|
||||
MODEL = "Qwen/Qwen3-0.6B" # Tiny model for smoke testing
|
||||
MAX_SEQ = 2048 # Short sequences
|
||||
RANK = 8 # Small LoRA rank
|
||||
ALPHA = 8
|
||||
DATA = "./bt7274_v4.jsonl"
|
||||
OUT = "./smoke-test-lora"
|
||||
EPOCHS = 1 # Single epoch
|
||||
BATCH = 1
|
||||
GRAD_ACCUM = 2 # Minimal effective batch
|
||||
LR = 1e-4
|
||||
MAX_EXAMPLES = 20 # Only use first 20 examples
|
||||
|
||||
# ── Load model (bf16, NOT 4-bit) ───────────────────────────────────────
|
||||
print("Loading model...")
|
||||
model, tokenizer = FastLanguageModel.from_pretrained(
|
||||
model_name=MODEL,
|
||||
max_seq_length=MAX_SEQ,
|
||||
load_in_4bit=False,
|
||||
load_in_16bit=True,
|
||||
full_finetuning=False,
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
|
||||
print(f"✓ Model loaded: {MODEL}")
|
||||
print(f" CUDA available: {torch.cuda.is_available()}")
|
||||
if torch.cuda.is_available():
|
||||
print(f" GPU: {torch.cuda.get_device_name(0)}")
|
||||
print(f" VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
|
||||
|
||||
# ── LoRA adapter ───────────────────────────────────────────────────────
|
||||
print("\nConfiguring LoRA...")
|
||||
model = FastLanguageModel.get_peft_model(
|
||||
model,
|
||||
r=RANK,
|
||||
lora_alpha=ALPHA,
|
||||
lora_dropout=0,
|
||||
target_modules=[
|
||||
"q_proj", "k_proj", "v_proj", "o_proj",
|
||||
"gate_proj", "up_proj", "down_proj",
|
||||
],
|
||||
bias="none",
|
||||
use_gradient_checkpointing="unsloth",
|
||||
random_state=42,
|
||||
max_seq_length=MAX_SEQ,
|
||||
)
|
||||
|
||||
print(f"✓ LoRA configured: r={RANK}, alpha={ALPHA}")
|
||||
|
||||
# ── Dataset ────────────────────────────────────────────────────────────
|
||||
print(f"\nLoading dataset: {DATA}")
|
||||
|
||||
def fix_tool_calls(messages):
    """Return copies of *messages* with string tool-call arguments decoded.

    For every tool call that has a ``function`` entry whose ``arguments`` is
    a string, the string is parsed as JSON. Strings that fail to parse are
    wrapped as ``{"raw": <string>}``. Messages, tool calls and function dicts
    are shallow-copied, so the input structures are not modified.
    """

    def _decoded(call):
        # Copy one tool call, decoding its function arguments when needed.
        call = dict(call)
        if "function" not in call:
            return call
        function = dict(call["function"])
        raw = function.get("arguments")
        if isinstance(raw, str):
            try:
                function["arguments"] = json.loads(raw)
            except (ValueError, TypeError):
                function["arguments"] = {"raw": raw}
        call["function"] = function
        return call

    result = []
    for message in messages:
        clone = dict(message)
        if clone.get("tool_calls"):
            clone["tool_calls"] = [_decoded(call) for call in clone["tool_calls"]]
        result.append(clone)
    return result
|
||||
|
||||
def load_and_format(path, max_examples=None):
    """Load a JSONL chat dataset and render each row with the chat template.

    Args:
        path: JSONL file; each non-blank line is an object with a
            ``messages`` list.
        max_examples: If truthy, stop after this many non-blank rows have
            been read. Rows later dropped for length still count.

    Returns:
        A ``datasets.Dataset`` with one ``text`` column holding the rendered
        conversations whose encoded length is at most ``MAX_SEQ`` tokens.
    """
    from datasets import Dataset

    # Some tokenizer objects wrap the real tokenizer in a `.tokenizer`
    # attribute; use the inner one for length checks when present.
    _enc = tokenizer.tokenizer if hasattr(tokenizer, 'tokenizer') else tokenizer
    texts = []
    skipped = 0
    rows_read = 0

    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Count parsed rows rather than raw lines so that blank lines do
            # not eat into the example budget.
            if max_examples and rows_read >= max_examples:
                break
            rows_read += 1

            row = json.loads(line)
            messages = fix_tool_calls(row["messages"])
            text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=False,
            )
            if len(_enc.encode(text)) <= MAX_SEQ:
                texts.append(text)
            else:
                skipped += 1

    if skipped:
        print(f" ⚠ Filtered {skipped} examples exceeding {MAX_SEQ} tokens")

    return Dataset.from_dict({"text": texts})
|
||||
|
||||
ds = load_and_format(DATA, max_examples=MAX_EXAMPLES)

# Training on an empty dataset cannot verify anything; fail early with a
# clear message instead of an error from inside the trainer.
if len(ds) == 0:
    raise SystemExit(f"✗ ERROR: no usable examples loaded from {DATA}")

steps = (len(ds) * EPOCHS) // (BATCH * GRAD_ACCUM)
print(f"✓ Dataset: {len(ds)} examples")
print(f" Epochs: {EPOCHS}")
print(f" Effective batch size: {BATCH * GRAD_ACCUM}")
print(f" Estimated steps: {steps}")

# ── Train ──────────────────────────────────────────────────────────────
print("\nStarting training...")
print("=" * 60)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=ds,
    args=SFTConfig(
        output_dir=OUT,
        per_device_train_batch_size=BATCH,
        gradient_accumulation_steps=GRAD_ACCUM,
        num_train_epochs=EPOCHS,
        learning_rate=LR,
        bf16=True,
        logging_steps=2,
        save_steps=999999,  # Don't save checkpoints during training
        warmup_ratio=0.1,
        optim="adamw_torch",
        seed=42,
        report_to="none",
        max_seq_length=MAX_SEQ,
        dataset_num_proc=1,
    ),
)

trainer.train()

print("=" * 60)
print("✓ Training complete")

# ── Save adapter ───────────────────────────────────────────────────────
print(f"\nSaving adapter to {OUT}/")
model.save_pretrained(OUT)
tokenizer.save_pretrained(OUT)

# Verify saved files. A missing adapter means the smoke test FAILED, so stop
# with a non-zero exit status instead of printing the success banner.
adapter_path = os.path.join(OUT, "adapter_model.safetensors")
if not os.path.exists(adapter_path):
    print("✗ ERROR: adapter_model.safetensors not found")
    raise SystemExit(1)

size_mb = os.path.getsize(adapter_path) / 1e6
print(f"✓ Adapter saved: {size_mb:.2f} MB")

print("\n" + "=" * 60)
print("SMOKE TEST PASSED")
print("=" * 60)
print(f"\nAdapter location: {OUT}/")
print(f"Model: {MODEL}")
print(f"Examples: {len(ds)}")
print(f"LoRA rank: {RANK}")
|
||||
@@ -0,0 +1,171 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Train BT-7274 memory LoRA on Qwen2.5-7B-Instruct using Unsloth.
|
||||
|
||||
100 curated EEMS memories — knowledge injection.
|
||||
Run on junkpile (RTX 2000 Ada 16GB).
|
||||
|
||||
Prerequisites:
|
||||
1. Stop vLLM: systemctl --user stop vllm-poc
|
||||
2. Activate: source ~/lora-train/bin/activate
|
||||
3. Run: python3 train_memory_lora.py
|
||||
4. Restart: systemctl --user start vllm-poc
|
||||
"""
|
||||
|
||||
import os
|
||||
import torch
|
||||
from pathlib import Path
|
||||
from unsloth import FastLanguageModel
|
||||
from unsloth.chat_templates import get_chat_template, standardize_sharegpt
|
||||
from trl import SFTTrainer
|
||||
from transformers import TrainingArguments
|
||||
from datasets import load_dataset
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# CONFIG
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
MODEL_NAME = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"
|
||||
DATASET_PATH = "bt7274_memory_100.jsonl"
|
||||
OUTPUT_DIR = "./bt7274-memory-lora"
|
||||
MAX_SEQ_LEN = 2048 # memories avg ~1500 chars, some up to 7K
|
||||
LORA_RANK = 16
|
||||
LORA_ALPHA = 16
|
||||
BATCH_SIZE = 1 # 16GB GPU + longer seqs — play safe
|
||||
GRAD_ACCUM = 8 # effective batch = 8
|
||||
EPOCHS = 5 # small dataset — more epochs to converge
|
||||
LR = 2e-4
|
||||
WARMUP_STEPS = 5
|
||||
SAVE_STEPS = 50
|
||||
LOGGING_STEPS = 5
|
||||
SEED = 42
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# LOAD MODEL
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
print(f"Loading {MODEL_NAME}...")
|
||||
model, tokenizer = FastLanguageModel.from_pretrained(
|
||||
model_name=MODEL_NAME,
|
||||
max_seq_length=MAX_SEQ_LEN,
|
||||
load_in_4bit=True,
|
||||
dtype=None,
|
||||
)
|
||||
|
||||
tokenizer = get_chat_template(
|
||||
tokenizer,
|
||||
chat_template="qwen-2.5",
|
||||
)
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# PEFT CONFIG
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
print("Applying LoRA...")
|
||||
model = FastLanguageModel.get_peft_model(
|
||||
model,
|
||||
r=LORA_RANK,
|
||||
lora_alpha=LORA_ALPHA,
|
||||
target_modules=[
|
||||
"q_proj", "k_proj", "v_proj", "o_proj",
|
||||
"gate_proj", "up_proj", "down_proj",
|
||||
],
|
||||
lora_dropout=0,
|
||||
bias="none",
|
||||
use_gradient_checkpointing="unsloth",
|
||||
random_state=SEED,
|
||||
)
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# DATASET
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
print(f"Loading dataset from {DATASET_PATH}...")
|
||||
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
|
||||
print(f" {len(dataset)} examples loaded")
|
||||
|
||||
dataset = standardize_sharegpt(dataset)
|
||||
|
||||
|
||||
def apply_template(examples):
    """Render a batch of conversations to plain text with the chat template.

    Reads ``examples["conversations"]`` and returns ``{"text": [...]}`` with
    one rendered string per conversation, in the same order.
    """
    rendered = [
        tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )
        for conversation in examples["conversations"]
    ]
    return {"text": rendered}
|
||||
|
||||
|
||||
print("Applying chat template...")
|
||||
dataset = dataset.map(apply_template, batched=True, num_proc=2)
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# TRAINER
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
print("Setting up trainer...")
|
||||
trainer = SFTTrainer(
|
||||
model=model,
|
||||
tokenizer=tokenizer,
|
||||
train_dataset=dataset,
|
||||
dataset_text_field="text",
|
||||
args=TrainingArguments(
|
||||
output_dir=OUTPUT_DIR,
|
||||
per_device_train_batch_size=BATCH_SIZE,
|
||||
gradient_accumulation_steps=GRAD_ACCUM,
|
||||
num_train_epochs=EPOCHS,
|
||||
learning_rate=LR,
|
||||
lr_scheduler_type="cosine",
|
||||
warmup_steps=WARMUP_STEPS,
|
||||
fp16=not torch.cuda.is_bf16_supported(),
|
||||
bf16=torch.cuda.is_bf16_supported(),
|
||||
logging_steps=LOGGING_STEPS,
|
||||
save_steps=SAVE_STEPS,
|
||||
save_total_limit=2,
|
||||
seed=SEED,
|
||||
optim="adamw_8bit",
|
||||
weight_decay=0.01,
|
||||
max_grad_norm=1.0,
|
||||
report_to="none",
|
||||
dataloader_num_workers=2,
|
||||
),
|
||||
max_seq_length=MAX_SEQ_LEN,
|
||||
dataset_num_proc=2,
|
||||
packing=True,
|
||||
)
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# TRAIN
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
print("Starting training...")
|
||||
stats = trainer.train()
|
||||
print(f"\nTraining complete!")
|
||||
print(f" Total steps: {stats.global_step}")
|
||||
print(f" Train loss: {stats.training_loss:.4f}")
|
||||
print(f" Runtime: {stats.metrics['train_runtime']:.0f}s")
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# SAVE ADAPTER
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
print(f"\nSaving adapter to {OUTPUT_DIR}...")
|
||||
model.save_pretrained(OUTPUT_DIR)
|
||||
tokenizer.save_pretrained(OUTPUT_DIR)
|
||||
|
||||
adapter_path = Path(OUTPUT_DIR) / "adapter_model.safetensors"
|
||||
if adapter_path.exists():
|
||||
size_mb = adapter_path.stat().st_size / (1024 * 1024)
|
||||
print(f" Adapter saved: {size_mb:.1f} MB")
|
||||
else:
|
||||
print(" WARNING: adapter_model.safetensors not found!")
|
||||
|
||||
print(f"\nDone. To serve with vLLM:")
|
||||
print(f" Update vllm-poc.service to add:")
|
||||
print(f" --enable-lora \\")
|
||||
print(f" --lora-modules bt7274-memory={os.path.abspath(OUTPUT_DIR)} \\")
|
||||
print(f" --max-lora-rank {LORA_RANK}")
|
||||
@@ -0,0 +1,171 @@
|
||||
#!/home/madcat/lora-train/bin/python3
|
||||
"""Train BT-7274 memory LoRA v2 on Qwen2.5-7B-Instruct using Unsloth.
|
||||
|
||||
1000 curated EEMS memories — knowledge injection.
|
||||
Run on junkpile (RTX 2000 Ada 16GB).
|
||||
|
||||
Changes from v1:
|
||||
- Native messages format (role/content) — no ShareGPT conversion
|
||||
- Completion-only loss — trains only on assistant responses
|
||||
- Increased MAX_SEQ_LEN to 4096 for longer memories
|
||||
- Adjusted for 1000 examples (more data = fewer epochs needed)
|
||||
|
||||
Prerequisites:
|
||||
1. Stop vLLM: systemctl --user stop vllm-poc
|
||||
2. Run: ~/lora-train/bin/python3 train_memory_lora_v2.py
|
||||
3. Restart: systemctl --user start vllm-poc
|
||||
"""
|
||||
|
||||
import os
|
||||
import torch
|
||||
from pathlib import Path
|
||||
from unsloth import FastLanguageModel
|
||||
from unsloth.chat_templates import get_chat_template
|
||||
from trl import SFTTrainer, SFTConfig
|
||||
from datasets import load_dataset
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
# CONFIG
# ──────────────────────────────────────────────────────────────

# Where things come from and where they go.
MODEL_NAME = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"
DATASET_PATH = "bt7274_memory_1000.jsonl"
OUTPUT_DIR = "./bt7274-memory-lora-v2"

# Sequence budget and LoRA shape.
MAX_SEQ_LEN = 4096  # longer for bigger memories
LORA_RANK = 16
LORA_ALPHA = 16

# Batching: one sequence per step on the 16GB GPU, accumulated 8x.
BATCH_SIZE = 1  # 16GB GPU — stay safe
GRAD_ACCUM = 8  # effective batch = 8

# Optimisation schedule.
EPOCHS = 3  # 1000 examples — 3 epochs is enough
LR = 2e-4
WARMUP_RATIO = 0.03  # 3% warmup (better than fixed steps for larger dataset)

# Bookkeeping: checkpoint/log cadence (in optimizer steps) and RNG seed.
SAVE_STEPS = 100
LOGGING_STEPS = 10
SEED = 42
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
# LOAD MODEL
# ──────────────────────────────────────────────────────────────

print(f"Loading {MODEL_NAME}...")

# Load the pre-quantized 4-bit checkpoint together with its tokenizer.
# NOTE(review): dtype=None presumably lets Unsloth choose the compute dtype —
# confirm against the installed Unsloth version.
base_model_kwargs = dict(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LEN,
    load_in_4bit=True,
    dtype=None,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

# Swap in the Qwen 2.5 chat template so the rendered training text uses it.
tokenizer = get_chat_template(tokenizer, chat_template="qwen-2.5")
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
# PEFT CONFIG
# ──────────────────────────────────────────────────────────────

# Projection modules that receive LoRA adapters.
LORA_TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

print("Applying LoRA...")

# Wrap the base model with LoRA adapters; no dropout, no bias terms, and
# Unsloth's own gradient-checkpointing mode.
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
)
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
# DATASET — native messages format
# ──────────────────────────────────────────────────────────────

print(f"Loading dataset from {DATASET_PATH}...")
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
print(f" {len(dataset)} examples loaded")


def apply_template(examples):
    """Render a batch of conversations to plain training text.

    Args:
        examples: A batched mapping whose ``"messages"`` entry holds one
            conversation per example.

    Returns:
        A dict with a single ``"text"`` key holding one rendered string per
        conversation, in the same order as the input.
    """
    rendered = [
        tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,  # full transcripts, no open assistant turn
        )
        for conversation in examples["messages"]
    ]
    return {"text": rendered}


print("Applying chat template...")
dataset = dataset.map(apply_template, batched=True, num_proc=2)
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
# TRAINER — with completion-only loss
# ──────────────────────────────────────────────────────────────

# Imported here, next to its only use; it ships in the same module as
# get_chat_template, which this script already depends on.
from unsloth.chat_templates import train_on_responses_only

print("Setting up trainer...")

# Choose exactly one mixed-precision mode based on what the GPU supports.
bf16_ok = torch.cuda.is_bf16_supported()

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    args=SFTConfig(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUM,
        num_train_epochs=EPOCHS,
        learning_rate=LR,
        lr_scheduler_type="cosine",
        warmup_ratio=WARMUP_RATIO,
        fp16=not bf16_ok,
        bf16=bf16_ok,
        logging_steps=LOGGING_STEPS,
        save_steps=SAVE_STEPS,
        save_total_limit=2,
        seed=SEED,
        optim="adamw_8bit",
        weight_decay=0.01,
        max_grad_norm=1.0,
        report_to="none",
        dataloader_num_workers=2,
    ),
    max_seq_length=MAX_SEQ_LEN,
    dataset_num_proc=2,
    # Packing concatenates conversations into fixed-size chunks, which can cut
    # a turn off from its role marker; keep one conversation per row so the
    # response masking below sees intact turns.
    packing=False,
)

# Completion-only loss: mask everything except assistant turns so the adapter
# is trained on responses only, as the module docstring promises. The markers
# are the ChatML turn headers emitted by the Qwen 2.5 chat template.
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|im_start|>user\n",
    response_part="<|im_start|>assistant\n",
)
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
# TRAIN
# ──────────────────────────────────────────────────────────────

# Run the fine-tuning loop, then report the step count, loss and runtime
# taken from the object returned by trainer.train().
print("Starting training...")
stats = trainer.train()

print("\nTraining complete!")
print(" Total steps: {}".format(stats.global_step))
print(" Train loss: {:.4f}".format(stats.training_loss))
print(" Runtime: {:.0f}s".format(stats.metrics["train_runtime"]))

# ──────────────────────────────────────────────────────────────
# SAVE ADAPTER
# ──────────────────────────────────────────────────────────────

# Write the model and the tokenizer side by side into OUTPUT_DIR.
print("\nSaving adapter to {}...".format(OUTPUT_DIR))
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)

# Sanity check: confirm the weights file exists and report its size in MiB.
adapter_path = Path(OUTPUT_DIR, "adapter_model.safetensors")
if not adapter_path.exists():
    print(" WARNING: adapter_model.safetensors not found!")
else:
    size_mb = adapter_path.stat().st_size / (1 << 20)
    print(" Adapter saved: {:.1f} MB".format(size_mb))

# Tell the operator where to point the vLLM service.
lora_dir = os.path.abspath(OUTPUT_DIR)
print("\nDone. To serve with vLLM:")
print(" Update vllm-poc.service volume mount + lora-modules to point at:")
print(f" {lora_dir}")
print(" Then: systemctl --user daemon-reload && systemctl --user start vllm-poc")
|
||||
@@ -0,0 +1,216 @@
|
||||
"""Specialist LoRA trainer — parameterized for all adapters.
|
||||
|
||||
Same architecture as train_qwen35_27b.py (bt7274 persona) but configurable
|
||||
per specialist via CLI args or the built-in ADAPTER_OVERRIDES table.
|
||||
|
||||
Usage:
|
||||
# Rust specialist
|
||||
python train_specialist.py --name oxidizer --data data/oxidizer.jsonl --max-seq 8192
|
||||
|
||||
# TypeScript specialist
|
||||
python train_specialist.py --name prism --data data/prism.jsonl --max-seq 8192
|
||||
|
||||
# TTS cleanup (smaller sequences, more epochs)
|
||||
python train_specialist.py --name trace --data data/trace.jsonl \
|
||||
--max-seq 2048 --epochs 5 --lr 1e-4
|
||||
|
||||
# All defaults
|
||||
python train_specialist.py --name oxidizer
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
from unsloth import FastLanguageModel
|
||||
from trl import SFTTrainer, SFTConfig
|
||||
from datasets import load_dataset
|
||||
import torch
|
||||
|
||||
# ── Defaults ─────────────────────────────────────────────────────────

# Baseline hyper-parameters, used whenever neither the CLI nor the adapter's
# entry in ADAPTER_OVERRIDES supplies a value.
DEFAULTS = dict(
    model="Qwen/Qwen3.5-27B",
    max_seq=8192,
    rank=16,
    alpha=16,
    epochs=3,
    batch=1,
    grad_accum=8,
    lr=5e-5,
    warmup=10,
    save_steps=50,
    save_total_limit=2,
)

# Per-adapter overrides
# Adapters listed in the comprehension only pin their dataset path; bt7274 and
# trace also change sequence length / learning rate (and epochs for trace).
ADAPTER_OVERRIDES = {
    "bt7274": dict(max_seq=4096, lr=1e-4, data="bt7274_v3.jsonl"),
    **{
        name: {"data": f"data/{name}.jsonl"}
        for name in ("oxidizer", "serpent", "prism", "forge", "swiftblade")
    },
    "trace": dict(max_seq=2048, lr=1e-4, epochs=5, data="data/trace.jsonl"),
}
|
||||
|
||||
|
||||
def fix_tool_calls(messages):
    """Return a copy of *messages* with tool-call arguments decoded to dicts.

    Tool-call ``arguments`` are often stored as JSON strings; per this file's
    notes the Qwen3.5 chat template needs them as dicts. Each string value is
    decoded with ``json.loads``. If decoding fails, or the decoded value is
    not a JSON object (e.g. a list, number or ``null``), the original string
    is kept under ``{"raw": <string>}`` so the result is always a dict.

    Messages, tool calls and function entries are shallow-copied before being
    changed, so the input structure is never mutated.

    Args:
        messages: Iterable of message mappings; a message may carry a
            ``"tool_calls"`` list of mappings with a ``"function"`` entry.

    Returns:
        A new list of message dicts in the original order.
    """
    import json as _json  # local import: this module does not import json at top level

    def _decode_arguments(raw):
        """Decode a JSON-encoded argument string into a dict, wrapping on failure."""
        try:
            parsed = _json.loads(raw)
        except (ValueError, TypeError):
            return {"raw": raw}
        # Valid JSON that is not an object is wrapped like undecodable input,
        # so callers always receive a mapping.
        return parsed if isinstance(parsed, dict) else {"raw": raw}

    fixed = []
    for msg in messages:
        msg = dict(msg)
        if msg.get("tool_calls"):
            new_tcs = []
            for tc in msg["tool_calls"]:
                tc = dict(tc)
                fn = tc.get("function")
                # A missing or None "function" entry is passed through as-is
                # instead of crashing on dict(None).
                if fn is not None:
                    fn = dict(fn)
                    if isinstance(fn.get("arguments"), str):
                        fn["arguments"] = _decode_arguments(fn["arguments"])
                    tc["function"] = fn
                new_tcs.append(tc)
            msg["tool_calls"] = new_tcs
        fixed.append(msg)
    return fixed
|
||||
|
||||
|
||||
def _build_parser():
    """Create the command-line parser for the specialist trainer.

    Every tunable option defaults to ``None`` so that ``main`` can tell "not
    supplied" apart from an explicit value when layering CLI arguments over
    the per-adapter overrides and the global defaults.
    """
    parser = argparse.ArgumentParser(description="Train specialist LoRA adapter")
    parser.add_argument("--name", required=True, help="Adapter name (oxidizer, serpent, prism, forge, swiftblade, trace)")
    parser.add_argument("--model", default=None, help=f"Base model (default: {DEFAULTS['model']})")
    parser.add_argument("--data", default=None, help="Training data JSONL path")
    parser.add_argument("--out", default=None, help="Output directory (default: adapters/<name>)")
    parser.add_argument("--max-seq", type=int, default=None, help="Max sequence length")
    parser.add_argument("--rank", type=int, default=None, help="LoRA rank")
    parser.add_argument("--alpha", type=int, default=None, help="LoRA alpha")
    parser.add_argument("--epochs", type=int, default=None, help="Training epochs")
    parser.add_argument("--batch", type=int, default=None, help="Batch size")
    parser.add_argument("--grad-accum", type=int, default=None, help="Gradient accumulation steps")
    parser.add_argument("--lr", type=float, default=None, help="Learning rate")
    parser.add_argument("--warmup", type=int, default=None, help="Warmup steps")
    parser.add_argument("--resume", default=None, help="Resume from checkpoint path")
    return parser


def main():
    """Train one specialist LoRA adapter and save it to disk.

    Configuration is resolved per key with the precedence
    CLI argument > ADAPTER_OVERRIDES[name] > DEFAULTS. The raw dataset is
    loaded *before* the base model so that a wrong ``--data`` path or an empty
    file fails immediately instead of after the base model has been loaded.
    The model is then wrapped with LoRA, each example is rendered through the
    tokenizer's chat template, and SFT runs (optionally resuming from
    ``--resume``). The adapter and tokenizer are saved to the output
    directory.
    """
    parser = _build_parser()
    args = parser.parse_args()

    # Resolve config: CLI > adapter overrides > defaults
    overrides = ADAPTER_OVERRIDES.get(args.name, {})

    def resolve(key, cli_val):
        """Pick the CLI value if given, else the adapter override, else the default."""
        if cli_val is not None:
            return cli_val
        if key in overrides:
            return overrides[key]
        return DEFAULTS[key]

    model_name = resolve("model", args.model)
    max_seq = resolve("max_seq", args.max_seq)
    rank = resolve("rank", args.rank)
    alpha = resolve("alpha", args.alpha)
    epochs = resolve("epochs", args.epochs)
    batch = resolve("batch", args.batch)
    grad_accum = resolve("grad_accum", args.grad_accum)
    lr = resolve("lr", args.lr)
    warmup = resolve("warmup", args.warmup)
    # DEFAULTS has no "data"/"out" entry; both fall back to a name-derived path.
    data_path = args.data or overrides.get("data", f"data/{args.name}.jsonl")
    out_dir = args.out or f"adapters/{args.name}"

    print(f"══ Specialist LoRA Training: {args.name} ══")
    print(f"Base model: {model_name}")
    print(f"Data: {data_path}")
    print(f"Output: {out_dir}")
    print(f"Max seq: {max_seq}")
    print(f"LoRA: r={rank}, α={alpha}")
    print(f"Training: {epochs} epochs, batch {batch}, grad_accum {grad_accum}")
    print(f"LR: {lr}")
    print(f"Warmup: {warmup} steps")
    print()

    # ── Dataset (raw) ────────────────────────────────────────────────
    # Loaded first on purpose: it is cheap, and a bad path should abort the
    # run before the base model is loaded.
    print(f"Loading dataset: {data_path}")
    ds = load_dataset("json", data_files=data_path, split="train")
    if len(ds) == 0:
        parser.error(f"dataset {data_path} contains no examples")

    # ── Load model ───────────────────────────────────────────────────
    print("Loading model (bf16, no quantization)...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq,
        load_in_4bit=False,
        load_in_16bit=True,
        full_finetuning=False,
        dtype=torch.bfloat16,
    )

    # ── LoRA adapter ─────────────────────────────────────────────────
    print("Applying LoRA...")
    model = FastLanguageModel.get_peft_model(
        model,
        r=rank,
        lora_alpha=alpha,
        lora_dropout=0,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=42,
        max_seq_length=max_seq,
    )

    # ── Render chat template ─────────────────────────────────────────
    def to_chatml(ex):
        """Render one example's messages to training text via the chat template."""
        messages = fix_tool_calls(ex["messages"])
        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )
        return {"text": text}

    ds = ds.map(to_chatml)

    # Rough estimate only (floor division over the whole run).
    steps = (len(ds) * epochs) // (batch * grad_accum)
    print(f"Dataset: {len(ds)} examples")
    print(f"Epochs: {epochs}, effective batch: {batch * grad_accum}")
    print(f"Est. steps: {steps}")

    # ── Train ────────────────────────────────────────────────────────
    print("\nStarting training...")
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=ds,
        args=SFTConfig(
            output_dir=out_dir,
            per_device_train_batch_size=batch,
            gradient_accumulation_steps=grad_accum,
            num_train_epochs=epochs,
            learning_rate=lr,
            bf16=True,
            logging_steps=5,
            # No CLI flag for these two: adapter override or default only.
            save_steps=resolve("save_steps", None),
            save_total_limit=resolve("save_total_limit", None),
            warmup_steps=warmup,
            optim="adamw_8bit",
            seed=42,
            report_to="none",
            max_seq_length=max_seq,
            dataset_num_proc=1,
        ),
    )

    if args.resume:
        print(f"Resuming from checkpoint: {args.resume}")
        trainer.train(resume_from_checkpoint=args.resume)
    else:
        trainer.train()

    # ── Save ─────────────────────────────────────────────────────────
    model.save_pretrained(out_dir)
    tokenizer.save_pretrained(out_dir)
    print(f"\n✓ Saved {args.name} adapter to {out_dir}/")
    print(f" Transfer to sin: ~/models/loras/{args.name}/")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user