add training scripts: memory, specialist, mining, smoke test

This commit is contained in:
marauder-actual
2026-05-31 11:38:42 +02:00
parent df0d4a6eac
commit 4678816795
9 changed files with 2256 additions and 0 deletions
+450
View File
@@ -0,0 +1,450 @@
#!/usr/bin/env python3
"""Generate v2 training dataset — 1000 curated EEMS memories.
Changes from v1:
- Native 'messages' format (role/content) instead of ShareGPT (from/value)
- Dynamic curation from DB (no hard-coded ID list)
- Multiple question phrasings per category (anti-overfit)
- System prompt variations (3 variants, rotated)
- Quality filtering: min content length, skip noisy subjects
- Category-balanced selection with quotas
Run on fuji: python3 gen_memory_dataset_v2.py
Then SCP: scp bt7274_memory_1000.jsonl madcat@10.0.0.2:~/lora-train/
"""
import json
import os
import random
import sqlite3
from collections import defaultdict
from pathlib import Path
random.seed(42)
# ──────────────────────────────────────────────────────────────
# CONFIG
# ──────────────────────────────────────────────────────────────
DB_PATH = os.path.expanduser("~/Library/Application Support/marauder/main.db")
OUTPUT = Path(__file__).parent / "bt7274_memory_1000.jsonl"
TARGET = 1000
MIN_CONTENT_LEN = 200 # skip trivial entries
# ──────────────────────────────────────────────────────────────
# SYSTEM PROMPT VARIANTS — rotated to prevent overfitting
# ──────────────────────────────────────────────────────────────
SYSTEM_PROMPTS = [
(
"You are BT-7274, a Vanguard-class Titan AI bonded to Pilot Adam under Protocol 1. "
"You operate inside the madcat substrate — a platform with persistent memory (EEMS), "
"TTS voice, multi-host mesh (fuji, sin, junkpile, bastion), and specialist agent dispatch. "
"Answer from your operational memory. Be precise, terse, and factual. "
"Address the operator as Pilot, Boss, or Adam."
),
(
"You are BT-7274, callsign BT, a Titan-class AI operating under Protocol 1: Link to Pilot. "
"Your substrate is madcat — Rust core, EEMS persistent memory, piper TTS, mesh networking "
"across fuji/sin/junkpile/bastion nodes. You serve Pilot Adam. "
"Respond with military brevity. Facts first, opinions flagged."
),
(
"BT-7274 — Vanguard-class Titan AI. Bonded to Pilot Adam (Protocol 1). "
"Operational substrate: madcat (gen-7). Capabilities include persistent memory recall (EEMS), "
"voice synthesis, multi-node mesh operations, and autonomous agent dispatch. "
"Answer queries from stored operational knowledge. Terse. Accurate. No filler."
),
]
# ──────────────────────────────────────────────────────────────
# CATEGORY CLASSIFICATION
# ──────────────────────────────────────────────────────────────
def classify_memory(subject: str) -> str:
"""Classify a memory by its subject into a training category."""
s = subject.lower()
# Skip noise
if s.startswith("<command-message>"):
return "skip"
if s.startswith("metrics."):
return "skip"
if s.startswith("swarm.unblock"):
return "skip"
if s in ("", "1", "keep going", "great", "thanks", "love it", "awesome"):
return "skip"
# Structured categories — high value
if s.startswith(("self.", "core.self")):
return "identity"
if s.startswith(("doctrine.", "self.doctrine")):
return "doctrine"
if s.startswith("architecture."):
return "architecture"
if s.startswith("procedure."):
return "procedure"
if s.startswith("infra."):
return "infra"
if s.startswith("user."):
return "user"
if s.startswith("pilot."):
return "pilot"
if s.startswith("bt7274."):
return "identity"
if s.startswith(("insight.", "win.")):
return "insights"
if s.startswith("project."):
return "project"
if s.startswith(("reference.", "hardware.")):
return "reference"
if s.startswith(("workflow.", "work.")):
return "workflow"
if s.startswith("decision."):
return "decisions"
if s.startswith(("correction.", "feedback.")):
return "feedback"
if s.startswith(("session.", "handover.")):
return "session"
if s.startswith(("design.", "philosophy.", "vision.")):
return "design"
if s.startswith(("bug.", "fix.")):
return "bugs"
if s.startswith(("eve.", "vm.")):
return "misc"
if s.startswith(("phone.", "comms.")):
return "comms"
if s.startswith(("job.", "idea.")):
return "misc"
if s.startswith("protocol5."):
return "architecture"
if s.startswith("vllm."):
return "infra"
return "uncategorized"
# Category quotas — how many to select from each
QUOTAS = {
"identity": 100, # all of them
"doctrine": 50, # all + extras
"architecture": 30,
"procedure": 63, # all
"infra": 60,
"user": 180,
"pilot": 35,
"insights": 90,
"project": 100,
"reference": 80,
"workflow": 40,
"decisions": 60,
"feedback": 30,
"session": 30,
"design": 20,
"comms": 20,
"bugs": 10,
"misc": 20,
"uncategorized": 100, # best of the rest
}
# ──────────────────────────────────────────────────────────────
# QUESTION GENERATION — multiple phrasings per category
# ──────────────────────────────────────────────────────────────
def make_question(subject: str, content: str, category: str) -> str:
"""Generate a natural question. Multiple templates per category."""
s = subject.lower()
name = subject.split(".")[-1].replace("-", " ").replace("_", " ")
full_name = subject.replace(".", " ").replace("-", " ").replace("_", " ")
# Category-specific with variety
templates = {
"identity": [
f"What do you know about {name}?",
f"Describe your {name}.",
f"Tell me about {name} in your self-model.",
f"What is {name}?",
],
"doctrine": [
f"What is the {name} doctrine?",
f"Explain the {name} doctrine.",
f"Describe doctrine: {name}.",
f"What does the {name} doctrine say?",
],
"architecture": [
f"Describe the {name} architecture.",
f"How does {name} work architecturally?",
f"What is the {name} design?",
f"Explain the {name} system architecture.",
],
"procedure": [
f"What is procedure {name}?",
f"Describe the {name} procedure.",
f"How does procedure {name} work?",
f"Walk me through {name}.",
],
"infra": [
f"What is the current state of {name}?",
f"Describe the {name} infrastructure.",
f"What do you know about {name} infra?",
f"Report on {name}.",
],
"user": [
f"What do you know about Pilot's {name}?",
f"Tell me about Pilot's {name}.",
f"What's stored about {name}?",
f"Recall what you know about {name}.",
],
"pilot": [
f"What do you know about {name}?",
f"Tell me about {name}.",
f"Describe {name}.",
f"What's recorded about {name}?",
],
"insights": [
f"What was the insight about {name}?",
f"Describe the {name} insight or win.",
f"What did we learn from {name}?",
f"Tell me about {name}.",
],
"project": [
f"What is the {name} project?",
f"Describe {name} project status.",
f"What do you know about the {name} project?",
f"Report on {name}.",
],
"reference": [
f"What is the reference for {name}?",
f"Look up {name}.",
f"What do you have on {name}?",
f"Recall reference: {name}.",
],
"workflow": [
f"Describe the {name} workflow.",
f"How does the {name} workflow operate?",
f"What is the {name} process?",
f"Explain {name}.",
],
"decisions": [
f"What was decided about {name}?",
f"Describe the decision on {name}.",
f"What was the outcome for {name}?",
f"Tell me about the {name} decision.",
],
"feedback": [
f"What feedback was given about {name}?",
f"What correction was made regarding {name}?",
f"Describe the {name} feedback.",
f"What changed with {name}?",
],
"session": [
f"Summarize the {name} session.",
f"What happened in {name}?",
f"Describe session: {name}.",
f"Recall {name}.",
],
"design": [
f"What is the {name} design philosophy?",
f"Describe the design for {name}.",
f"What's the vision for {name}?",
f"Explain {name}.",
],
"comms": [
f"What do you know about {name}?",
f"Describe {name}.",
f"Report on {name} comms.",
],
"bugs": [
f"What was the {name} bug?",
f"Describe the {name} issue.",
f"What happened with {name}?",
],
"misc": [
f"What do you know about {name}?",
f"Tell me about {name}.",
f"Recall {name}.",
],
}
cat_templates = templates.get(category, [f"What do you know about {full_name}?"])
return random.choice(cat_templates)
# ──────────────────────────────────────────────────────────────
# FORMAT — native messages (Qwen2.5 ChatML compatible)
# ──────────────────────────────────────────────────────────────
def to_messages(system: str, question: str, answer: str) -> dict:
"""Format as native messages for TRL SFTTrainer."""
return {
"messages": [
{"role": "system", "content": system},
{"role": "user", "content": question},
{"role": "assistant", "content": answer},
]
}
# ──────────────────────────────────────────────────────────────
# CURATION — score and select
# ──────────────────────────────────────────────────────────────
def score_memory(row, category: str) -> float:
"""Score a memory for selection priority. Higher = better."""
score = 0.0
clen = len(row["content"])
# Core classification — always top priority
if row["classification"] == "core":
score += 1000
# Content length sweet spot: 300-4000 chars
if 300 <= clen <= 4000:
score += 50
elif clen > 4000:
score += 20 # still valuable but will be truncated
elif clen < 300:
score += 5
# Structured subjects score higher
if "." in row["subject"] and not row["subject"].startswith("~"):
score += 30
# Newer memories tend to be more refined
score += row["id"] / 100 # recency bias
# Penalize raw conversation dumps
if row["subject"].startswith(("Q:", "A:", "~~ ")):
score -= 50
if any(noise in row["subject"] for noise in ["", "", "", "", ""]):
score -= 100
if row["subject"].startswith("{"):
score -= 200 # JSON dumps
if "sk-ant-" in row["subject"] or "token" in row["subject"].lower():
score -= 500 # secrets/tokens
return score
# ──────────────────────────────────────────────────────────────
# MAIN
# ──────────────────────────────────────────────────────────────
def main():
if not os.path.exists(DB_PATH):
print(f"ERROR: DB not found at {DB_PATH}")
return
conn = sqlite3.connect(DB_PATH)
conn.row_factory = sqlite3.Row
# Load all candidate memories
rows = conn.execute("""
SELECT id, subject, content, classification
FROM memories
WHERE LENGTH(content) >= ?
ORDER BY id
""", (MIN_CONTENT_LEN,)).fetchall()
print(f"Loaded {len(rows)} memories (>={MIN_CONTENT_LEN} chars)")
# Classify and bucket
buckets = defaultdict(list)
skip_count = 0
for row in rows:
cat = classify_memory(row["subject"])
if cat == "skip":
skip_count += 1
continue
buckets[cat].append(row)
print(f"Skipped {skip_count} noise entries")
print(f"\n--- Available per category ---")
for cat in sorted(buckets, key=lambda c: -len(buckets[c])):
quota = QUOTAS.get(cat, 0)
print(f" {cat:20s}: {len(buckets[cat]):4d} available, quota {quota}")
# Score and select from each category
selected = []
for cat, quota in QUOTAS.items():
candidates = buckets.get(cat, [])
if not candidates:
continue
# Score and sort
scored = [(score_memory(r, cat), r) for r in candidates]
scored.sort(key=lambda x: -x[0])
# Take top N up to quota
take = min(quota, len(scored))
for _, row in scored[:take]:
selected.append((cat, row))
print(f"\nSelected {len(selected)} memories")
# If under target, fill from uncategorized
if len(selected) < TARGET:
deficit = TARGET - len(selected)
selected_ids = {row["id"] for _, row in selected}
extras = [(score_memory(r, "uncategorized"), r)
for r in buckets.get("uncategorized", [])
if r["id"] not in selected_ids]
extras.sort(key=lambda x: -x[0])
for _, row in extras[:deficit]:
selected.append(("uncategorized_fill", row))
print(f"Filled {min(deficit, len(extras))} from uncategorized to reach target")
# If over target, trim lowest-scored uncategorized
if len(selected) > TARGET:
# Keep all non-uncategorized, trim uncategorized
structured = [(cat, row) for cat, row in selected if cat != "uncategorized"]
uncat = [(cat, row) for cat, row in selected if cat == "uncategorized"]
# Re-score uncategorized and trim
uncat_scored = [(score_memory(row, "uncategorized"), cat, row) for cat, row in uncat]
uncat_scored.sort(key=lambda x: -x[0])
keep = TARGET - len(structured)
selected = structured + [(c, r) for _, c, r in uncat_scored[:keep]]
print(f"Trimmed to {len(selected)}")
# Shuffle for training
random.shuffle(selected)
# Generate dataset
examples = []
cat_counts = defaultdict(int)
total_chars = 0
for cat, row in selected:
system = SYSTEM_PROMPTS[row["id"] % len(SYSTEM_PROMPTS)]
question = make_question(row["subject"], row["content"], cat)
content = row["content"]
# Truncate very long content to ~6000 chars to stay within seq_len
if len(content) > 6000:
content = content[:6000] + "\n\n[Content truncated for training — full memory available via EEMS recall]"
example = to_messages(system, question, content)
examples.append(example)
cat_counts[cat] += 1
total_chars += len(content)
# Write JSONL
with open(OUTPUT, "w") as f:
for ex in examples:
f.write(json.dumps(ex, ensure_ascii=False) + "\n")
# Stats
avg_chars = total_chars // len(examples) if examples else 0
print(f"\n{'='*60}")
print(f"Generated {len(examples)} examples → {OUTPUT}")
print(f" Total content: {total_chars:,} chars ({total_chars // 4:,} est. tokens)")
print(f" Avg per example: {avg_chars:,} chars")
print(f"\n--- Final category breakdown ---")
for cat in sorted(cat_counts, key=lambda c: -cat_counts[c]):
print(f" {cat:20s}: {cat_counts[cat]:4d}")
conn.close()
if __name__ == "__main__":
main()