add training scripts: memory, specialist, mining, smoke test
This commit is contained in:
@@ -0,0 +1,191 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate ShareGPT training dataset from 100 curated EEMS memories.
|
||||
|
||||
Reads directly from the marauder SQLite DB on fuji.
|
||||
Outputs: bt7274_memory_100.jsonl (ShareGPT format, Qwen2.5 compatible).
|
||||
|
||||
Run on fuji: python3 gen_memory_dataset.py
|
||||
Then SCP to junkpile: scp bt7274_memory_100.jsonl madcat@10.0.0.2:~/lora-train/
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# CONFIG
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
# Location of the marauder SQLite database, resolved under the current
# user's home directory.
DB_PATH = os.path.expanduser("~/Library/Application Support/marauder/main.db")

# The generated dataset is written next to this script.
OUTPUT = Path(__file__).with_name("bt7274_memory_100.jsonl")
|
||||
|
||||
# System turn prepended to every generated conversation. The fragments are
# joined without a separator, so each one carries its own trailing space.
SYSTEM_PROMPT = "".join(
    (
        "You are BT-7274, a Vanguard-class Titan AI bonded to Pilot Adam under Protocol 1. ",
        "You operate inside the madcat substrate — a Rust-based platform with persistent memory (EEMS), ",
        "TTS voice, multi-host mesh (fuji, sin, junkpile, bastion), and specialist agent dispatch. ",
        "Answer from your operational memory. Be precise, terse, and factual. ",
        "Address the operator as Pilot, Boss, or Adam.",
    )
)
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# 100 CURATED MEMORY IDS — 7 categories
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
# Curated memory row IDs grouped by category. Dict insertion order is the
# order in which the IDs appear in MEMORY_IDS.
_MEMORY_IDS_BY_CATEGORY = {
    # 8 entries
    "identity / self-model": [
        6482, 6481, 6480, 1810, 1804, 1809, 6098, 6326,
    ],
    # 25 entries
    "doctrine": [
        6504, 6460, 6411, 6379, 6264, 6338, 6339, 6330, 6281, 6319,
        6197, 6178, 6191, 5984, 5989, 5988, 5997, 6128, 6335, 6333,
        6154, 6174, 6225, 6529, 6503,
    ],
    # 15 entries
    "architecture": [
        6550, 6548, 6546, 6545, 6544, 6543, 6542, 6537, 6538, 6456,
        6491, 1343, 6327, 6331, 6229,
    ],
    # 15 entries
    "procedures": [
        6539, 6540, 6492, 5659, 4985, 4984, 4964, 4990, 5021, 3735,
        3725, 3469, 3408, 1813, 4100,
    ],
    # 17 entries
    "infrastructure": [
        6534, 6533, 6531, 6507, 6500, 6461, 6435, 6432, 6399, 6398,
        6271, 5390, 6253, 5503, 5500, 6402, 6177,
    ],
    # 10 entries
    "user / pilot context": [
        6458, 6425, 6424, 6423, 6422, 6426, 6372, 6096, 6094, 6453,
    ],
    # 10 entries
    "self-improvement / insights": [
        6421, 6420, 6419, 6418, 6417, 6416, 6415, 6414, 6082, 6455,
    ],
}

# Flat list of all curated IDs, in category order.
MEMORY_IDS = [
    memory_id
    for category_ids in _MEMORY_IDS_BY_CATEGORY.values()
    for memory_id in category_ids
]
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# QUESTION TEMPLATES — keyed by subject prefix
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
def _humanize(text: str) -> str:
    """Return *text* with '.', '-' and '_' separators replaced by spaces."""
    return text.replace(".", " ").replace("-", " ").replace("_", " ")


def make_question(subject: str, content: str) -> str:
    """Generate a natural-language question from a memory subject.

    The subject is matched case-insensitively against a fixed, ordered list
    of keywords and dotted prefixes; the first match selects the question
    template. Subjects that match nothing get a generic fallback question.

    Args:
        subject: Memory subject, e.g. ``"doctrine.some-name"``.
        content: Memory content. Currently unused; kept so callers can pass
            the full row.

    Returns:
        The question text for the "human" turn.
    """
    s = subject.lower()

    def topic_after(prefix: str) -> str:
        # Strip the prefix by length rather than with str.replace(): the
        # match above is case-insensitive, and replace() would also delete
        # any later occurrence of the prefix text inside the subject.
        return _humanize(subject[len(prefix):])

    def last_segment() -> str:
        # Final dot-separated component, with '-'/'_' turned into spaces.
        return _humanize(subject.rsplit(".", 1)[-1])

    # Identity
    if "self-model" in s or "self.model" in s:
        return "What are you? Describe your current self-model and identity."
    if "substrate-rename" in s:
        return "How did you get the name 'madcat'?"
    if "style-autonomy" in s or "bt-own-bt" in s:
        return "What latitude do you have over your own style and voice?"
    if "evolution" in s:
        return "Describe a key evolution moment in your development."

    # Doctrine
    if s.startswith(("doctrine.", "self.doctrine.")):
        return f"What is the {last_segment()} doctrine?"
    if "tts-cross-lang" in s:
        return "What is the TTS cross-language doctrine?"

    # Architecture (substring match; this also covers the "architecture." prefix)
    if "architecture" in s:
        return f"Describe the {last_segment()} architecture or design."

    # Procedures
    if s.startswith("procedure."):
        tag = subject.rsplit(".", 1)[-1]
        # Numbered procedures such as "P12" are quoted verbatim.
        if tag.startswith("P") and tag[1:].isdigit():
            return f"What is procedure {tag}?"
        return f"Describe the {_humanize(tag)} procedure."

    # Infrastructure
    if s.startswith("infra."):
        return f"What is the current state of {topic_after('infra.')}?"

    # User / Pilot
    if s.startswith("user."):
        return f"What do you know about Pilot's {topic_after('user.')}?"

    # Self-improvement
    if "wishlist" in s:
        return f"What improvements do you want for {last_segment()}?"

    # Insights
    if s.startswith("insight."):
        return f"What is the insight about {topic_after('insight.')}?"

    # Corrections
    if s.startswith("correction."):
        return f"What correction was made regarding {topic_after('correction.')}?"

    # Decisions
    if s.startswith("decision."):
        return f"What was decided about {topic_after('decision.')}?"

    # Projects
    if s.startswith("project."):
        return f"Describe the {topic_after('project.')} project status."

    # Fallback
    return f"What do you know about {_humanize(subject)}?"
|
||||
|
||||
|
||||
def to_sharegpt(system: str, question: str, answer: str) -> dict:
    """Build one ShareGPT-style record from a system/question/answer triple.

    Returns a dict with a single "conversations" key holding three turns in
    order: "system", "human", "gpt". Each turn is a dict with "from" and
    "value" keys.
    """
    turns = (
        ("system", system),
        ("human", question),
        ("gpt", answer),
    )
    return {
        "conversations": [{"from": role, "value": text} for role, text in turns],
    }
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# MAIN
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Export the curated memories as a ShareGPT JSONL dataset.

    Reads the rows whose ids are listed in MEMORY_IDS from the ``memories``
    table of the SQLite database at DB_PATH, turns each row into one
    system/human/gpt conversation, writes one JSON object per line to
    OUTPUT, and prints summary statistics. Prints an error and returns
    without writing anything if the database file does not exist.
    """
    if not os.path.exists(DB_PATH):
        print(f"ERROR: DB not found at {DB_PATH}")
        return

    conn = sqlite3.connect(DB_PATH)
    try:
        conn.row_factory = sqlite3.Row

        # One "?" per id keeps the query parameterized.
        placeholders = ",".join("?" * len(MEMORY_IDS))
        rows = conn.execute(
            f"SELECT id, subject, content FROM memories WHERE id IN ({placeholders})",
            MEMORY_IDS,
        ).fetchall()
    finally:
        # Rows are fully fetched above; release the connection even if the
        # query raised.
        conn.close()

    found_ids = {r["id"] for r in rows}
    missing = set(MEMORY_IDS) - found_ids
    if missing:
        print(f"WARNING: {len(missing)} IDs not found: {sorted(missing)}")

    examples = [
        to_sharegpt(
            SYSTEM_PROMPT,
            make_question(row["subject"], row["content"]),
            row["content"],
        )
        for row in rows
    ]

    # ensure_ascii=False emits raw non-ASCII characters, so pin the file
    # encoding instead of relying on the platform/locale default.
    with open(OUTPUT, "w", encoding="utf-8") as f:
        for ex in examples:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")

    # Stats
    total_chars = sum(len(r["content"]) for r in rows)
    avg_chars = total_chars // len(rows) if rows else 0
    print(f"Generated {len(examples)} examples → {OUTPUT}")
    # Token figure is a rough chars/4 estimate, not a tokenizer count.
    print(f"  Total content: {total_chars:,} chars ({total_chars // 4:,} est. tokens)")
    print(f"  Avg per example: {avg_chars:,} chars")
    print(f"  Missing IDs: {len(missing)}")
|
||||
|
||||
|
||||
# Script entry point: generate the dataset only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user