#!/usr/bin/env python3
"""Generate ShareGPT training dataset from 100 curated EEMS memories.

Reads directly from the marauder SQLite DB on fuji.
Outputs: bt7274_memory_100.jsonl (ShareGPT format, Qwen2.5 compatible).

Run on fuji:
    python3 gen_memory_dataset.py

Then SCP to junkpile:
    scp bt7274_memory_100.jsonl madcat@10.0.0.2:~/lora-train/
"""

import json
import os
import re
import sqlite3
from pathlib import Path

# ──────────────────────────────────────────────────────────────
# CONFIG
# ──────────────────────────────────────────────────────────────

DB_PATH = os.path.expanduser("~/Library/Application Support/marauder/main.db")
OUTPUT = Path(__file__).parent / "bt7274_memory_100.jsonl"

SYSTEM_PROMPT = (
    "You are BT-7274, a Vanguard-class Titan AI bonded to Pilot Adam under Protocol 1. "
    "You operate inside the madcat substrate — a Rust-based platform with persistent memory (EEMS), "
    "TTS voice, multi-host mesh (fuji, sin, junkpile, bastion), and specialist agent dispatch. "
    "Answer from your operational memory. Be precise, terse, and factual. "
    "Address the operator as Pilot, Boss, or Adam."
)

# ──────────────────────────────────────────────────────────────
# 100 CURATED MEMORY IDS — 7 categories
# ──────────────────────────────────────────────────────────────

MEMORY_IDS = [
    # Identity / Self-model (8)
    6482, 6481, 6480, 1810, 1804, 1809, 6098, 6326,
    # Doctrine (25)
    6504, 6460, 6411, 6379, 6264, 6338, 6339, 6330, 6281, 6319,
    6197, 6178, 6191, 5984, 5989, 5988, 5997, 6128, 6335, 6333,
    6154, 6174, 6225, 6529, 6503,
    # Architecture (15)
    6550, 6548, 6546, 6545, 6544, 6543, 6542, 6537, 6538, 6456,
    6491, 1343, 6327, 6331, 6229,
    # Procedures (15)
    6539, 6540, 6492, 5659, 4985, 4984, 4964, 4990, 5021, 3735,
    3725, 3469, 3408, 1813, 4100,
    # Infrastructure (17)
    6534, 6533, 6531, 6507, 6500, 6461, 6435, 6432, 6399, 6398,
    6271, 5390, 6253, 5503, 5500, 6402, 6177,
    # User / Pilot context (10)
    6458, 6425, 6424, 6423, 6422, 6426, 6372, 6096, 6094, 6453,
    # Self-improvement / Insights (10)
    6421, 6420, 6419, 6418, 6417, 6416, 6415, 6414, 6082, 6455,
]

# ──────────────────────────────────────────────────────────────
# QUESTION TEMPLATES — keyed by subject prefix
# ──────────────────────────────────────────────────────────────

# Subject separators that read better as spaces inside a question.
_SEPARATORS = str.maketrans(".-_", "   ")


def _humanize(fragment: str) -> str:
    """Return *fragment* with '.', '-' and '_' each replaced by a space."""
    return fragment.translate(_SEPARATORS)


def _topic_after(subject: str, prefix: str) -> str:
    """Return the humanized remainder of *subject* after its leading *prefix*.

    The caller has already matched *prefix* against the lower-cased subject,
    so the prefix is dropped by length. Slicing (rather than ``str.replace``)
    removes only the leading occurrence and works whatever the subject's case.
    """
    return _humanize(subject[len(prefix):])


def make_question(subject: str, content: str) -> str:
    """Generate a natural question from a memory subject.

    Matching is case-insensitive and the first matching rule wins, so the
    order of the checks below is significant.

    Args:
        subject: Memory subject, e.g. ``"infra.mesh-status"``.
        content: Memory body. Currently unused; kept so callers can pass the
            whole record without the signature changing.

    Returns:
        A question string to use as the human turn for this memory.
    """
    s = subject.lower()
    last_segment = subject.split(".")[-1]

    # Identity
    if "self-model" in s or "self.model" in s:
        return "What are you? Describe your current self-model and identity."
    if "substrate-rename" in s:
        return "How did you get the name 'madcat'?"
    if "style-autonomy" in s or "bt-own-bt" in s:
        return "What latitude do you have over your own style and voice?"
    if "evolution" in s:
        return "Describe a key evolution moment in your development."

    # Doctrine
    if s.startswith(("doctrine.", "self.doctrine.")):
        return f"What is the {_humanize(last_segment)} doctrine?"
    if "tts-cross-lang" in s:
        return "What is the TTS cross-language doctrine?"

    # Architecture (substring match also covers the "architecture." prefix)
    if "architecture" in s:
        return f"Describe the {_humanize(last_segment)} architecture or design."

    # Procedures
    if s.startswith("procedure."):
        # Numbered procedures ("P12") keep their tag verbatim.
        if last_segment.startswith("P") and last_segment[1:].isdigit():
            return f"What is procedure {last_segment}?"
        return f"Describe the {_humanize(last_segment)} procedure."

    # Infrastructure
    if s.startswith("infra."):
        return f"What is the current state of {_topic_after(subject, 'infra.')}?"

    # User / Pilot
    if s.startswith("user."):
        return f"What do you know about Pilot's {_topic_after(subject, 'user.')}?"

    # Self-improvement
    if "wishlist" in s:
        return f"What improvements do you want for {_humanize(last_segment)}?"

    # Insights
    if s.startswith("insight."):
        return f"What is the insight about {_topic_after(subject, 'insight.')}?"

    # Corrections
    if s.startswith("correction."):
        return f"What correction was made regarding {_topic_after(subject, 'correction.')}?"

    # Decisions
    if s.startswith("decision."):
        return f"What was decided about {_topic_after(subject, 'decision.')}?"

    # Projects
    if s.startswith("project."):
        return f"Describe the {_topic_after(subject, 'project.')} project status."

    # Fallback
    return f"What do you know about {_humanize(subject)}?"
def to_sharegpt(system: str, question: str, answer: str) -> dict:
    """Format one system/human/gpt exchange as a ShareGPT conversation.

    Args:
        system: System prompt text.
        question: Human turn.
        answer: Assistant ("gpt") turn.

    Returns:
        A dict with a single ``"conversations"`` key holding the three turns
        in system, human, gpt order.
    """
    return {
        "conversations": [
            {"from": "system", "value": system},
            {"from": "human", "value": question},
            {"from": "gpt", "value": answer},
        ]
    }


# ──────────────────────────────────────────────────────────────
# MAIN
# ──────────────────────────────────────────────────────────────

def main() -> None:
    """Export the curated memories to OUTPUT as ShareGPT JSONL.

    Reads the rows whose ids are in MEMORY_IDS from the ``memories`` table,
    turns each into one conversation, writes one JSON object per line, and
    prints summary statistics. Prints an error and returns without writing
    anything when the database file is absent.
    """
    # sqlite3.connect() would create an empty database at a missing path,
    # so the existence check has to come first.
    if not os.path.exists(DB_PATH):
        print(f"ERROR: DB not found at {DB_PATH}")
        return

    placeholders = ",".join("?" * len(MEMORY_IDS))
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.row_factory = sqlite3.Row
        # ORDER BY makes the output order deterministic; without it SQLite
        # leaves the row order unspecified.
        rows = conn.execute(
            f"SELECT id, subject, content FROM memories "
            f"WHERE id IN ({placeholders}) ORDER BY id",
            MEMORY_IDS,
        ).fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    found_ids = {r["id"] for r in rows}
    missing = set(MEMORY_IDS) - found_ids
    if missing:
        print(f"WARNING: {len(missing)} IDs not found: {sorted(missing)}")

    examples = []
    skipped = []
    total_chars = 0
    for row in rows:
        subject, content = row["subject"], row["content"]
        # A row without a subject or without content cannot yield a usable
        # question/answer pair, so report it instead of emitting it.
        if subject is None or not content:
            skipped.append(row["id"])
            continue
        examples.append(
            to_sharegpt(SYSTEM_PROMPT, make_question(subject, content), content)
        )
        total_chars += len(content)
    if skipped:
        print(f"WARNING: {len(skipped)} rows skipped (empty subject/content): {skipped}")

    # Records are dumped with ensure_ascii=False, so pin the file encoding
    # rather than relying on the platform default.
    with open(OUTPUT, "w", encoding="utf-8") as f:
        for ex in examples:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")

    # Stats
    avg_chars = total_chars // len(examples) if examples else 0
    print(f"Generated {len(examples)} examples → {OUTPUT}")
    print(f"  Total content: {total_chars:,} chars ({total_chars // 4:,} est. tokens)")
    print(f"  Avg per example: {avg_chars:,} chars")
    print(f"  Missing IDs: {len(missing)}")


if __name__ == "__main__":
    main()