feat: bt7274 LoRA v4 — Hermes format, think blocks, 802 examples

2026-05-26 04:03:38 +02:00
parent 122e73860b
commit 94515e7f6d
7 changed files with 4210 additions and 45 deletions
@@ -0,0 +1,357 @@
+#!/usr/bin/env python3
+"""Reformat v3 training data for v4.
+
+Injects <think> blocks into assistant messages.
+Keeps message structure and tool_calls unchanged.
+
+Input:  bt7274_v3.jsonl (582 examples)
+Output: bt7274_v3_reformatted.jsonl
+"""
+
+import json
+import random
+import re
+import sys
+
+random.seed(42)
+
+# ── Think block templates ────────────────────────────────────────────
+#
+# Each list holds interchangeable one-line "thoughts"; make_think() picks one
+# with random.choice() and wraps it in <think>...</think>. Placeholders in
+# braces are filled with str.format().
+
+# Generic tool call. {task} comes from extract_task(user message), {tool} is
+# the name of the first tool call on the assistant message.
+TOOL_THINK = [
+    "Pilot needs {task}. {tool} handles this.",
+    "{task}. Running {tool}.",
+    "Checking {task}. {tool} is the right call.",
+    "Need to {task}. Executing {tool}.",
+    "{task}. Best approach: {tool}.",
+    "{task} requested. {tool} will provide.",
+    "Assessing {task}. Dispatching {tool}.",
+    "{task}. {tool} — standard procedure.",
+    "Pilot wants {task}. {tool} first.",
+    "{task}. Routing to {tool}.",
+]
+
+# Generic tool call that also mentions {args} (from summarize_args). Used
+# instead of TOOL_THINK for a random 40% of calls that have an args summary.
+TOOL_THINK_WITH_ARGS = [
+    "{task}. {tool} with {args} will provide this.",
+    "Running {tool}({args}) for {task}.",
+    "{task}. {tool} — targeting {args}.",
+    "Need {task}. {tool}, args: {args}.",
+    "{task}. Dispatching {tool} on {args}.",
+]
+
+# Used when make_think() is called with is_continuation=True. No placeholders.
+CONTINUATION_THINK = [
+    "Continuing — need additional data.",
+    "Following up. More info required.",
+    "Next step in the chain.",
+    "Previous result in hand. Proceeding.",
+    "Chaining — need the next piece.",
+    "Got partial answer. Extending.",
+    "More data needed. Continuing ops.",
+    "Building on previous result.",
+]
+
+# Assistant messages without tool calls. {topic} comes from extract_topic().
+DIRECT_THINK = [
+    "Pilot asks about {topic}. Straightforward.",
+    "{topic}. Responding with facts.",
+    "Direct question on {topic}. No tools needed.",
+    "Answering {topic}. Keep it terse.",
+    "{topic} query. Have the answer.",
+    "{topic}. Brief response.",
+    "Pilot wants to know about {topic}. Facts first.",
+    "{topic}. Concise answer.",
+]
+
+# Tools that tool_category() classifies as "speak". No placeholders.
+SPEAK_THINK = [
+    "Summarizing results via TTS.",
+    "Vocalizing summary for Pilot.",
+    "Results ready. Speaking summary.",
+    "TTS — brief verbal report.",
+]
+
+# Tools that tool_category() classifies as "display". No placeholders.
+DISPLAY_THINK = [
+    "Updating HUD state.",
+    "Display state change.",
+    "Visual feedback for Pilot.",
+]
+
+# Tools that tool_category() classifies as "memory".
+# ORDER MATTERS: make_think() slices this list by position. The first four
+# entries ([:4]) are used when the tool name contains "store"; the remaining
+# entries ([4:]) are used for every other memory tool. Keep store-style
+# thoughts in the first four slots when editing.
+MEMORY_THINK = [
+    "Storing this for later recall.",
+    "Committing to persistent memory.",
+    "Worth remembering. Storing.",
+    "Pilot context — archiving.",
+    "Loading relevant context from memory.",
+    "Checking stored knowledge.",
+    "Recalling prior context.",
+]
+
+# Used for a tool whose name contains "memory", called within the first three
+# assistant messages, when is_boot_message() accepts the user message.
+BOOT_THINK = [
+    "Boot sequence. Loading identity and context.",
+    "Initializing. Need operator context.",
+    "Session start. Checking stored state.",
+    "Coming online. Loading context.",
+]
+
+
+def extract_task(user_msg: str) -> str:
+    """Return a short, template-ready task phrase taken from the user message.
+
+    The message is trimmed, cut to at most 77 characters when it is longer
+    than 80, and stripped of trailing ``.``/``!``/``?`` because the think
+    templates supply their own punctuation. The first letter is lowercased
+    for mid-sentence use unless the text opens with an all-caps run (an
+    acronym, or the pronoun "I").
+
+    Args:
+        user_msg: Raw user message content. ``None`` or a non-string value
+            is treated as an empty message instead of raising.
+
+    Returns:
+        The cleaned phrase, or ``"this request"`` when nothing usable remains.
+    """
+    text = user_msg.strip() if isinstance(user_msg, str) else ""
+    if len(text) > 80:
+        # No ellipsis is appended: trailing dots are removed just below, so
+        # one would never survive. Drop the whitespace the cut may expose so
+        # the template does not render "foo . Running tool."
+        text = text[:77].rstrip()
+    # Remove trailing punctuation for template fill.
+    text = text.rstrip(".!?").rstrip()
+    # Lowercase the first character, but leave acronyms and "I" alone.
+    if text and text[0].isupper() and not text[:2].isupper():
+        text = text[0].lower() + text[1:]
+    return text or "this request"
+
+
+def extract_topic(user_msg: str) -> str:
+    """Return the topic part of a user message for the direct-answer templates.
+
+    Processing steps:
+      1. Strip at most one leading form of address ("hey bt ", "hey ",
+         "bt "). The longest form is tried first so "hey bt " is not
+         shadowed by "hey ".
+      2. Repeatedly strip request lead-ins ("can you ", "please ",
+         "what's ", ...) so stacked phrases such as "can you please tell me
+         about X" reduce to "X".
+      3. Cut the text to at most 57 characters when it is longer than 60.
+      4. Drop trailing ``.``/``!``/``?``.
+
+    Matching is case-insensitive; the remaining text keeps its original case.
+
+    Args:
+        user_msg: Raw user message content. ``None`` or a non-string value
+            is treated as an empty message instead of raising.
+
+    Returns:
+        The topic text, or ``"this"`` when nothing usable remains.
+    """
+    address_forms = ("hey bt ", "hey ", "bt ")
+    lead_ins = ("tell me about ", "could you ", "how do i ", "can you ",
+                "what is ", "explain ", "what's ", "how to ", "please ")
+
+    def strip_one(value: str, candidates: tuple) -> tuple:
+        """Remove the first matching prefix; report whether one matched."""
+        for prefix in candidates:
+            if value[:len(prefix)].lower() == prefix:
+                return value[len(prefix):].lstrip(), True
+        return value, False
+
+    text = user_msg.strip() if isinstance(user_msg, str) else ""
+
+    # The address is only meaningful at the very start, so strip it once;
+    # "what is BT doing" must keep its "BT".
+    text, _ = strip_one(text, address_forms)
+
+    # Each successful strip shortens the text, so this loop terminates.
+    matched = True
+    while matched:
+        text, matched = strip_one(text, lead_ins)
+
+    if len(text) > 60:
+        # No ellipsis: the punctuation strip below would remove it anyway.
+        text = text[:57].rstrip()
+    return text.rstrip(".!?").rstrip() or "this"
+
+
+def summarize_args(args_str: str) -> str:
+    """Return a short, human-readable summary of a tool call's arguments.
+
+    Args:
+        args_str: The tool arguments, either as a JSON string or as an
+            already-decoded object.
+
+    Returns:
+        The first usable argument value as text, truncated to 50 characters
+        (47 plus ``"..."``). Well-known keys such as ``command``, ``query``
+        and ``path`` are preferred over the rest, in the order listed below.
+        Values that are ``None`` or blank are skipped so the summary never
+        reads "None" or an empty string. Returns ``"..."`` when the input is
+        not valid JSON, is not an object, or holds no usable value.
+    """
+    try:
+        args = json.loads(args_str) if isinstance(args_str, str) else args_str
+    except (json.JSONDecodeError, TypeError):
+        return "..."
+
+    if not isinstance(args, dict):
+        return "..."
+
+    # Most informative keys first, then every value in insertion order as a
+    # fallback. Preferred values appear twice, which is harmless: the first
+    # usable candidate wins.
+    preferred_keys = ("command", "query", "text", "pattern", "filePath",
+                      "path", "prompt", "subject", "content", "name",
+                      "url", "node", "state")
+    candidates = [args[key] for key in preferred_keys if key in args]
+    candidates.extend(args.values())
+
+    for value in candidates:
+        if value is None:
+            continue
+        text = str(value)
+        if not text.strip():
+            continue
+        return text if len(text) <= 50 else text[:47] + "..."
+    return "..."
+
+
+def tool_category(name: str) -> str:
+    """Map a tool name to the template family used for its think block.
+
+    Returns one of ``"speak"``, ``"display"``, ``"memory"`` or ``"tool"``
+    (the catch-all). Checks run in that order, so the first match wins.
+    """
+    speak_tools = ("core_speak", "core_stop", "core_test")
+    display_prefixes = ("core_display", "core_visor")
+    memory_markers = ("recall", "search")
+
+    if name in speak_tools:
+        return "speak"
+    if name.startswith(display_prefixes):
+        return "display"
+    is_memory = name.startswith("core_memory") or any(
+        marker in name for marker in memory_markers)
+    return "memory" if is_memory else "tool"
+
+
+def is_boot_message(user_msg: str) -> bool:
+    """Return True when a short user message looks like a session start.
+
+    A message qualifies when it is shorter than 60 characters (after
+    trimming) and either contains one of the boot keywords as a substring,
+    or contains the greeting "hey" or "hi" as a whole word.
+
+    The greetings are matched on word boundaries so that a bare "hi" or
+    "Hi!" is recognised, and so that words which merely contain the letters
+    (such as "they") are not mistaken for greetings.
+
+    Args:
+        user_msg: Raw user message content. ``None`` or a non-string value
+            is treated as an empty message and yields False.
+    """
+    lower = user_msg.lower().strip() if isinstance(user_msg, str) else ""
+    if len(lower) >= 60:
+        return False
+
+    keywords = ("boot", "online", "hello", "status",
+                "good morning", "load up", "report")
+    if any(kw in lower for kw in keywords):
+        return True
+    return re.search(r"\b(?:hey|hi)\b", lower) is not None
+
+
+def make_think(tool_name: str | None, args: str | dict | None,
+               user_msg: str, is_continuation: bool,
+               msg_index: int) -> str:
+    """Build the ``<think>`` prefix for one assistant message.
+
+    Template selection, first match wins:
+      1. ``is_continuation`` -> CONTINUATION_THINK.
+      2. ``msg_index <= 3``, a tool whose name contains "memory", and a user
+         message accepted by is_boot_message() -> BOOT_THINK.
+      3. Tool category "speak" / "display" -> SPEAK_THINK / DISPLAY_THINK.
+      4. Tool category "memory" -> the first four MEMORY_THINK entries when
+         the tool name contains "store", the remaining entries otherwise.
+      5. Any other tool -> TOOL_THINK, or TOOL_THINK_WITH_ARGS for a random
+         40% of calls that have a non-empty argument summary.
+      6. No tool -> DIRECT_THINK filled with the extracted topic.
+
+    Returns:
+        The chosen thought wrapped in ``<think>`` tags, each tag on its own
+        line, followed by one blank line.
+    """
+
+    def wrap(thought: str) -> str:
+        return f"<think>\n{thought}\n</think>\n\n"
+
+    if is_continuation:
+        return wrap(random.choice(CONTINUATION_THINK))
+
+    # Boot / identity recall at start of conversation
+    early_memory_call = bool(
+        msg_index <= 3 and tool_name and "memory" in tool_name)
+    if early_memory_call and is_boot_message(user_msg):
+        return wrap(random.choice(BOOT_THINK))
+
+    category = tool_category(tool_name or "")
+
+    fixed_pools = {"speak": SPEAK_THINK, "display": DISPLAY_THINK}
+    if category in fixed_pools:
+        return wrap(random.choice(fixed_pools[category]))
+
+    if category == "memory":
+        storing = bool(tool_name) and "store" in tool_name
+        pool = MEMORY_THINK[:4] if storing else MEMORY_THINK[4:]
+        return wrap(random.choice(pool))
+
+    if not tool_name:
+        subject = extract_topic(user_msg)
+        return wrap(random.choice(DIRECT_THINK).format(topic=subject))
+
+    task_text = extract_task(user_msg)
+    arg_text = summarize_args(args) if args else None
+    # random.random() is only drawn when an argument summary exists; keep the
+    # short-circuit so the seeded random sequence stays reproducible.
+    if arg_text and random.random() < 0.4:
+        template = random.choice(TOOL_THINK_WITH_ARGS)
+        return wrap(template.format(
+            task=task_text, tool=tool_name, args=arg_text))
+
+    template = random.choice(TOOL_THINK)
+    return wrap(template.format(task=task_text, tool=tool_name))
+
+
+def process_example(ex: dict) -> dict:
+    """Inject <think> blocks into the assistant messages of one example.
+
+    Messages are shallow-copied, so the input example is not mutated.
+    Non-assistant messages pass through unchanged. For assistant messages:
+
+      * content that already contains ``<think>`` is left alone;
+      * a message with tool calls gets a think block built from its first
+        tool call, or a continuation think block when the previous assistant
+        message since the last user turn also made tool calls;
+      * a message with text only gets a direct-answer think block;
+      * an empty message is left alone.
+
+    Args:
+        ex: Example dict with a ``"messages"`` list of chat messages.
+
+    Returns:
+        A new dict with the same top-level keys as ``ex`` (for instance a
+        ``tools`` schema list, if present) and ``"messages"`` replaced by the
+        processed list.
+
+    Raises:
+        KeyError: if ``ex`` has no ``"messages"`` or a message has no
+            ``"role"``.
+    """
+    result = []
+    last_user_msg = ""
+    prev_was_tool_call = False
+    assistant_index = 0
+
+    for original in ex["messages"]:
+        msg = dict(original)  # shallow copy
+        role = msg["role"]
+
+        if role == "user":
+            # Content may be None or a non-string payload; the text helpers
+            # only understand strings.
+            user_content = msg.get("content")
+            last_user_msg = user_content if isinstance(user_content, str) else ""
+            prev_was_tool_call = False
+            result.append(msg)
+            continue
+
+        if role != "assistant":
+            # system / tool / unknown roles pass through untouched.
+            # A tool result deliberately does NOT clear prev_was_tool_call:
+            # in a call -> result -> call chain the result always sits
+            # between the two assistant messages, so clearing the flag here
+            # would make the continuation branch unreachable.
+            result.append(msg)
+            continue
+
+        assistant_index += 1
+        has_tool_calls = bool(msg.get("tool_calls"))
+        content = msg.get("content") or ""
+
+        # Skip if already has <think> block
+        if "<think>" in content:
+            prev_was_tool_call = has_tool_calls
+            result.append(msg)
+            continue
+
+        if has_tool_calls:
+            # Continuation: an earlier assistant message in this turn (no
+            # user message in between) already made tool calls.
+            is_continuation = prev_was_tool_call
+
+            tool_name = None
+            tool_args = None
+            tcs = msg["tool_calls"]
+            if isinstance(tcs, list) and tcs:
+                tc = tcs[0]  # only the first call drives the think text
+                if isinstance(tc, dict) and "function" in tc:
+                    fn = tc["function"]
+                    tool_name = fn.get("name", "unknown")
+                    tool_args = fn.get("arguments")
+
+            think = make_think(tool_name, tool_args, last_user_msg,
+                               is_continuation, assistant_index)
+
+            # Tool-call messages typically have null content; in that case
+            # the think block becomes the whole content, without the
+            # trailing blank line.
+            if content and content.strip():
+                msg["content"] = think + content
+            else:
+                msg["content"] = think.rstrip("\n")
+
+            prev_was_tool_call = True
+
+        elif content:
+            # Direct response — prepend thinking
+            topic = extract_topic(last_user_msg)
+            thought = random.choice(DIRECT_THINK).format(topic=topic)
+            msg["content"] = f"<think>\n{thought}\n</think>\n\n{content}"
+            prev_was_tool_call = False
+
+        else:
+            # Empty assistant message — skip think injection
+            prev_was_tool_call = False
+
+        result.append(msg)
+
+    # Keep every other top-level field of the example intact.
+    return {**ex, "messages": result}
+
+
+def main(input_file: str = "bt7274_v3.jsonl",
+         output_file: str = "bt7274_v3_reformatted.jsonl") -> None:
+    """Reformat a JSONL training file and print modification statistics.
+
+    Reads one JSON example per non-blank line of ``input_file``, runs
+    process_example() on each, and writes the results to ``output_file`` as
+    JSONL. An example that fails to process is reported on stderr, counted
+    under "errors", and written through unchanged (best effort).
+
+    Both files are opened as UTF-8 explicitly: the output is written with
+    ``ensure_ascii=False`` and the think templates contain non-ASCII
+    characters, so relying on the platform default encoding could fail or
+    produce a file that does not round-trip.
+
+    Args:
+        input_file: Path of the source JSONL file.
+        output_file: Path of the JSONL file to write.
+    """
+    with open(input_file, encoding="utf-8") as f:
+        examples = [json.loads(line) for line in f if line.strip()]
+
+    print(f"Loaded {len(examples)} examples from {input_file}")
+
+    stats = {
+        "total": len(examples),
+        "tool_call_msgs_modified": 0,
+        "direct_msgs_modified": 0,
+        "continuation_msgs": 0,
+        "skipped_existing_think": 0,
+        "errors": 0,
+    }
+
+    # A continuation message is recognised by its exact think block rather
+    # than by loose keywords, which could also occur in a task description
+    # or in the assistant's own text.
+    continuation_heads = tuple(
+        f"<think>\n{thought}\n</think>" for thought in CONTINUATION_THINK)
+
+    def count_changes(before_msgs: list, after_msgs: list) -> dict:
+        """Classify each assistant message of one example by modification."""
+        counts = {
+            "tool_call_msgs_modified": 0,
+            "direct_msgs_modified": 0,
+            "continuation_msgs": 0,
+            "skipped_existing_think": 0,
+        }
+        for b, a in zip(before_msgs, after_msgs):
+            if a["role"] != "assistant":
+                continue
+            b_content = b.get("content") or ""
+            a_content = a.get("content") or ""
+            if "<think>" in b_content:
+                counts["skipped_existing_think"] += 1
+            elif "<think>" not in a_content:
+                continue
+            elif not b.get("tool_calls"):
+                counts["direct_msgs_modified"] += 1
+            elif a_content.startswith(continuation_heads):
+                counts["continuation_msgs"] += 1
+            else:
+                counts["tool_call_msgs_modified"] += 1
+        return counts
+
+    results = []
+    for i, ex in enumerate(examples):
+        try:
+            result = process_example(ex)
+            counts = count_changes(ex["messages"], result["messages"])
+        except Exception as e:
+            print(f"  ERROR on example {i}: {e}", file=sys.stderr)
+            stats["errors"] += 1
+            results.append(ex)  # keep original on error
+            continue
+
+        # Merge only after the whole example succeeded, so a failing example
+        # never contributes partial counts.
+        for key, value in counts.items():
+            stats[key] += value
+        results.append(result)
+
+    with open(output_file, "w", encoding="utf-8") as f:
+        for ex in results:
+            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
+
+    print(f"\nWrote {len(results)} examples to {output_file}")
+    print("\nStats:")
+    print(f"  Total examples:           {stats['total']}")
+    print(f"  Tool-call msgs modified:  {stats['tool_call_msgs_modified']}")
+    print(f"  Direct msgs modified:     {stats['direct_msgs_modified']}")
+    print(f"  Continuation msgs:        {stats['continuation_msgs']}")
+    print(f"  Skipped (existing think): {stats['skipped_existing_think']}")
+    print(f"  Errors:                   {stats['errors']}")
+
+if __name__ == "__main__":
+    main()