feat: bt7274 LoRA v4 — Hermes format, think blocks, 802 examples

2026-05-26 04:03:38 +02:00
parent 122e73860b
commit 94515e7f6d
7 changed files with 4210 additions and 45 deletions
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+"""Merge all v4 dataset sources into bt7274_v4.jsonl.
+
+Sources:
+  1. bt7274_v3_reformatted.jsonl  — reformatted v3 (582 examples, with <think>)
+  2. bt7274_persona.jsonl         — BT-7274 persona (183 examples)
+  3. core_agent_tools.jsonl       — agent tool calls (126 examples)
+
+Output: bt7274_v4.jsonl (deduplicated on user-message content, then shuffled
+with a fixed seed)
+"""
+
+import json
+import random
+import hashlib
+import sys
+
+random.seed(42)  # fixed seed so the shuffle in main() is reproducible for the same inputs
+
+SOURCES = [  # (input filename, short label used in log lines and warning prefixes)
+    ("bt7274_v3_reformatted.jsonl", "v3_reformatted"),
+    ("bt7274_persona.jsonl",        "persona"),
+    ("core_agent_tools.jsonl",      "agent_tools"),
+]  # files are read in this order; on a duplicate, the earliest occurrence is kept
+
+
+def fingerprint(ex: dict) -> str:
+    """Create a content hash for deduplication."""
+    msgs = ex.get("messages", [])
+    # Hash user messages only (system prompts differ, tool results vary)
+    user_parts = []
+    for m in msgs:
+        if m["role"] == "user":
+            user_parts.append(m.get("content", ""))
+    content = "|".join(user_parts)
+    return hashlib.md5(content.encode()).hexdigest()
+
+
+def validate_example(ex: dict, source: str, idx: int) -> list[str]:
+    """Validate example structure. Returns list of warnings."""
+    warnings = []
+    msgs = ex.get("messages")
+    if not msgs or not isinstance(msgs, list):
+        warnings.append(f"{source}:{idx} — no messages array")
+        return warnings
+
+    if msgs[0].get("role") != "system":
+        warnings.append(f"{source}:{idx} — first message is not system")
+
+    has_user = any(m["role"] == "user" for m in msgs)
+    has_assistant = any(m["role"] == "assistant" for m in msgs)
+
+    if not has_user:
+        warnings.append(f"{source}:{idx} — no user message")
+    if not has_assistant:
+        warnings.append(f"{source}:{idx} — no assistant message")
+
+    for j, m in enumerate(msgs):
+        if m["role"] == "assistant":
+            content = m.get("content") or ""
+            if m.get("tool_calls") and not content:
+                # Tool call with no content (no think block) — warning
+                warnings.append(f"{source}:{idx}:msg{j} — tool_call assistant with no content/think")
+
+    return warnings
+
+
+def main():
+    all_examples = []
+    source_counts = {}
+    all_warnings = []
+
+    print("Building BT-7274 v4 Dataset")
+    print("=" * 50)
+
+    for filename, label in SOURCES:
+        try:
+            with open(filename) as f:
+                examples = [json.loads(line) for line in f if line.strip()]
+        except FileNotFoundError:
+            print(f"  ⚠ {filename} not found — skipping")
+            continue
+
+        for i, ex in enumerate(examples):
+            warns = validate_example(ex, label, i)
+            all_warnings.extend(warns)
+            # Tag source for stats
+            ex["_source"] = label
+
+        source_counts[label] = len(examples)
+        all_examples.extend(examples)
+        print(f"  {label:<20} {len(examples):>4} examples from {filename}")
+
+    # Deduplicate
+    seen = set()
+    deduped = []
+    dupes = 0
+    for ex in all_examples:
+        fp = fingerprint(ex)
+        if fp in seen:
+            dupes += 1
+            continue
+        seen.add(fp)
+        deduped.append(ex)
+
+    print(f"\n  Duplicates removed: {dupes}")
+    print(f"  After dedup:       {len(deduped)}")
+
+    # Shuffle
+    random.shuffle(deduped)
+
+    # Remove source tags before writing
+    for ex in deduped:
+        ex.pop("_source", None)
+
+    # Stats
+    total_tool_calls = 0
+    total_direct = 0
+    total_think = 0
+    tools_seen = set()
+
+    for ex in deduped:
+        has_tc = False
+        for m in ex["messages"]:
+            if m.get("tool_calls"):
+                has_tc = True
+                for tc in m["tool_calls"]:
+                    total_tool_calls += 1
+                    tools_seen.add(tc["function"]["name"])
+            content = m.get("content") or ""
+            if "<think>" in content and m["role"] == "assistant":
+                total_think += 1
+        if not has_tc:
+            total_direct += 1
+
+    print(f"\n  Total examples:    {len(deduped)}")
+    print(f"  Tool-call examples: {len(deduped) - total_direct}")
+    print(f"  Direct examples:   {total_direct}")
+    print(f"  Total tool calls:  {total_tool_calls}")
+    print(f"  Unique tools:      {len(tools_seen)}")
+    print(f"  Think blocks:      {total_think}")
+
+    if all_warnings:
+        print(f"\n  ⚠ Warnings ({len(all_warnings)}):")
+        for w in all_warnings[:20]:
+            print(f"    {w}")
+        if len(all_warnings) > 20:
+            print(f"    ... and {len(all_warnings) - 20} more")
+
+    output = "bt7274_v4.jsonl"
+    with open(output, "w") as f:
+        for ex in deduped:
+            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
+
+    print(f"\n  Wrote {output}")
+
+    # Size comparison
+    import os
+    v3_size = os.path.getsize("bt7274_v3.jsonl") / 1024
+    v4_size = os.path.getsize(output) / 1024
+    print(f"  v3 size: {v3_size:.0f} KB")
+    print(f"  v4 size: {v4_size:.0f} KB ({v4_size/v3_size:.1f}x)")
+
+
+if __name__ == "__main__":  # run the merge only when executed as a script, not on import
+    main()