add substrate v5 training and build scripts

2026-06-01 03:52:55 +02:00
parent 0e88c3c2ae
commit 26e776db71
2 changed files with 340 additions and 0 deletions
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+"""Build v5 training dataset from training/*.json files.
+
+Each file in training/ is a single JSON object with a "messages" array.
+This script validates, counts, and merges them into a single JSONL file.
+
+Usage:
+    python build_v5.py
+"""
+
+import json
+import os
+import sys
+from pathlib import Path
+
+# Directory holding the per-example JSON files, located next to this script.
+TRAINING_DIR = Path(__file__).parent / "training"
+# Path of the merged JSONL dataset, written next to this script.
+OUTPUT = Path(__file__).parent / "substrate_v5.jsonl"
+
+
+def validate_example(ex: dict, filename: str) -> list[str]:
+    """Validate the structure of one training example.
+
+    Checks, in order, that ``ex`` has a non-empty "messages" list, that the
+    first message has the "system" role, that at least one "user" and one
+    "assistant" message exist, and that at least one assistant message
+    contains a "<think>" block.
+
+    Malformed input never raises: a non-dict example, non-dict messages,
+    messages without a "role" key, and non-string content are all treated
+    as "not matching" so the problem surfaces as a warning instead.
+
+    Args:
+        ex: Parsed JSON object of one training file.
+        filename: Name used to prefix every warning.
+
+    Returns:
+        Warning strings; empty when the example passes every check.
+    """
+    warnings = []
+    msgs = ex.get("messages") if isinstance(ex, dict) else None
+    if not msgs or not isinstance(msgs, list):
+        warnings.append(f"{filename} — no messages array")
+        return warnings
+
+    # Resolve each role once; entries that are not objects or that lack a
+    # "role" key get None, which matches none of the expected roles.
+    roles = [m.get("role") if isinstance(m, dict) else None for m in msgs]
+
+    if roles[0] != "system":
+        warnings.append(f"{filename} — first message is not system")
+    if "user" not in roles:
+        warnings.append(f"{filename} — no user message")
+    if "assistant" not in roles:
+        warnings.append(f"{filename} — no assistant message")
+
+    # Check for <think> blocks in assistant messages
+    has_think = False
+    for m, role in zip(msgs, roles):
+        if role != "assistant":
+            continue
+        # Content may be null (e.g. tool-call-only turns); only text is searched.
+        content = m.get("content") or ""
+        if isinstance(content, str) and "<think>" in content:
+            has_think = True
+            break
+
+    if not has_think:
+        warnings.append(f"{filename} — no <think> block in assistant response")
+
+    return warnings
+
+
+def _message_stats(messages: list) -> tuple[int, int]:
+    """Return (tool-call count, assistant messages containing "<think>")."""
+    tool_calls = 0
+    think_msgs = 0
+    for m in messages:
+        if not isinstance(m, dict):
+            continue
+        calls = m.get("tool_calls")
+        if calls:
+            tool_calls += len(calls)
+        content = m.get("content") or ""
+        if (
+            m.get("role") == "assistant"
+            and isinstance(content, str)
+            and "<think>" in content
+        ):
+            think_msgs += 1
+    return tool_calls, think_msgs
+
+
+def main():
+    """Merge every training/*.json example into a single JSONL file.
+
+    Prints one line per file, summary counts, and all validation warnings,
+    then writes the collected examples to OUTPUT, one JSON object per line.
+    Files that cannot be decoded as UTF-8 JSON, or whose "messages" value is
+    not a list, are reported and left out of the output instead of aborting
+    the build. Exits with status 1 when the training directory or its .json
+    files are missing.
+    """
+    if not TRAINING_DIR.exists():
+        print(f"ERROR: {TRAINING_DIR} not found")
+        sys.exit(1)
+
+    files = sorted(TRAINING_DIR.glob("*.json"))
+    if not files:
+        print(f"ERROR: no .json files in {TRAINING_DIR}")
+        sys.exit(1)
+
+    print("Building substrate v5 dataset")
+    print(f"Source: {TRAINING_DIR}/")
+    print(f"Output: {OUTPUT}")
+    print("=" * 50)
+
+    examples = []
+    all_warnings = []
+    tool_call_count = 0
+    direct_count = 0
+    think_count = 0
+    total_tool_calls = 0
+
+    for f in files:
+        try:
+            # Explicit UTF-8: the platform default encoding is not reliable
+            # for JSON that may contain non-ASCII text.
+            with open(f, encoding="utf-8") as fh:
+                ex = json.load(fh)
+        except (json.JSONDecodeError, UnicodeDecodeError) as e:
+            print(f"  ERROR: {f.name} — invalid JSON: {e}")
+            continue
+
+        # Without a messages list there is nothing to count or train on;
+        # skip the file rather than crash on the lookups below.
+        messages = ex.get("messages") if isinstance(ex, dict) else None
+        if not isinstance(messages, list):
+            print(f"  ERROR: {f.name} — no messages array, skipped")
+            continue
+
+        all_warnings.extend(validate_example(ex, f.name))
+
+        # Stats
+        file_tool_calls, file_think = _message_stats(messages)
+        has_tc = file_tool_calls > 0
+        total_tool_calls += file_tool_calls
+        think_count += file_think
+
+        if has_tc:
+            tool_call_count += 1
+        else:
+            direct_count += 1
+
+        examples.append(ex)
+        print(f"  {f.name:<45} {'TC' if has_tc else 'direct':>6}  {len(messages):>2} msgs")
+
+    print(f"\n  Total examples:     {len(examples)}")
+    print(f"  Tool-call examples: {tool_call_count}")
+    print(f"  Direct examples:    {direct_count}")
+    print(f"  Total tool calls:   {total_tool_calls}")
+    print(f"  Think blocks:       {think_count}")
+
+    if all_warnings:
+        print(f"\n  Warnings ({len(all_warnings)}):")
+        for w in all_warnings:
+            print(f"    {w}")
+
+    # Write JSONL. ensure_ascii=False emits raw non-ASCII characters, so the
+    # file encoding must be pinned to UTF-8 rather than left to the platform.
+    with open(OUTPUT, "w", encoding="utf-8") as out:
+        for ex in examples:
+            out.write(json.dumps(ex, ensure_ascii=False) + "\n")
+
+    size_kb = os.path.getsize(OUTPUT) / 1024
+    print(f"\n  Wrote {OUTPUT.name} ({size_kb:.0f} KB, {len(examples)} examples)")
+
+
+# Run the build only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()