#!/usr/bin/env python3
"""Build v5 training dataset from training/*.json files.

Each file in training/ is a single JSON object with a "messages" array.
This script validates, counts, and merges them into a single JSONL file.

Usage:
    python build_v5.py
"""

import json
import os
import sys
from pathlib import Path

TRAINING_DIR = Path(__file__).parent / "training"
OUTPUT = Path(__file__).parent / "substrate_v5.jsonl"

# Marker that opens a reasoning block inside an assistant message.
# NOTE(review): the reviewed source tested for an empty string here (which is
# always a substring of anything); "<think>" is inferred from the surrounding
# names ("has_think", "Think blocks") — confirm it matches the dataset format.
THINK_TAG = "<think>"


def _get_messages(ex: object) -> list[dict]:
    """Return the example's messages, or [] if they are missing or malformed.

    The result is non-empty only when *ex* is a dict whose "messages" value is
    a non-empty list made up entirely of dicts.
    """
    if not isinstance(ex, dict):
        return []
    msgs = ex.get("messages")
    if not msgs or not isinstance(msgs, list):
        return []
    if not all(isinstance(m, dict) for m in msgs):
        return []
    return msgs


def _content_text(msg: dict) -> str:
    """Return the message's "content" if it is a string, else ""."""
    content = msg.get("content")
    return content if isinstance(content, str) else ""


def _has_think_block(msg: dict) -> bool:
    """Return True if *msg* is an assistant message containing THINK_TAG."""
    return msg.get("role") == "assistant" and THINK_TAG in _content_text(msg)


def validate_example(ex: dict, filename: str) -> list[str]:
    """Validate example structure.

    Args:
        ex: Parsed JSON content of one training file.
        filename: Name used to prefix every warning.

    Returns:
        A list of human-readable warnings; empty when the example has a
        leading system message, at least one user and one assistant message,
        and a think block in some assistant message. A missing or malformed
        "messages" array yields a single warning and stops further checks.
    """
    msgs = _get_messages(ex)
    if not msgs:
        return [f"{filename} — no messages array"]

    warnings = []
    # .get() so a message without a "role" key produces warnings, not KeyError.
    roles = [m.get("role") for m in msgs]
    if roles[0] != "system":
        warnings.append(f"{filename} — first message is not system")
    if "user" not in roles:
        warnings.append(f"{filename} — no user message")
    if "assistant" not in roles:
        warnings.append(f"{filename} — no assistant message")
    if not any(_has_think_block(m) for m in msgs):
        warnings.append(f"{filename} — no {THINK_TAG} block in assistant response")
    return warnings


def _count_stats(msgs: list[dict]) -> tuple[int, int]:
    """Return (number of tool calls, number of assistant think blocks)."""
    tool_calls = 0
    think_blocks = 0
    for m in msgs:
        calls = m.get("tool_calls")
        if calls:
            tool_calls += len(calls)
        if _has_think_block(m):
            think_blocks += 1
    return tool_calls, think_blocks


def main() -> None:
    """Validate every training/*.json file and merge them into OUTPUT.

    Prints a per-file line and summary statistics. Files that are not valid
    JSON, or that have no usable "messages" array, are reported and left out
    of the output. Exits with status 1 if the training directory is missing
    or contains no .json files.
    """
    if not TRAINING_DIR.exists():
        print(f"ERROR: {TRAINING_DIR} not found")
        sys.exit(1)

    files = sorted(TRAINING_DIR.glob("*.json"))
    if not files:
        print(f"ERROR: no .json files in {TRAINING_DIR}")
        sys.exit(1)

    print("Building substrate v5 dataset")
    print(f"Source: {TRAINING_DIR}/")
    print(f"Output: {OUTPUT}")
    print("=" * 50)

    examples = []
    all_warnings = []
    tool_call_count = 0
    direct_count = 0
    think_count = 0
    total_tool_calls = 0

    for f in files:
        try:
            # Explicit UTF-8: the platform default encoding may differ.
            with open(f, encoding="utf-8") as fh:
                ex = json.load(fh)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f" ERROR: {f.name} — invalid JSON: {e}")
            continue

        all_warnings.extend(validate_example(ex, f.name))

        msgs = _get_messages(ex)
        if not msgs:
            # Nothing to count or train on; the warning above records why.
            print(f" ERROR: {f.name} — no usable messages array, skipped")
            continue

        # Stats
        n_calls, n_think = _count_stats(msgs)
        has_tc = n_calls > 0
        total_tool_calls += n_calls
        think_count += n_think
        if has_tc:
            tool_call_count += 1
        else:
            direct_count += 1

        examples.append(ex)
        print(f" {f.name:<45} {'TC' if has_tc else 'direct':>6} {len(msgs):>2} msgs")

    print(f"\n Total examples: {len(examples)}")
    print(f" Tool-call examples: {tool_call_count}")
    print(f" Direct examples: {direct_count}")
    print(f" Total tool calls: {total_tool_calls}")
    print(f" Think blocks: {think_count}")

    if all_warnings:
        print(f"\n Warnings ({len(all_warnings)}):")
        for w in all_warnings:
            print(f" {w}")

    # Write JSONL. ensure_ascii=False emits raw non-ASCII characters, so the
    # file must be opened as UTF-8 rather than with the locale default.
    with open(OUTPUT, "w", encoding="utf-8") as out:
        for ex in examples:
            out.write(json.dumps(ex, ensure_ascii=False) + "\n")

    size_kb = os.path.getsize(OUTPUT) / 1024
    print(f"\n Wrote {OUTPUT.name} ({size_kb:.0f} KB, {len(examples)} examples)")


if __name__ == "__main__":
    main()