126 lines
3.5 KiB
Python
126 lines
3.5 KiB
Python
#!/usr/bin/env python3
|
|
"""Build v5 training dataset from training/*.json files.
|
|
|
|
Each file in training/ is a single JSON object with a "messages" array.
This script validates, counts, and merges them into a single JSONL file
(substrate_v5.jsonl, written next to this script).

Usage:
    python build_v5.py
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Directory holding the per-example *.json files; sits next to this script.
TRAINING_DIR = Path(__file__).parent.joinpath("training")
# Merged JSONL dataset; written next to this script.
OUTPUT = Path(__file__).parent.joinpath("substrate_v5.jsonl")
|
|
|
|
|
|
def validate_example(ex: dict, filename: str) -> list[str]:
    """Validate the structure of one training example.

    Checks performed:

    * ``ex`` is a JSON object holding a non-empty ``"messages"`` list
      (when this fails it is the only warning returned);
    * every message is itself a JSON object;
    * the first message has role ``"system"``;
    * at least one ``"user"`` and one ``"assistant"`` message exist;
    * at least one assistant message contains a ``<think>`` block.

    Malformed input (non-dict example, non-dict message, message without a
    ``"role"`` key, non-string content) is reported as warnings and never
    raises.

    Args:
        ex: Parsed JSON content of one training file.
        filename: File name used as the prefix of every warning.

    Returns:
        Human-readable warnings; an empty list when the example is valid.
    """
    warnings = []

    msgs = ex.get("messages") if isinstance(ex, dict) else None
    if not msgs or not isinstance(msgs, list):
        warnings.append(f"{filename} — no messages array")
        return warnings

    # Only dict messages can carry a role/content; report anything else
    # instead of crashing on attribute/key access further down.
    dict_msgs = []
    for idx, m in enumerate(msgs):
        if isinstance(m, dict):
            dict_msgs.append(m)
        else:
            warnings.append(f"{filename} — message {idx} is not an object")

    first = msgs[0]
    if not isinstance(first, dict) or first.get("role") != "system":
        warnings.append(f"{filename} — first message is not system")

    has_user = False
    has_assistant = False
    has_think = False
    for m in dict_msgs:
        # .get(): a message without a "role" key simply matches no role.
        role = m.get("role")
        if role == "user":
            has_user = True
        elif role == "assistant":
            has_assistant = True
            # Content may be null (e.g. tool-call-only turns) or non-text.
            content = m.get("content") or ""
            if isinstance(content, str) and "<think>" in content:
                has_think = True

    if not has_user:
        warnings.append(f"{filename} — no user message")
    if not has_assistant:
        warnings.append(f"{filename} — no assistant message")
    if not has_think:
        warnings.append(f"{filename} — no <think> block in assistant response")

    return warnings
|
|
|
|
|
|
def main():
    """Merge every ``*.json`` example in ``TRAINING_DIR`` into ``OUTPUT``.

    For each file (in sorted name order) the example is parsed, passed to
    ``validate_example`` and counted as a tool-call or direct example. A
    per-file line, summary statistics and all collected warnings are
    printed, then the accepted examples are written one JSON object per
    line to ``OUTPUT``.

    Files that cannot be used are reported and skipped rather than aborting
    the build: invalid JSON, non-UTF-8 bytes, a top-level value that is not
    an object, or a missing/empty ``"messages"`` array. Examples that only
    produce other validation warnings are still written.

    Exits with status 1 when ``TRAINING_DIR`` is missing or contains no
    ``.json`` files.
    """
    if not TRAINING_DIR.exists():
        print(f"ERROR: {TRAINING_DIR} not found")
        sys.exit(1)

    files = sorted(TRAINING_DIR.glob("*.json"))
    if not files:
        print(f"ERROR: no .json files in {TRAINING_DIR}")
        sys.exit(1)

    print("Building substrate v5 dataset")
    print(f"Source: {TRAINING_DIR}/")
    print(f"Output: {OUTPUT}")
    print("=" * 50)

    examples = []
    all_warnings = []
    tool_call_count = 0
    direct_count = 0
    think_count = 0
    total_tool_calls = 0

    for f in files:
        try:
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(f, encoding="utf-8") as fh:
                ex = json.load(fh)
        except json.JSONDecodeError as e:
            print(f" ERROR: {f.name} — invalid JSON: {e}")
            continue
        except UnicodeDecodeError as e:
            print(f" ERROR: {f.name} — not valid UTF-8: {e}")
            continue

        if not isinstance(ex, dict):
            print(f" ERROR: {f.name} — top-level JSON value is not an object")
            continue

        all_warnings.extend(validate_example(ex, f.name))

        msgs = ex.get("messages")
        if not msgs or not isinstance(msgs, list):
            # validate_example has already recorded the warning; there is
            # nothing to count or merge, so leave the file out of the output.
            print(f" ERROR: {f.name} — no messages array, skipped")
            continue

        # Stats
        has_tc = False
        for m in msgs:
            if not isinstance(m, dict):
                continue
            tool_calls = m.get("tool_calls")
            if tool_calls:
                has_tc = True
                total_tool_calls += len(tool_calls)
            content = m.get("content") or ""
            if (
                m.get("role") == "assistant"
                and isinstance(content, str)
                and "<think>" in content
            ):
                think_count += 1

        if has_tc:
            tool_call_count += 1
        else:
            direct_count += 1

        examples.append(ex)
        print(f" {f.name:<45} {'TC' if has_tc else 'direct':>6} {len(msgs):>2} msgs")

    print(f"\n Total examples: {len(examples)}")
    print(f" Tool-call examples: {tool_call_count}")
    print(f" Direct examples: {direct_count}")
    print(f" Total tool calls: {total_tool_calls}")
    print(f" Think blocks: {think_count}")

    if all_warnings:
        print(f"\n Warnings ({len(all_warnings)}):")
        for w in all_warnings:
            print(f" {w}")

    # Write JSONL. ensure_ascii=False emits raw non-ASCII characters, so the
    # file encoding must be pinned to UTF-8 instead of the platform default.
    with open(OUTPUT, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(ex, ensure_ascii=False) + "\n" for ex in examples
        )

    size_kb = os.path.getsize(OUTPUT) / 1024
    print(f"\n Wrote {OUTPUT.name} ({size_kb:.0f} KB, {len(examples)} examples)")
|
|
|
|
|
|
# Script entry point: build the dataset only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|