add substrate v5 training and build scripts
This commit is contained in:
+125
@@ -0,0 +1,125 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build v5 training dataset from training/*.json files.
|
||||
|
||||
Each file in training/ is a single JSON object with a "messages" array.
|
||||
This script validates, counts, and merges them into a single JSONL file.
|
||||
|
||||
Usage:
|
||||
python build_v5.py
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Directory holding the per-example *.json source files, located next to this script.
TRAINING_DIR = Path(__file__).parent / "training"
|
||||
# Destination of the merged JSONL dataset, written next to this script.
OUTPUT = Path(__file__).parent / "substrate_v5.jsonl"
|
||||
|
||||
|
||||
def validate_example(ex: dict, filename: str) -> list[str]:
    """Check one training example for structural problems.

    Malformed input (a non-object example, messages that are not objects or
    that lack a "role" key, non-string content) is reported as warnings
    instead of raising, so one bad file cannot abort a whole build.

    Args:
        ex: Parsed JSON of a single example; expected to be an object with a
            non-empty "messages" list of message objects.
        filename: Name prefixed to every warning so it can be traced back.

    Returns:
        A list of human-readable warnings; empty when nothing was flagged.
    """
    msgs = ex.get("messages") if isinstance(ex, dict) else None
    if not msgs or not isinstance(msgs, list):
        return [f"{filename} — no messages array"]

    warnings = []

    # Only JSON objects can carry a role/content; anything else is reported
    # once and then ignored by the remaining checks.
    dict_msgs = [m for m in msgs if isinstance(m, dict)]
    malformed = len(msgs) - len(dict_msgs)
    if malformed:
        warnings.append(f"{filename} — {malformed} message(s) are not objects")

    first = msgs[0]
    if not isinstance(first, dict) or first.get("role") != "system":
        warnings.append(f"{filename} — first message is not system")

    # .get() so that a message without a "role" key counts as "no such role"
    # rather than raising KeyError.
    roles = [m.get("role") for m in dict_msgs]
    if "user" not in roles:
        warnings.append(f"{filename} — no user message")
    if "assistant" not in roles:
        warnings.append(f"{filename} — no assistant message")

    # At least one assistant message must contain a <think> block. Content
    # may be null (e.g. alongside tool calls), hence the str check.
    has_think = any(
        m.get("role") == "assistant"
        and isinstance(m.get("content"), str)
        and "<think>" in m["content"]
        for m in dict_msgs
    )
    if not has_think:
        warnings.append(f"{filename} — no <think> block in assistant response")

    return warnings
|
||||
|
||||
|
||||
def main():
    """Merge every training/*.json example into a single JSONL file.

    Reads each ``*.json`` file in TRAINING_DIR in sorted name order, runs
    validate_example on it, prints one line per file plus summary
    statistics, and writes the accepted examples to OUTPUT, one JSON object
    per line.

    Files that cannot be parsed as JSON, whose top-level value is not an
    object, or that have no usable "messages" list are reported and left out
    of the output. Other validation warnings are printed but do not exclude
    the example.

    Exits with status 1 when TRAINING_DIR is missing or contains no .json
    files.
    """
    if not TRAINING_DIR.exists():
        print(f"ERROR: {TRAINING_DIR} not found")
        sys.exit(1)

    files = sorted(TRAINING_DIR.glob("*.json"))
    if not files:
        print(f"ERROR: no .json files in {TRAINING_DIR}")
        sys.exit(1)

    print("Building substrate v5 dataset")
    print(f"Source: {TRAINING_DIR}/")
    print(f"Output: {OUTPUT}")
    print("=" * 50)

    examples = []
    all_warnings = []
    tool_call_count = 0   # examples containing at least one tool call
    direct_count = 0      # examples with no tool calls at all
    think_count = 0       # assistant messages containing "<think>"
    total_tool_calls = 0  # individual tool calls across all examples
    skipped = 0           # files left out of the output

    for f in files:
        try:
            # Explicit UTF-8: the locale default is not reliable across
            # platforms and the data may contain non-ASCII text.
            with open(f, encoding="utf-8") as fh:
                ex = json.load(fh)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f" ERROR: {f.name} — invalid JSON: {e}")
            skipped += 1
            continue

        if not isinstance(ex, dict):
            print(f" ERROR: {f.name} — top-level JSON value is not an object")
            skipped += 1
            continue

        warnings = validate_example(ex, f.name)
        all_warnings.extend(warnings)

        msgs = ex.get("messages")
        if not msgs or not isinstance(msgs, list):
            # validate_example has already recorded the warning; without a
            # messages list there is nothing to count or to train on.
            print(f" ERROR: {f.name} — no messages array, skipped")
            skipped += 1
            continue

        # Stats
        has_tc = False
        for m in msgs:
            if not isinstance(m, dict):
                continue
            calls = m.get("tool_calls")
            if calls:
                has_tc = True
                total_tool_calls += len(calls)
            content = m.get("content") or ""
            if (
                m.get("role") == "assistant"
                and isinstance(content, str)
                and "<think>" in content
            ):
                think_count += 1

        if has_tc:
            tool_call_count += 1
        else:
            direct_count += 1

        examples.append(ex)
        print(f" {f.name:<45} {'TC' if has_tc else 'direct':>6} {len(msgs):>2} msgs")

    print(f"\n Total examples: {len(examples)}")
    print(f" Tool-call examples: {tool_call_count}")
    print(f" Direct examples: {direct_count}")
    print(f" Total tool calls: {total_tool_calls}")
    print(f" Think blocks: {think_count}")
    if skipped:
        print(f" Skipped files: {skipped}")

    if all_warnings:
        print(f"\n Warnings ({len(all_warnings)}):")
        for w in all_warnings:
            print(f" {w}")

    # Write JSONL. ensure_ascii=False emits raw non-ASCII characters, so the
    # file must be opened as UTF-8 rather than with the locale default.
    with open(OUTPUT, "w", encoding="utf-8") as out:
        for ex in examples:
            out.write(json.dumps(ex, ensure_ascii=False) + "\n")

    size_kb = os.path.getsize(OUTPUT) / 1024
    print(f"\n Wrote {OUTPUT.name} ({size_kb:.0f} KB, {len(examples)} examples)")
|
||||
|
||||
|
||||
# Run the build only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user