Files
lora/build_v5.py
2026-06-01 03:52:55 +02:00

126 lines
3.5 KiB
Python

#!/usr/bin/env python3
"""Build v5 training dataset from training/*.json files.
Each file in training/ is a single JSON object with a "messages" array.
This script validates, counts, and merges them into a single JSONL file.
Usage:
python build_v5.py
"""
import json
import os
import sys
from pathlib import Path
# Both paths are anchored to the directory that contains this script, so the
# build behaves the same regardless of the current working directory.
_SCRIPT_DIR = Path(__file__).parent

# Directory holding one JSON example per file.
TRAINING_DIR = _SCRIPT_DIR.joinpath("training")

# Merged JSONL dataset produced by the build.
OUTPUT = _SCRIPT_DIR.joinpath("substrate_v5.jsonl")
def validate_example(ex: dict, filename: str) -> list[str]:
    """Validate the structure of one training example.

    Checks that ``ex`` carries a non-empty ``"messages"`` list, that the
    first message has the ``system`` role, that at least one ``user`` and one
    ``assistant`` message exist, and that some assistant message contains a
    ``<think>`` block.

    Malformed input (an example that is not a dict, a message that is not a
    dict, a message without a ``"role"`` key) is reported as a warning
    instead of raising, so one bad file cannot abort the whole build.

    Args:
        ex: Parsed JSON content of one training file.
        filename: File name used as the prefix of every warning.

    Returns:
        A list of warning strings; empty when every check passes.
    """
    warnings: list[str] = []

    # A JSON root that is not an object has no "messages" key at all.
    msgs = ex.get("messages") if isinstance(ex, dict) else None
    if not msgs or not isinstance(msgs, list):
        warnings.append(f"{filename} — no messages array")
        return warnings

    # Report entries that are not JSON objects, then ignore them for the
    # role/content checks below (they have no role or content to inspect).
    for idx, m in enumerate(msgs):
        if not isinstance(m, dict):
            warnings.append(f"{filename} — message {idx} is not an object")
    dict_msgs = [m for m in msgs if isinstance(m, dict)]

    first = msgs[0]
    if not isinstance(first, dict) or first.get("role") != "system":
        warnings.append(f"{filename} — first message is not system")

    # .get() so that a message missing its "role" key counts as "no match"
    # rather than raising KeyError.
    has_user = any(m.get("role") == "user" for m in dict_msgs)
    has_assistant = any(m.get("role") == "assistant" for m in dict_msgs)
    if not has_user:
        warnings.append(f"{filename} — no user message")
    if not has_assistant:
        warnings.append(f"{filename} — no assistant message")

    # Check for <think> blocks in assistant messages; a null/missing content
    # is treated as an empty string.
    has_think = any(
        "<think>" in (m.get("content") or "")
        for m in dict_msgs
        if m.get("role") == "assistant"
    )
    if not has_think:
        warnings.append(f"{filename} — no <think> block in assistant response")

    return warnings
def main():
    """Merge every ``training/*.json`` example into a single JSONL dataset.

    Exits with status 1 when ``TRAINING_DIR`` does not exist or contains no
    ``.json`` files. Otherwise each file is parsed, checked with
    ``validate_example``, and counted (tool-call vs. direct examples, total
    tool calls, assistant ``<think>`` blocks). All accepted examples are then
    written to ``OUTPUT``, one JSON object per line.

    Files that cannot be decoded as UTF-8 JSON, or whose ``"messages"`` value
    is not a list of objects, are reported as errors and left out of the
    output instead of aborting the build.
    """
    if not TRAINING_DIR.exists():
        print(f"ERROR: {TRAINING_DIR} not found")
        sys.exit(1)
    files = sorted(TRAINING_DIR.glob("*.json"))
    if not files:
        print(f"ERROR: no .json files in {TRAINING_DIR}")
        sys.exit(1)

    print("Building substrate v5 dataset")
    print(f"Source: {TRAINING_DIR}/")
    print(f"Output: {OUTPUT}")
    print("=" * 50)

    examples = []
    all_warnings = []
    tool_call_count = 0
    direct_count = 0
    think_count = 0
    total_tool_calls = 0
    skipped = 0

    for f in files:
        try:
            # Explicit UTF-8: the locale default is not UTF-8 on every
            # platform, and the dataset may contain non-ASCII text.
            with open(f, encoding="utf-8") as fh:
                ex = json.load(fh)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f" ERROR: {f.name} — invalid JSON: {e}")
            skipped += 1
            continue

        # The statistics below need a list of message objects; anything else
        # cannot be counted or trained on, so report it and move on.
        msgs = ex.get("messages") if isinstance(ex, dict) else None
        if not isinstance(msgs, list) or not all(isinstance(m, dict) for m in msgs):
            print(f" ERROR: {f.name} — no valid messages array, skipped")
            skipped += 1
            continue

        all_warnings.extend(validate_example(ex, f.name))

        # Stats
        has_tc = False
        for m in msgs:
            calls = m.get("tool_calls")
            if calls:
                has_tc = True
                total_tool_calls += len(calls)
            content = m.get("content") or ""
            # .get() so a message without a "role" key is simply not counted.
            if "<think>" in content and m.get("role") == "assistant":
                think_count += 1
        if has_tc:
            tool_call_count += 1
        else:
            direct_count += 1

        examples.append(ex)
        print(f" {f.name:<45} {'TC' if has_tc else 'direct':>6} {len(msgs):>2} msgs")

    print(f"\n Total examples: {len(examples)}")
    print(f" Tool-call examples: {tool_call_count}")
    print(f" Direct examples: {direct_count}")
    print(f" Total tool calls: {total_tool_calls}")
    print(f" Think blocks: {think_count}")
    if skipped:
        print(f" Skipped files: {skipped}")

    if all_warnings:
        print(f"\n Warnings ({len(all_warnings)}):")
        for w in all_warnings:
            print(f" {w}")

    # Write JSONL. ensure_ascii=False emits raw non-ASCII characters, so the
    # file must be opened as UTF-8 explicitly.
    with open(OUTPUT, "w", encoding="utf-8") as out:
        for ex in examples:
            out.write(json.dumps(ex, ensure_ascii=False) + "\n")

    size_kb = os.path.getsize(OUTPUT) / 1024
    print(f"\n Wrote {OUTPUT.name} ({size_kb:.0f} KB, {len(examples)} examples)")
# Run the build only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()