# lora/build_v5.py

#!/usr/bin/env python3
"""Build v5 training dataset from training/*.json files.

Each file in training/ is a single JSON object with a "messages" array.
This script validates, counts, and merges them into a single JSONL file.

Usage:
    python build_v5.py
"""

import json
import os
import sys
from pathlib import Path

# Input directory: one JSON file per training example, located next to this script.
TRAINING_DIR = Path(__file__).parent.joinpath("training")
# Output file: the merged JSONL dataset, written next to this script.
OUTPUT = Path(__file__).parent.joinpath("substrate_v5.jsonl")


def validate_example(ex: dict, filename: str) -> list[str]:
    """Validate the structure of one training example.

    The checks are advisory: malformed input produces warnings rather than
    exceptions, so a single bad file cannot abort a whole build.

    Args:
        ex: The parsed JSON value of one training file. Expected to be a
            dict with a non-empty "messages" list; any other value is
            reported as having no messages array.
        filename: Name used as the prefix of every warning.

    Returns:
        A list of human-readable warning strings; empty when the example
        passes every check.
    """
    warnings: list[str] = []

    # Tolerate a non-object top-level value (e.g. a JSON array or string).
    msgs = ex.get("messages") if isinstance(ex, dict) else None
    if not msgs or not isinstance(msgs, list):
        warnings.append(f"{filename} — no messages array")
        return warnings

    # Only dict entries can carry a role/content; everything else is
    # reported below instead of raising AttributeError/KeyError.
    dict_msgs = [m for m in msgs if isinstance(m, dict)]

    first = msgs[0]
    first_role = first.get("role") if isinstance(first, dict) else None
    if first_role != "system":
        warnings.append(f"{filename} — first message is not system")

    # A list (not a set) because a malformed role could be unhashable.
    roles = [m.get("role") for m in dict_msgs]
    if "user" not in roles:
        warnings.append(f"{filename} — no user message")
    if "assistant" not in roles:
        warnings.append(f"{filename} — no assistant message")

    # Check for <think> blocks in assistant messages. Content may be null
    # (e.g. on tool-call turns) or non-string, so only strings are searched.
    has_think = any(
        isinstance(m.get("content"), str) and "<think>" in m["content"]
        for m in dict_msgs
        if m.get("role") == "assistant"
    )
    if not has_think:
        warnings.append(f"{filename} — no <think> block in assistant response")

    malformed = sum(
        1 for m in msgs if not isinstance(m, dict) or "role" not in m
    )
    if malformed:
        warnings.append(
            f"{filename} — {malformed} message(s) malformed "
            "(not an object or missing role)"
        )

    return warnings


def main() -> None:
    """Merge every training/*.json example into a single JSONL dataset.

    Reads each ``*.json`` file in TRAINING_DIR (in sorted order), validates
    it with validate_example, prints a per-file line and summary statistics,
    and writes the accepted examples to OUTPUT, one JSON object per line.

    Files that cannot be parsed, whose top-level value is not an object, or
    that have no "messages" list are reported and skipped instead of
    aborting the build. Exits with status 1 when TRAINING_DIR is missing or
    contains no ``.json`` files.
    """
    if not TRAINING_DIR.exists():
        print(f"ERROR: {TRAINING_DIR} not found")
        sys.exit(1)

    files = sorted(TRAINING_DIR.glob("*.json"))
    if not files:
        print(f"ERROR: no .json files in {TRAINING_DIR}")
        sys.exit(1)

    print("Building substrate v5 dataset")
    print(f"Source: {TRAINING_DIR}/")
    print(f"Output: {OUTPUT}")
    print("=" * 50)

    examples = []
    all_warnings = []
    skipped = 0
    tool_call_count = 0
    direct_count = 0
    think_count = 0
    total_tool_calls = 0

    for f in files:
        try:
            # Explicit UTF-8: the platform default encoding is locale
            # dependent and would mis-decode non-ASCII training text.
            with open(f, encoding="utf-8") as fh:
                ex = json.load(fh)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"  ERROR: {f.name} — invalid JSON: {e}")
            skipped += 1
            continue

        if not isinstance(ex, dict):
            print(f"  ERROR: {f.name} — top-level JSON value is not an object")
            skipped += 1
            continue

        all_warnings.extend(validate_example(ex, f.name))

        # validate_example only warns about a missing messages array; such a
        # file has nothing to count or train on, so skip it rather than
        # crash on ex["messages"].
        msgs = ex.get("messages")
        if not isinstance(msgs, list):
            print(f"  ERROR: {f.name} — no messages array, skipped")
            skipped += 1
            continue

        # Stats
        has_tc = False
        for m in msgs:
            if not isinstance(m, dict):
                continue
            calls = m.get("tool_calls")
            if calls:
                has_tc = True
                total_tool_calls += len(calls)
            content = m.get("content")
            if (
                m.get("role") == "assistant"
                and isinstance(content, str)
                and "<think>" in content
            ):
                think_count += 1

        if has_tc:
            tool_call_count += 1
        else:
            direct_count += 1

        examples.append(ex)
        print(f"  {f.name:<45} {'TC' if has_tc else 'direct':>6}  {len(msgs):>2} msgs")

    print(f"\n  Total examples:     {len(examples)}")
    print(f"  Tool-call examples: {tool_call_count}")
    print(f"  Direct examples:    {direct_count}")
    print(f"  Total tool calls:   {total_tool_calls}")
    print(f"  Think blocks:       {think_count}")
    if skipped:
        print(f"  Skipped files:      {skipped}")

    if all_warnings:
        print(f"\n  Warnings ({len(all_warnings)}):")
        for w in all_warnings:
            print(f"    {w}")

    # Write JSONL. ensure_ascii=False emits raw non-ASCII characters, so the
    # file must be opened as UTF-8 explicitly to be portable across locales.
    with open(OUTPUT, "w", encoding="utf-8") as out:
        for ex in examples:
            out.write(json.dumps(ex, ensure_ascii=False) + "\n")

    size_kb = os.path.getsize(OUTPUT) / 1024
    print(f"\n  Wrote {OUTPUT.name} ({size_kb:.0f} KB, {len(examples)} examples)")


# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()