# lora/build_v4.py

#!/usr/bin/env python3
"""Merge all v4 dataset sources into bt7274_v4.jsonl.

Sources:
  1. bt7274_v3_reformatted.jsonl  — reformatted v3 (582 examples, with <think>)
  2. bt7274_persona.jsonl         — BT-7274 persona (183 examples)
  3. core_agent_tools.jsonl       — agent tool calls (126 examples)

Output: bt7274_v4.jsonl (shuffled, deduplicated)
"""

import json
import random
import hashlib
import sys

# Fixed seed so the shuffle of the merged dataset is reproducible between runs.
random.seed(42)

# (input JSONL filename, short source label) pairs, processed in this order.
# Order matters for deduplication: the first example seen with a given
# fingerprint is the one that is kept.
SOURCES = [
    ("bt7274_v3_reformatted.jsonl", "v3_reformatted"),
    ("bt7274_persona.jsonl",        "persona"),
    ("core_agent_tools.jsonl",      "agent_tools"),
]


def fingerprint(ex: dict) -> str:
    """Return a content hash of an example, used for deduplication.

    Only the ``content`` of ``user`` messages contributes to the hash:
    system prompts differ between sources and tool results vary, so two
    examples asking the same user question(s) are treated as duplicates.

    Malformed input is tolerated rather than raising, so that one bad
    record cannot abort the whole build: a missing or null ``messages``
    value, non-dict messages, and messages without a ``role`` are ignored;
    a null ``content`` counts as an empty string; non-string content
    (e.g. a list of parts) is serialized deterministically as JSON.

    Args:
        ex: One dataset example, normally ``{"messages": [...]}``.

    Returns:
        Hex MD5 digest of the user message contents joined with ``|``.
        Examples with no user messages all share the digest of ``""``.
    """
    user_parts = []
    # `or []` also covers an explicit `"messages": null` in the JSON.
    for m in ex.get("messages") or []:
        if not isinstance(m, dict) or m.get("role") != "user":
            continue
        content = m.get("content")
        if content is None:
            content = ""
        elif not isinstance(content, str):
            # sort_keys keeps the serialization stable across dict orderings.
            content = json.dumps(content, sort_keys=True, ensure_ascii=False)
        user_parts.append(content)
    # MD5 is used only as a fast non-cryptographic content key.
    return hashlib.md5("|".join(user_parts).encode("utf-8")).hexdigest()


def validate_example(ex: dict, source: str, idx: int) -> list[str]:
    """Check the structure of one example and describe any problems.

    The function never raises on malformed messages; structural problems
    (non-dict messages, messages without a ``role``) are reported as
    warnings like every other finding.

    Args:
        ex: One dataset example, expected to hold a non-empty ``messages``
            list of message dicts.
        source: Label of the source file, used as a prefix in warnings.
        idx: Zero-based index of the example within its source.

    Returns:
        A list of human-readable warning strings; empty when the example
        starts with a system message, contains at least one user and one
        assistant message, and every tool-calling assistant message also
        carries content.
    """
    warnings = []
    msgs = ex.get("messages")
    if not msgs or not isinstance(msgs, list):
        warnings.append(f"{source}:{idx} — no messages array")
        return warnings

    # Resolve each role once; None marks a non-dict message or a missing role.
    roles = [m.get("role") if isinstance(m, dict) else None for m in msgs]

    if roles[0] != "system":
        warnings.append(f"{source}:{idx} — first message is not system")
    if "user" not in roles:
        warnings.append(f"{source}:{idx} — no user message")
    if "assistant" not in roles:
        warnings.append(f"{source}:{idx} — no assistant message")

    for j, (m, role) in enumerate(zip(msgs, roles)):
        if not isinstance(m, dict):
            warnings.append(f"{source}:{idx}:msg{j} — message is not an object")
        elif role is None:
            warnings.append(f"{source}:{idx}:msg{j} — message has no role")
        elif role == "assistant" and m.get("tool_calls") and not m.get("content"):
            # Tool call with no content (no think block) — warning
            warnings.append(f"{source}:{idx}:msg{j} — tool_call assistant with no content/think")

    return warnings


def _load_jsonl(filename: str) -> list:
    """Parse every non-blank line of a UTF-8 JSONL file into a Python object."""
    # Explicit UTF-8: the datasets contain non-ASCII text and the platform
    # default encoding is not UTF-8 everywhere.
    with open(filename, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def _dataset_stats(examples: list) -> dict:
    """Count tool calls, direct (tool-free) examples, think blocks and tool names.

    Missing ``messages``, ``role`` or tool ``function``/``name`` fields are
    tolerated so that examples which only produced validation warnings do
    not crash the statistics pass.
    """
    total_tool_calls = 0
    total_direct = 0
    total_think = 0
    tools_seen = set()

    for ex in examples:
        has_tc = False
        for m in ex.get("messages") or []:
            tool_calls = m.get("tool_calls")
            if tool_calls:
                has_tc = True
                total_tool_calls += len(tool_calls)
                for tc in tool_calls:
                    name = (tc.get("function") or {}).get("name")
                    if name is not None:
                        tools_seen.add(name)
            content = m.get("content") or ""
            if m.get("role") == "assistant" and "<think>" in content:
                total_think += 1
        if not has_tc:
            total_direct += 1

    return {
        "tool_calls": total_tool_calls,
        "direct": total_direct,
        "think": total_think,
        "tools": tools_seen,
    }


def main():
    """Merge all SOURCES into bt7274_v4.jsonl and print a build report.

    Steps: load and validate every source file (missing files are skipped
    with a notice), drop examples whose fingerprint was already seen (the
    first occurrence wins), shuffle with the module-level seeded RNG, print
    statistics and up to 20 validation warnings, write the result as UTF-8
    JSONL, and finally compare its size with bt7274_v3.jsonl when that file
    is available.
    """
    import os

    all_examples = []
    source_counts = {}
    all_warnings = []

    print("Building BT-7274 v4 Dataset")
    print("=" * 50)

    for filename, label in SOURCES:
        try:
            examples = _load_jsonl(filename)
        except FileNotFoundError:
            print(f"  ⚠ {filename} not found — skipping")
            continue

        for i, ex in enumerate(examples):
            all_warnings.extend(validate_example(ex, label, i))
            # Tag source for stats
            ex["_source"] = label

        source_counts[label] = len(examples)
        all_examples.extend(examples)
        print(f"  {label:<20} {len(examples):>4} examples from {filename}")

    # Deduplicate: keep the first example seen for each fingerprint.
    seen = set()
    deduped = []
    for ex in all_examples:
        fp = fingerprint(ex)
        if fp in seen:
            continue
        seen.add(fp)
        deduped.append(ex)
    dupes = len(all_examples) - len(deduped)

    print(f"\n  Duplicates removed: {dupes}")
    print(f"  After dedup:       {len(deduped)}")

    # Per-source survivors; also strips the temporary tag so it never
    # reaches the output file.
    kept_by_source = dict.fromkeys(source_counts, 0)
    for ex in deduped:
        label = ex.pop("_source", None)
        if label in kept_by_source:
            kept_by_source[label] += 1
    for label, kept in kept_by_source.items():
        print(f"    {label:<20} {kept:>4} of {source_counts[label]:>4} kept")

    # Shuffle
    random.shuffle(deduped)

    # Stats
    stats = _dataset_stats(deduped)

    print(f"\n  Total examples:    {len(deduped)}")
    print(f"  Tool-call examples: {len(deduped) - stats['direct']}")
    print(f"  Direct examples:   {stats['direct']}")
    print(f"  Total tool calls:  {stats['tool_calls']}")
    print(f"  Unique tools:      {len(stats['tools'])}")
    print(f"  Think blocks:      {stats['think']}")

    if all_warnings:
        print(f"\n  ⚠ Warnings ({len(all_warnings)}):")
        for w in all_warnings[:20]:
            print(f"    {w}")
        if len(all_warnings) > 20:
            print(f"    ... and {len(all_warnings) - 20} more")

    output = "bt7274_v4.jsonl"
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly.
    with open(output, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in deduped)

    print(f"\n  Wrote {output}")

    # Size comparison. The v3 file is not one of the build inputs, so its
    # absence must not fail a build that has already succeeded.
    v4_size = os.path.getsize(output) / 1024
    try:
        v3_size = os.path.getsize("bt7274_v3.jsonl") / 1024
    except OSError:
        print(f"  v4 size: {v4_size:.0f} KB (bt7274_v3.jsonl not available for comparison)")
        return

    print(f"  v3 size: {v3_size:.0f} KB")
    if v3_size > 0:
        print(f"  v4 size: {v4_size:.0f} KB ({v4_size/v3_size:.1f}x)")
    else:
        # Empty v3 file: a ratio would divide by zero.
        print(f"  v4 size: {v4_size:.0f} KB")


# Run the build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()