Files
lora/build_v4.py
T

166 lines
4.9 KiB
Python

#!/usr/bin/env python3
"""Merge all v4 dataset sources into bt7274_v4.jsonl.
Sources:
1. bt7274_v3_reformatted.jsonl — reformatted v3 (582 examples, with <think>)
2. bt7274_persona.jsonl — BT-7274 persona (183 examples)
3. core_agent_tools.jsonl — agent tool calls (126 examples)
Output: bt7274_v4.jsonl (shuffled, deduplicated)
"""
import json
import random
import hashlib
import sys
# Seed the module-level RNG once at import time so that the random.shuffle()
# call in main() yields the same example order on every run.
random.seed(42)

# Input files to merge, as (filename, label) pairs. The label is the short
# name used in validation warnings and in the per-source summary line.
# Files are read in this order.
SOURCES = [
    ("bt7274_v3_reformatted.jsonl", "v3_reformatted"),
    ("bt7274_persona.jsonl", "persona"),
    ("core_agent_tools.jsonl", "agent_tools"),
]
def fingerprint(ex: dict) -> str:
    """Return a content hash of an example for deduplication.

    Only the ``content`` of ``role == "user"`` messages contributes to the
    hash: system prompts differ between sources and tool results vary, so
    including them would hide duplicates. The user contents are joined with
    ``"|"`` and hashed with MD5 (used purely as a fast, non-cryptographic
    fingerprint).

    Malformed input never raises; it simply contributes nothing:

    * a missing, null or non-list ``messages`` value is treated as empty;
    * non-dict entries and entries without a ``role`` are skipped;
    * ``content: null`` is treated as the empty string;
    * non-string content (e.g. a list of content parts) is serialized with
      ``json.dumps(..., sort_keys=True)`` so that key order does not matter.

    Args:
        ex: One parsed JSONL example.

    Returns:
        The 32-character hexadecimal MD5 digest.
    """
    msgs = ex.get("messages")
    if not isinstance(msgs, list):
        # .get("messages", []) would still return None for an explicit null.
        msgs = []

    user_parts = []
    for m in msgs:
        if not isinstance(m, dict) or m.get("role") != "user":
            continue
        content = m.get("content")
        if content is None:
            content = ""
        elif not isinstance(content, str):
            # str.join() only accepts strings; use a canonical JSON form.
            content = json.dumps(content, sort_keys=True)
        user_parts.append(content)

    joined = "|".join(user_parts)
    # usedforsecurity=False: this is a dedup key, not a security primitive,
    # and the flag keeps MD5 usable on FIPS-restricted builds.
    return hashlib.md5(joined.encode(), usedforsecurity=False).hexdigest()
def validate_example(ex: dict, source: str, idx: int) -> list[str]:
    """Check the structure of one example and return any warnings.

    The checks are advisory: nothing is raised and the example is not
    modified. Each warning is prefixed with ``"{source}:{idx}"`` so it can be
    traced back to its origin.

    Checks performed:

    * ``messages`` is a non-empty list (otherwise no further checks run);
    * the first message has role ``"system"``;
    * at least one ``"user"`` and one ``"assistant"`` message exist;
    * every entry of ``messages`` is a dict;
    * an assistant message carrying ``tool_calls`` also has non-empty
      ``content`` (i.e. it is not missing its think block).

    Roles are read with ``.get`` so that a message without a ``role`` key is
    treated as having no role instead of raising ``KeyError``.

    Args:
        ex: One parsed JSONL example.
        source: Label of the file the example came from.
        idx: Index of the example within that file.

    Returns:
        A list of human-readable warning strings; empty when the example
        passes every check.
    """
    warnings = []
    msgs = ex.get("messages")
    if not msgs or not isinstance(msgs, list):
        warnings.append(f"{source}:{idx} — no messages array")
        return warnings

    # None stands in for "not a dict" or "no role key"; it matches no real role.
    roles = [m.get("role") if isinstance(m, dict) else None for m in msgs]

    if roles[0] != "system":
        warnings.append(f"{source}:{idx} — first message is not system")
    if "user" not in roles:
        warnings.append(f"{source}:{idx} — no user message")
    if "assistant" not in roles:
        warnings.append(f"{source}:{idx} — no assistant message")

    for j, m in enumerate(msgs):
        if not isinstance(m, dict):
            warnings.append(f"{source}:{idx}:msg{j} — message is not an object")
            continue
        if m.get("role") != "assistant":
            continue
        content = m.get("content") or ""
        if m.get("tool_calls") and not content:
            # Tool call with no content (no think block) — warning
            warnings.append(f"{source}:{idx}:msg{j} — tool_call assistant with no content/think")
    return warnings
def main():
    """Merge every file in SOURCES into ``bt7274_v4.jsonl`` and print a report.

    Steps, in order:

    1. Load each source file as JSONL (blank lines ignored). A missing file
       is reported and skipped. Every example is passed to
       ``validate_example``; warnings are collected but the example is kept
       regardless.
    2. Deduplicate by ``fingerprint``. Sources are processed in SOURCES
       order and the first example with a given fingerprint wins.
    3. Shuffle with the module-level RNG.
    4. Print summary statistics and up to 20 validation warnings.
    5. Write the result, then print a size comparison against
       ``bt7274_v3.jsonl`` when that file is available.

    All files are read and written as UTF-8 explicitly: the output is
    serialized with ``ensure_ascii=False``, so relying on the locale's
    default encoding could fail or corrupt text on non-UTF-8 systems.

    Raises:
        json.JSONDecodeError: If a non-blank line of a source file is not
            valid JSON (not caught here).
    """
    import os  # only needed for the size comparison at the end

    all_examples = []
    all_warnings = []
    print("Building BT-7274 v4 Dataset")
    print("=" * 50)
    for filename, label in SOURCES:
        try:
            with open(filename, encoding="utf-8") as f:
                examples = [json.loads(line) for line in f if line.strip()]
        except FileNotFoundError:
            print(f"{filename} not found — skipping")
            continue
        for i, ex in enumerate(examples):
            all_warnings.extend(validate_example(ex, label, i))
            # Temporary provenance tag; stripped again before writing.
            ex["_source"] = label
        all_examples.extend(examples)
        print(f" {label:<20} {len(examples):>4} examples from {filename}")

    # Deduplicate, keeping the first occurrence of each fingerprint.
    seen = set()
    deduped = []
    dupes = 0
    for ex in all_examples:
        fp = fingerprint(ex)
        if fp in seen:
            dupes += 1
            continue
        seen.add(fp)
        deduped.append(ex)
    print(f"\n Duplicates removed: {dupes}")
    print(f" After dedup: {len(deduped)}")

    # Shuffle
    random.shuffle(deduped)

    # Remove source tags before writing
    for ex in deduped:
        ex.pop("_source", None)

    # Stats. Examples that only produced validation warnings are still in
    # `deduped`, so every lookup below tolerates missing or malformed fields
    # rather than aborting the build while computing a report.
    total_tool_calls = 0
    total_direct = 0
    total_think = 0
    tools_seen = set()
    for ex in deduped:
        msgs = ex.get("messages")
        if not isinstance(msgs, list):
            msgs = []
        has_tc = False
        for m in msgs:
            if not isinstance(m, dict):
                continue
            tool_calls = m.get("tool_calls")
            if tool_calls:
                has_tc = True
                for tc in tool_calls:
                    total_tool_calls += 1
                    fn = tc.get("function") if isinstance(tc, dict) else None
                    name = fn.get("name") if isinstance(fn, dict) else None
                    if name is not None:
                        tools_seen.add(name)
            content = m.get("content") or ""
            if (
                m.get("role") == "assistant"
                and isinstance(content, str)
                and "<think>" in content
            ):
                total_think += 1
        if not has_tc:
            total_direct += 1

    print(f"\n Total examples: {len(deduped)}")
    print(f" Tool-call examples: {len(deduped) - total_direct}")
    print(f" Direct examples: {total_direct}")
    print(f" Total tool calls: {total_tool_calls}")
    print(f" Unique tools: {len(tools_seen)}")
    print(f" Think blocks: {total_think}")

    if all_warnings:
        print(f"\n ⚠ Warnings ({len(all_warnings)}):")
        for w in all_warnings[:20]:
            print(f" {w}")
        if len(all_warnings) > 20:
            print(f" ... and {len(all_warnings) - 20} more")

    output = "bt7274_v4.jsonl"
    with open(output, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(ex, ensure_ascii=False) + "\n" for ex in deduped
        )
    print(f"\n Wrote {output}")

    # Size comparison. The v3 file is informational only, so its absence (or
    # an empty file, which would divide by zero) must not fail a build whose
    # output has already been written.
    v4_size = os.path.getsize(output) / 1024
    try:
        v3_size = os.path.getsize("bt7274_v3.jsonl") / 1024
    except FileNotFoundError:
        v3_size = None
    if v3_size:
        print(f" v3 size: {v3_size:.0f} KB")
        print(f" v4 size: {v4_size:.0f} KB ({v4_size/v3_size:.1f}x)")
    else:
        print(" v3 size: n/a (bt7274_v3.jsonl missing or empty)")
        print(f" v4 size: {v4_size:.0f} KB")
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()