#!/usr/bin/env python3
"""Merge all v4 dataset sources into bt7274_v4.jsonl.

Sources:
  1. bt7274_v3_reformatted.jsonl — reformatted v3 (582 examples, with think blocks)
  2. bt7274_persona.jsonl — BT-7274 persona (183 examples)
  3. core_agent_tools.jsonl — agent tool calls (126 examples)

Output: bt7274_v4.jsonl (shuffled, deduplicated)
"""

import hashlib
import json
import os
import random
import sys

# Seed the module-level RNG so the shuffle order is reproducible across runs.
random.seed(42)

# (input file, label used in validation warnings and the per-source summary)
SOURCES = [
    ("bt7274_v3_reformatted.jsonl", "v3_reformatted"),
    ("bt7274_persona.jsonl", "persona"),
    ("core_agent_tools.jsonl", "agent_tools"),
]

OUTPUT_FILE = "bt7274_v4.jsonl"

# Only used for the informational size comparison; it is not a merge input.
V3_BASELINE_FILE = "bt7274_v3.jsonl"

# Marker that identifies a think block inside assistant content.
# NOTE(review): the literal was empty in the previous revision, which made the
# substring test match every message. "<think>" is assumed from the surrounding
# "think block" wording — confirm against the dataset format.
THINK_TAG = "<think>"


def fingerprint(ex: dict) -> str:
    """Create a content hash for deduplication.

    Only user messages are hashed, because system prompts differ between
    sources and tool results vary. As a consequence, all examples without any
    user message share one fingerprint.

    Args:
        ex: One dataset example; expected to carry a "messages" list.

    Returns:
        Hex MD5 digest of the user message contents joined with "|".
    """
    msgs = ex.get("messages")
    if not isinstance(msgs, list):
        msgs = []

    user_parts = []
    for m in msgs:
        # Malformed messages (non-dict, missing role) are ignored, not fatal.
        if not isinstance(m, dict) or m.get("role") != "user":
            continue
        content = m.get("content")
        if content is None:
            content = ""
        elif not isinstance(content, str):
            # Structured content: serialize deterministically so it can be joined.
            content = json.dumps(content, sort_keys=True, ensure_ascii=False)
        user_parts.append(content)

    return hashlib.md5("|".join(user_parts).encode()).hexdigest()


def validate_example(ex: dict, source: str, idx: int) -> list[str]:
    """Validate example structure without raising on malformed input.

    Args:
        ex: The parsed example.
        source: Source label, used as a prefix in each warning.
        idx: Index of the example within its source file.

    Returns:
        List of human-readable warnings; empty when nothing was flagged.
    """
    warnings = []
    msgs = ex.get("messages") if isinstance(ex, dict) else None
    if not msgs or not isinstance(msgs, list):
        warnings.append(f"{source}:{idx} — no messages array")
        return warnings

    # Resolve roles once; a message without a usable role maps to None.
    roles = [m.get("role") if isinstance(m, dict) else None for m in msgs]

    if roles[0] != "system":
        warnings.append(f"{source}:{idx} — first message is not system")
    if "user" not in roles:
        warnings.append(f"{source}:{idx} — no user message")
    if "assistant" not in roles:
        warnings.append(f"{source}:{idx} — no assistant message")

    for j, m in enumerate(msgs):
        if not isinstance(m, dict):
            warnings.append(f"{source}:{idx}:msg{j} — message is not an object")
            continue
        if roles[j] != "assistant":
            continue
        # Tool call with no content (no think block) — warning
        if m.get("tool_calls") and not m.get("content"):
            warnings.append(
                f"{source}:{idx}:msg{j} — tool_call assistant with no content/think"
            )
    return warnings


def _load_jsonl(filename: str) -> list:
    """Parse every non-blank line of a UTF-8 JSONL file, naming the bad line on failure."""
    examples = []
    with open(filename, encoding="utf-8") as f:
        for lineno, line in enumerate(f, 1):
            if not line.strip():
                continue
            try:
                examples.append(json.loads(line))
            except json.JSONDecodeError as err:
                raise ValueError(f"{filename}:{lineno}: invalid JSON ({err})") from err
    return examples


def _collect_stats(examples: list) -> tuple[int, int, int, set]:
    """Return (tool call count, examples without tool calls, think messages, tool names)."""
    total_tool_calls = 0
    total_direct = 0
    total_think = 0
    tools_seen = set()

    for ex in examples:
        has_tc = False
        msgs = ex.get("messages")
        for m in msgs if isinstance(msgs, list) else []:
            if not isinstance(m, dict):
                continue
            tool_calls = m.get("tool_calls")
            if tool_calls:
                has_tc = True
                for tc in tool_calls:
                    total_tool_calls += 1
                    function = tc.get("function") if isinstance(tc, dict) else None
                    name = function.get("name") if isinstance(function, dict) else None
                    if name:
                        tools_seen.add(name)
            content = m.get("content") or ""
            if (
                m.get("role") == "assistant"
                and isinstance(content, str)
                and THINK_TAG in content
            ):
                total_think += 1
        if not has_tc:
            total_direct += 1

    return total_tool_calls, total_direct, total_think, tools_seen


def _print_size_comparison(output: str) -> None:
    """Print the output size and, when the v3 baseline is available, the ratio to it."""
    v4_size = os.path.getsize(output) / 1024
    try:
        v3_size = os.path.getsize(V3_BASELINE_FILE) / 1024
    except OSError:
        # The baseline is optional; its absence must not fail a finished build.
        print(f" v3 size: n/a ({V3_BASELINE_FILE} not found)")
        print(f" v4 size: {v4_size:.0f} KB")
        return

    print(f" v3 size: {v3_size:.0f} KB")
    if v3_size > 0:
        print(f" v4 size: {v4_size:.0f} KB ({v4_size/v3_size:.1f}x)")
    else:
        # An empty baseline has no meaningful ratio.
        print(f" v4 size: {v4_size:.0f} KB")


def main():
    """Load all sources, validate, deduplicate, shuffle, and write the merged dataset.

    Reads the files listed in SOURCES from the current directory (missing
    files are skipped with a notice), prints a summary to stdout, and writes
    OUTPUT_FILE. Validation problems are reported as warnings only; they do
    not remove an example from the output.

    Raises:
        ValueError: If a source file contains a line that is not valid JSON.
    """
    all_examples = []
    all_warnings = []

    print("Building BT-7274 v4 Dataset")
    print("=" * 50)

    for filename, label in SOURCES:
        try:
            examples = _load_jsonl(filename)
        except FileNotFoundError:
            print(f" ⚠ {filename} not found — skipping")
            continue

        for i, ex in enumerate(examples):
            all_warnings.extend(validate_example(ex, label, i))
            # A line that is not a JSON object cannot be fingerprinted or
            # tagged; it has already been reported above, so leave it out.
            if isinstance(ex, dict):
                # Tag source for stats
                ex["_source"] = label
                all_examples.append(ex)

        print(f" {label:<20} {len(examples):>4} examples from {filename}")

    # Deduplicate, keeping the first occurrence of each fingerprint.
    seen = set()
    deduped = []
    for ex in all_examples:
        fp = fingerprint(ex)
        if fp in seen:
            continue
        seen.add(fp)
        deduped.append(ex)
    dupes = len(all_examples) - len(deduped)

    print(f"\n Duplicates removed: {dupes}")
    print(f" After dedup: {len(deduped)}")

    # Shuffle
    random.shuffle(deduped)

    # Remove source tags before writing
    for ex in deduped:
        ex.pop("_source", None)

    # Stats
    total_tool_calls, total_direct, total_think, tools_seen = _collect_stats(deduped)

    print(f"\n Total examples: {len(deduped)}")
    print(f" Tool-call examples: {len(deduped) - total_direct}")
    print(f" Direct examples: {total_direct}")
    print(f" Total tool calls: {total_tool_calls}")
    print(f" Unique tools: {len(tools_seen)}")
    print(f" Think blocks: {total_think}")

    if all_warnings:
        print(f"\n ⚠ Warnings ({len(all_warnings)}):")
        for w in all_warnings[:20]:
            print(f" {w}")
        if len(all_warnings) > 20:
            print(f" ... and {len(all_warnings) - 20} more")

    output = OUTPUT_FILE
    # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII characters, which
    # a locale-default encoding may be unable to represent.
    with open(output, "w", encoding="utf-8") as f:
        for ex in deduped:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")

    print(f"\n Wrote {output}")

    # Size comparison
    _print_size_comparison(output)


if __name__ == "__main__":
    main()