166 lines
4.9 KiB
Python
166 lines
4.9 KiB
Python
#!/usr/bin/env python3
|
|
"""Merge all v4 dataset sources into bt7274_v4.jsonl.

Sources:
  1. bt7274_v3_reformatted.jsonl — reformatted v3 (582 examples, with <think>)
  2. bt7274_persona.jsonl — BT-7274 persona (183 examples)
  3. core_agent_tools.jsonl — agent tool calls (126 examples)

Output: bt7274_v4.jsonl (deduplicated on user-message content, then shuffled
with a fixed random seed)
"""
|
|
|
|
import json
|
|
import random
|
|
import hashlib
|
|
import sys
|
|
|
|
# Seed the module-level RNG so that random.shuffle() in main() produces the
# same output order on every run.
random.seed(42)

# Input files to merge, as (filename, label) pairs.  Files are loaded in this
# order, and deduplication in main() keeps the first occurrence, so earlier
# entries win over later ones.  The label is used in validation warnings and
# in the per-source count line printed by main().
SOURCES = [
    ("bt7274_v3_reformatted.jsonl", "v3_reformatted"),
    ("bt7274_persona.jsonl", "persona"),
    ("core_agent_tools.jsonl", "agent_tools"),
]
|
|
|
|
|
|
def fingerprint(ex: dict) -> str:
    """Return an MD5 hex digest that identifies an example for deduplication.

    Only the ``content`` of messages whose role is ``"user"`` contributes to
    the hash; system prompts differ between sources and tool results vary, so
    they are deliberately ignored.  The user contents are joined with ``"|"``
    in message order before hashing.

    Malformed messages never raise: a missing ``messages`` key (or a null
    value), non-dict messages and messages without a ``role`` are skipped,
    a null ``content`` hashes as the empty string, and non-string content
    (e.g. a list of content parts) is serialised to canonical JSON first.

    Args:
        ex: One parsed JSONL example, expected to carry a ``messages`` list.

    Returns:
        The 32-character hexadecimal MD5 digest of the joined user content.
    """
    user_parts = []
    for msg in ex.get("messages") or []:
        if not isinstance(msg, dict) or msg.get("role") != "user":
            continue
        content = msg.get("content")
        if content is None:
            # "content": null must not reach str.join(), which rejects None.
            content = ""
        elif not isinstance(content, str):
            # sort_keys makes the digest independent of dict key order.
            content = json.dumps(content, sort_keys=True, ensure_ascii=False)
        user_parts.append(content)

    # MD5 is used purely as a fast content key here, not for security.
    return hashlib.md5("|".join(user_parts).encode()).hexdigest()
|
|
|
|
|
|
def validate_example(ex: dict, source: str, idx: int) -> list[str]:
    """Check the structure of one example and return any warnings.

    The function never raises on malformed input; every problem is reported
    as a warning string prefixed with ``"{source}:{idx}"``.

    Checks performed:
      * ``messages`` exists and is a non-empty list (otherwise this is the
        only warning returned),
      * the first message has role ``"system"``,
      * at least one ``"user"`` and one ``"assistant"`` message are present,
      * every message is a dict with a ``role``,
      * assistant messages that carry ``tool_calls`` also have non-empty
        ``content`` (i.e. a think block).

    Args:
        ex: One parsed JSONL example.
        source: Label of the source file, used as the warning prefix.
        idx: Zero-based index of the example within its source.

    Returns:
        A list of human-readable warning strings; empty when the example
        passes every check.
    """
    warnings = []
    msgs = ex.get("messages") if isinstance(ex, dict) else None
    if not msgs or not isinstance(msgs, list):
        warnings.append(f"{source}:{idx} — no messages array")
        return warnings

    # Resolve each role once with tolerant lookups so that a message missing
    # "role" (or not being a dict at all) is reported instead of raising.
    roles = [m.get("role") if isinstance(m, dict) else None for m in msgs]

    if roles[0] != "system":
        warnings.append(f"{source}:{idx} — first message is not system")
    if "user" not in roles:
        warnings.append(f"{source}:{idx} — no user message")
    if "assistant" not in roles:
        warnings.append(f"{source}:{idx} — no assistant message")

    for j, (msg, role) in enumerate(zip(msgs, roles)):
        if not isinstance(msg, dict):
            warnings.append(f"{source}:{idx}:msg{j} — message is not an object")
            continue
        if role is None:
            warnings.append(f"{source}:{idx}:msg{j} — message has no role")
            continue
        if role == "assistant" and msg.get("tool_calls") and not msg.get("content"):
            # Tool call with no content means the think block is missing.
            warnings.append(
                f"{source}:{idx}:msg{j} — tool_call assistant with no content/think"
            )

    return warnings
|
|
|
|
|
|
def main():
    """Merge every file in SOURCES into bt7274_v4.jsonl and print a report.

    Steps:
      1. Load each source as JSONL (blank lines ignored); missing files are
         reported and skipped.  Every example is validated and the resulting
         warnings are collected.
      2. Drop duplicates by ``fingerprint``; the first occurrence wins, so
         earlier sources take precedence.
      3. Shuffle with the module-level RNG.
      4. Print summary statistics and up to 20 validation warnings.
      5. Write the result to ``bt7274_v4.jsonl`` and, when ``bt7274_v3.jsonl``
         is present, print a size comparison against it.

    All files are read and written as UTF-8 explicitly: the output is
    serialised with ``ensure_ascii=False``, so relying on the platform's
    default encoding could fail on non-UTF-8 locales.
    """
    import os

    all_examples = []
    all_warnings = []

    print("Building BT-7274 v4 Dataset")
    print("=" * 50)

    for filename, label in SOURCES:
        try:
            with open(filename, encoding="utf-8") as f:
                examples = [json.loads(line) for line in f if line.strip()]
        except FileNotFoundError:
            print(f" ⚠ {filename} not found — skipping")
            continue

        for i, ex in enumerate(examples):
            all_warnings.extend(validate_example(ex, label, i))
            # Temporary tag; stripped again before the output is written.
            ex["_source"] = label

        all_examples.extend(examples)
        print(f" {label:<20} {len(examples):>4} examples from {filename}")

    # Deduplicate, keeping the first example seen for each fingerprint.
    seen = set()
    deduped = []
    dupes = 0
    for ex in all_examples:
        fp = fingerprint(ex)
        if fp in seen:
            dupes += 1
            continue
        seen.add(fp)
        deduped.append(ex)

    print(f"\n Duplicates removed: {dupes}")
    print(f" After dedup: {len(deduped)}")

    # Shuffle (deterministic as long as the module-level RNG is seeded).
    random.shuffle(deduped)

    # Remove source tags before writing.
    for ex in deduped:
        ex.pop("_source", None)

    # Stats.  Lookups are tolerant so that examples which validation merely
    # warned about do not abort the run here.
    total_tool_calls = 0
    total_direct = 0
    total_think = 0
    tools_seen = set()

    for ex in deduped:
        has_tc = False
        for m in ex.get("messages") or []:
            if not isinstance(m, dict):
                continue
            tool_calls = m.get("tool_calls")
            if tool_calls:
                has_tc = True
                for tc in tool_calls:
                    total_tool_calls += 1
                    function = tc.get("function") if isinstance(tc, dict) else None
                    name = function.get("name") if isinstance(function, dict) else None
                    if name:
                        tools_seen.add(name)
            content = m.get("content") or ""
            if "<think>" in content and m.get("role") == "assistant":
                total_think += 1
        if not has_tc:
            total_direct += 1

    print(f"\n Total examples: {len(deduped)}")
    print(f" Tool-call examples: {len(deduped) - total_direct}")
    print(f" Direct examples: {total_direct}")
    print(f" Total tool calls: {total_tool_calls}")
    print(f" Unique tools: {len(tools_seen)}")
    print(f" Think blocks: {total_think}")

    if all_warnings:
        print(f"\n ⚠ Warnings ({len(all_warnings)}):")
        for w in all_warnings[:20]:
            print(f" {w}")
        if len(all_warnings) > 20:
            print(f" ... and {len(all_warnings) - 20} more")

    output = "bt7274_v4.jsonl"
    with open(output, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in deduped)

    print(f"\n Wrote {output}")

    # Size comparison.  This is informational only, so a missing or empty v3
    # file must not turn an otherwise successful build into a crash.
    v4_size = os.path.getsize(output) / 1024
    try:
        v3_size = os.path.getsize("bt7274_v3.jsonl") / 1024
    except OSError:
        print(" ⚠ bt7274_v3.jsonl not found — skipping size comparison")
        print(f" v4 size: {v4_size:.0f} KB")
        return

    print(f" v3 size: {v3_size:.0f} KB")
    if v3_size > 0:
        print(f" v4 size: {v4_size:.0f} KB ({v4_size/v3_size:.1f}x)")
    else:
        # Avoid dividing by zero when the v3 file is empty.
        print(f" v4 size: {v4_size:.0f} KB")
|
|
|
|
|
|
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|