feat: bt7274 LoRA v4 — Hermes format, think blocks, 802 examples
This commit is contained in:
+165
@@ -0,0 +1,165 @@
|
||||
#!/usr/bin/env python3
"""Merge all v4 dataset sources into bt7274_v4.jsonl.

Sources:
  1. bt7274_v3_reformatted.jsonl — reformatted v3 (582 examples, with <think>)
  2. bt7274_persona.jsonl — BT-7274 persona (183 examples)
  3. core_agent_tools.jsonl — agent tool calls (126 examples)

Output: bt7274_v4.jsonl (shuffled, deduplicated)
"""
|
||||
|
||||
import hashlib
import json
import os
import random
import sys
|
||||
|
||||
# Fixed seed so the shuffle applied to the merged dataset is reproducible
# from one run to the next.
random.seed(42)

# (input filename, label) pairs, processed in this order. The label is used in
# the per-source summary line and as the prefix of validation warnings.
# Deduplication keeps the first occurrence, so earlier sources take precedence.
SOURCES = [
    ("bt7274_v3_reformatted.jsonl", "v3_reformatted"),
    ("bt7274_persona.jsonl", "persona"),
    ("core_agent_tools.jsonl", "agent_tools"),
]
|
||||
|
||||
|
||||
def fingerprint(ex: dict) -> str:
    """Create a content hash for deduplication.

    Only the contents of user messages are hashed (system prompts differ
    between sources and tool results vary), so two examples with the same
    user turns get the same fingerprint regardless of their other messages.

    Args:
        ex: One dataset example; its "messages" value is expected to be a
            list of message dicts with "role" and "content" keys.

    Returns:
        A 32-character hexadecimal MD5 digest.
    """
    msgs = ex.get("messages")
    if not isinstance(msgs, list):
        msgs = []

    # Tolerate malformed messages (non-dicts, missing role, null content)
    # instead of raising: validation is reported separately.
    user_parts = [
        m.get("content") or ""
        for m in msgs
        if isinstance(m, dict) and m.get("role") == "user"
    ]

    # Without any user turn every example would hash to the same value and be
    # dropped as a "duplicate"; fall back to the whole conversation instead.
    payload = user_parts or msgs

    # JSON keeps turn boundaries unambiguous (a "|" inside a message cannot be
    # confused with a separator) and also handles non-string content.
    content = json.dumps(payload, ensure_ascii=False, sort_keys=True)

    # MD5 is used purely as a content fingerprint, not for security.
    return hashlib.md5(content.encode("utf-8"), usedforsecurity=False).hexdigest()
|
||||
|
||||
|
||||
def validate_example(ex: dict, source: str, idx: int) -> list[str]:
    """Validate example structure. Returns list of warnings.

    Checks performed, in order:
      * "messages" is a non-empty list (otherwise a single warning is returned
        and no further checks run);
      * the first message has role "system";
      * at least one "user" and one "assistant" message exist;
      * every message is a dict with a "role";
      * an assistant message carrying "tool_calls" also has non-empty content
        (only emptiness is tested; the content is not searched for a
        "<think>" tag).

    Malformed messages produce warnings rather than exceptions.

    Args:
        ex: The parsed example.
        source: Label of the source file, used as the warning prefix.
        idx: Zero-based index of the example within its source.

    Returns:
        Human-readable warning strings; empty when the example looks valid.
    """
    warnings = []
    msgs = ex.get("messages") if isinstance(ex, dict) else None
    if not msgs or not isinstance(msgs, list):
        warnings.append(f"{source}:{idx} — no messages array")
        return warnings

    # Resolve roles once; a non-dict message or a missing role becomes None so
    # the checks below cannot raise on bad input.
    roles = [m.get("role") if isinstance(m, dict) else None for m in msgs]

    if roles[0] != "system":
        warnings.append(f"{source}:{idx} — first message is not system")
    if "user" not in roles:
        warnings.append(f"{source}:{idx} — no user message")
    if "assistant" not in roles:
        warnings.append(f"{source}:{idx} — no assistant message")

    for j, (m, role) in enumerate(zip(msgs, roles)):
        if not isinstance(m, dict):
            warnings.append(f"{source}:{idx}:msg{j} — message is not an object")
            continue
        if role is None:
            warnings.append(f"{source}:{idx}:msg{j} — message has no role")
            continue
        if role == "assistant" and m.get("tool_calls") and not m.get("content"):
            # Tool call with no content (no think block) — warning
            warnings.append(f"{source}:{idx}:msg{j} — tool_call assistant with no content/think")

    return warnings
|
||||
|
||||
|
||||
def _load_examples(filename: str) -> list:
    """Parse one JSONL file into a list of objects, ignoring blank lines.

    Raises FileNotFoundError when the file is absent. A malformed line is
    reported on stderr with its location and the JSON error is re-raised.
    """
    examples = []
    with open(filename, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                examples.append(json.loads(line))
            except json.JSONDecodeError:
                # Say which record is broken before propagating the error.
                print(f" ⚠ {filename}:{lineno} — invalid JSON", file=sys.stderr)
                raise
    return examples


def _dedupe(examples: list) -> tuple[list, int]:
    """Keep the first example per fingerprint; return (unique, removed count)."""
    seen = set()
    unique = []
    for ex in examples:
        fp = fingerprint(ex)
        if fp in seen:
            continue
        seen.add(fp)
        unique.append(ex)
    return unique, len(examples) - len(unique)


def _collect_stats(examples: list) -> dict:
    """Count tool calls, direct (tool-free) examples, think blocks and tool names."""
    stats = {"tool_calls": 0, "direct": 0, "think": 0, "tools": set()}
    for ex in examples:
        has_tc = False
        for m in ex.get("messages") or []:
            if not isinstance(m, dict):
                continue
            calls = m.get("tool_calls") or []
            if calls:
                has_tc = True
            for tc in calls:
                stats["tool_calls"] += 1
                fn = tc.get("function") if isinstance(tc, dict) else None
                name = fn.get("name") if isinstance(fn, dict) else None
                if name:
                    stats["tools"].add(name)
            content = m.get("content") or ""
            if (
                m.get("role") == "assistant"
                and isinstance(content, str)
                and "<think>" in content
            ):
                stats["think"] += 1
        if not has_tc:
            stats["direct"] += 1
    return stats


def main():
    """Merge every file in SOURCES into bt7274_v4.jsonl and print a report.

    Steps: load each source (missing files are skipped with a notice),
    validate each example and collect warnings, drop examples that have no
    usable "messages" list, deduplicate by fingerprint keeping the first
    occurrence, shuffle, print statistics and up to 20 warnings, write the
    result as UTF-8 JSONL, and finally print a size comparison against
    bt7274_v3.jsonl when that file is available.
    """
    all_examples = []
    all_warnings = []

    print("Building BT-7274 v4 Dataset")
    print("=" * 50)

    for filename, label in SOURCES:
        try:
            examples = _load_examples(filename)
        except FileNotFoundError:
            print(f" ⚠ {filename} not found — skipping")
            continue

        usable = []
        for i, ex in enumerate(examples):
            if not isinstance(ex, dict):
                all_warnings.append(f"{label}:{i} — not a JSON object")
                continue
            all_warnings.extend(validate_example(ex, label, i))
            msgs = ex.get("messages")
            # Examples without a message list cannot be trained on and would
            # break the statistics pass, so they are warned about and dropped.
            if isinstance(msgs, list) and msgs:
                usable.append(ex)

        all_examples.extend(usable)
        print(f" {label:<20} {len(examples):>4} examples from {filename}")
        skipped = len(examples) - len(usable)
        if skipped:
            print(f" {label:<20} {skipped:>4} skipped (no usable messages array)")

    # Deduplicate
    deduped, dupes = _dedupe(all_examples)
    print(f"\n Duplicates removed: {dupes}")
    print(f" After dedup: {len(deduped)}")

    # Shuffle (order is reproducible only if the random module was seeded)
    random.shuffle(deduped)

    # The output never carries a "_source" key, even if an input had one.
    for ex in deduped:
        ex.pop("_source", None)

    # Stats
    stats = _collect_stats(deduped)
    print(f"\n Total examples: {len(deduped)}")
    print(f" Tool-call examples: {len(deduped) - stats['direct']}")
    print(f" Direct examples: {stats['direct']}")
    print(f" Total tool calls: {stats['tool_calls']}")
    print(f" Unique tools: {len(stats['tools'])}")
    print(f" Think blocks: {stats['think']}")

    if all_warnings:
        print(f"\n ⚠ Warnings ({len(all_warnings)}):")
        for w in all_warnings[:20]:
            print(f" {w}")
        if len(all_warnings) > 20:
            print(f" ... and {len(all_warnings) - 20} more")

    output = "bt7274_v4.jsonl"
    # ensure_ascii=False emits raw non-ASCII text, so the encoding must be
    # explicit rather than whatever the platform default happens to be.
    with open(output, "w", encoding="utf-8") as f:
        for ex in deduped:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")

    print(f"\n Wrote {output}")

    # Size comparison: purely informational, so a missing or empty v3 file
    # must not fail the run after the output has been written.
    v4_size = os.path.getsize(output) / 1024
    try:
        v3_size = os.path.getsize("bt7274_v3.jsonl") / 1024
    except OSError:
        v3_size = None

    if v3_size is not None:
        print(f" v3 size: {v3_size:.0f} KB")
    if v3_size:
        print(f" v4 size: {v4_size:.0f} KB ({v4_size/v3_size:.1f}x)")
    else:
        print(f" v4 size: {v4_size:.0f} KB")
|
||||
|
||||
|
||||
# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user