add substrate v5 training and build scripts
This commit is contained in:
+125
@@ -0,0 +1,125 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Build v5 training dataset from training/*.json files.
|
||||||
|
|
||||||
|
Each file in training/ is a single JSON object with a "messages" array.
|
||||||
|
This script validates, counts, and merges them into a single JSONL file.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python build_v5.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Input directory and output file both live next to this script.
_SCRIPT_DIR = Path(__file__).parent
TRAINING_DIR = _SCRIPT_DIR / "training"
OUTPUT = _SCRIPT_DIR / "substrate_v5.jsonl"
|
||||||
|
|
||||||
|
|
||||||
|
def validate_example(ex: dict, filename: str) -> list[str]:
    """Validate the structure of one training example.

    Checks, in order:
      * ``ex`` is a JSON object holding a non-empty ``"messages"`` list
        (validation stops here when it is not);
      * the first message has role ``"system"``;
      * at least one ``"user"`` and one ``"assistant"`` message exist;
      * at least one assistant message contains ``<think>`` in its content.

    Malformed entries (a message that is not a dict, has no ``"role"`` key,
    or has non-string content) simply fail to match instead of raising, so a
    single bad file produces warnings rather than aborting the caller.

    Args:
        ex: Parsed JSON content of the example file.
        filename: Name used as the prefix of every warning.

    Returns:
        Warning strings; empty when the example passes every check.
    """
    warnings = []
    msgs = ex.get("messages") if isinstance(ex, dict) else None
    if not msgs or not isinstance(msgs, list):
        warnings.append(f"{filename} — no messages array")
        return warnings

    # A missing role (or a non-dict message) is recorded as None so the
    # membership tests below never raise KeyError/TypeError.
    roles = [m.get("role") if isinstance(m, dict) else None for m in msgs]

    if roles[0] != "system":
        warnings.append(f"{filename} — first message is not system")
    if "user" not in roles:
        warnings.append(f"{filename} — no user message")
    if "assistant" not in roles:
        warnings.append(f"{filename} — no assistant message")

    # Check for <think> blocks in assistant messages. Content may be null
    # (e.g. a pure tool-call turn), so only string content is searched.
    has_think = False
    for role, m in zip(roles, msgs):
        if role != "assistant":
            continue
        content = m.get("content")
        if isinstance(content, str) and "<think>" in content:
            has_think = True
            break

    if not has_think:
        warnings.append(f"{filename} — no <think> block in assistant response")

    return warnings
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Merge every ``training/*.json`` example into a single JSONL file.

    For each file in ``TRAINING_DIR`` (sorted by name) the example is parsed,
    passed through ``validate_example`` and, if it carries a usable
    ``"messages"`` list, appended to the output. Per-file and aggregate
    statistics (tool-call vs. direct examples, total tool calls, assistant
    messages containing ``<think>``) are printed, followed by all collected
    warnings. The merged dataset is written to ``OUTPUT``, one JSON object
    per line.

    Files that cannot be parsed, or whose content has no non-empty
    ``"messages"`` list, are reported and left out of the output instead of
    crashing the build or producing rows the trainer cannot read.

    Exits with status 1 when ``TRAINING_DIR`` is missing or has no ``.json``
    files.
    """
    if not TRAINING_DIR.exists():
        print(f"ERROR: {TRAINING_DIR} not found")
        sys.exit(1)

    files = sorted(TRAINING_DIR.glob("*.json"))
    if not files:
        print(f"ERROR: no .json files in {TRAINING_DIR}")
        sys.exit(1)

    print("Building substrate v5 dataset")
    print(f"Source: {TRAINING_DIR}/")
    print(f"Output: {OUTPUT}")
    print("=" * 50)

    examples = []
    all_warnings = []
    tool_call_count = 0
    direct_count = 0
    think_count = 0
    total_tool_calls = 0
    skipped = 0

    for f in files:
        try:
            # Explicit UTF-8: the output is written with ensure_ascii=False,
            # so inputs are expected to contain raw non-ASCII text.
            with open(f, encoding="utf-8") as fh:
                ex = json.load(fh)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f" ERROR: {f.name} — invalid JSON: {e}")
            skipped += 1
            continue

        if not isinstance(ex, dict):
            # Top-level value is a list/string/number: nothing to validate.
            all_warnings.append(f"{f.name} — no messages array")
            skipped += 1
            continue

        all_warnings.extend(validate_example(ex, f.name))

        msgs = ex.get("messages")
        if not msgs or not isinstance(msgs, list):
            # validate_example has already recorded the warning; such an
            # example cannot be counted or trained on, so leave it out.
            skipped += 1
            continue

        # Stats
        has_tc = False
        for m in msgs:
            if not isinstance(m, dict):
                continue
            calls = m.get("tool_calls")
            if calls:
                has_tc = True
                total_tool_calls += len(calls)
            content = m.get("content")
            if (
                m.get("role") == "assistant"
                and isinstance(content, str)
                and "<think>" in content
            ):
                think_count += 1

        if has_tc:
            tool_call_count += 1
        else:
            direct_count += 1

        examples.append(ex)
        print(f" {f.name:<45} {'TC' if has_tc else 'direct':>6} {len(msgs):>2} msgs")

    print(f"\n Total examples: {len(examples)}")
    print(f" Tool-call examples: {tool_call_count}")
    print(f" Direct examples: {direct_count}")
    print(f" Total tool calls: {total_tool_calls}")
    print(f" Think blocks: {think_count}")
    if skipped:
        print(f" Skipped files: {skipped}")

    if all_warnings:
        print(f"\n Warnings ({len(all_warnings)}):")
        for w in all_warnings:
            print(f" {w}")

    # Write JSONL
    with open(OUTPUT, "w", encoding="utf-8") as out:
        for ex in examples:
            out.write(json.dumps(ex, ensure_ascii=False) + "\n")

    size_kb = os.path.getsize(OUTPUT) / 1024
    print(f"\n Wrote {OUTPUT.name} ({size_kb:.0f} KB, {len(examples)} examples)")


if __name__ == "__main__":
    main()
|
||||||
+215
@@ -0,0 +1,215 @@
|
|||||||
|
"""Substrate v5 LoRA — Qwen3.6-27B bf16 on H200.
|
||||||
|
|
||||||
|
Generic MADCAT OS substrate training. NOT persona-specific.
|
||||||
|
40 curated examples: identity, tool categories, disambiguation,
|
||||||
|
framing, EEMS deep-dive. All with <think> blocks.
|
||||||
|
|
||||||
|
bf16 LoRA (NOT QLoRA) — H200 143 GB has room for full precision.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 train_v5.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
os.environ["HF_HOME"] = "/workspace/models"
|
||||||
|
|
||||||
|
from unsloth import FastLanguageModel
|
||||||
|
from trl import SFTTrainer, SFTConfig
|
||||||
|
import torch
|
||||||
|
import json
|
||||||
|
|
||||||
|
# ── Config ───────────────────────────────────────────────────────────
# Paths and model id
MODEL = "Qwen/Qwen3.6-27B"
DATA = "/workspace/substrate_v5.jsonl"
OUT = "/workspace/substrate-qwen36-27b-lora-v5"

# Sequence / dataset limits
MAX_SEQ = 8192
MAX_EXAMPLES = None  # None → every example in DATA is used

# LoRA shape
RANK = 16
ALPHA = 16

# Optimisation schedule
EPOCHS = 3
BATCH = 1
GRAD_ACCUM = 4  # effective batch of 4 (previously 8) because the dataset is small
LR = 5e-5
WARMUP_RATIO = 0.1  # 10% warmup; a longer warmup suits the small dataset

# ── Load model (bf16, NOT 4-bit) ────────────────────────────────────
print(f"Loading {MODEL}...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL,
    max_seq_length=MAX_SEQ,
    dtype=torch.bfloat16,
    # 16-bit weights with adapters only: no 4-bit quantisation, no full
    # fine-tuning.
    load_in_4bit=False,
    load_in_16bit=True,
    full_finetuning=False,
)

# Report which device was picked up and how much memory the weights occupy.
gpu_props = torch.cuda.get_device_properties(0)
print(f"Model loaded: {MODEL}")
print(f" GPU: {torch.cuda.get_device_name(0)}")
print(f" VRAM: {gpu_props.total_memory / 1e9:.1f} GB")
print(f" Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
|
||||||
|
|
||||||
|
# ── LoRA adapter ────────────────────────────────────────────────────
# Adapters are attached to the attention projections and the MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    r=RANK,
    lora_alpha=ALPHA,
    lora_dropout=0,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing="unsloth",
    random_state=42,
    max_seq_length=MAX_SEQ,
)

# Walk the parameters once, then derive both totals from the same snapshot.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total = sum(count for count, _ in param_sizes)
trainable = sum(count for count, needs_grad in param_sizes if needs_grad)
print(f"LoRA: r={RANK}, alpha={ALPHA}")
print(f" Trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Dataset ─────────────────────────────────────────────────────────
|
||||||
|
def fix_tool_calls(messages):
    """Return copies of *messages* with string tool-call arguments decoded.

    For every message with a non-empty ``"tool_calls"`` list, each call that
    has a ``"function"`` entry gets its ``"arguments"`` value parsed with
    ``json.loads`` when that value is a string. A string that is not valid
    JSON is wrapped as ``{"raw": <original string>}``. Arguments that are
    already non-string, and calls without ``"function"``, pass through.

    The input is not modified: messages, calls and function dicts are
    shallow-copied before being changed.
    """

    def _decode_call(call):
        # Copy first so the caller's structure is left untouched.
        call = dict(call)
        if "function" not in call:
            return call
        fn = dict(call["function"])
        raw = fn.get("arguments")
        if isinstance(raw, str):
            try:
                fn["arguments"] = json.loads(raw)
            except (ValueError, TypeError):
                fn["arguments"] = {"raw": raw}
        call["function"] = fn
        return call

    def _decode_message(message):
        message = dict(message)
        if message.get("tool_calls"):
            message["tool_calls"] = [
                _decode_call(call) for call in message["tool_calls"]
            ]
        return message

    return [_decode_message(message) for message in messages]
|
||||||
|
|
||||||
|
|
||||||
|
def load_and_format(path, max_examples=None):
    """Load a JSONL chat dataset and render each row with the chat template.

    Each non-blank line of *path* is parsed as JSON, its ``"messages"`` are
    passed through ``fix_tool_calls`` and rendered to text with the
    module-level ``tokenizer``'s chat template. Rendered examples longer than
    ``MAX_SEQ`` tokens are dropped and reported on stdout.

    Args:
        path: JSONL file with one ``{"messages": [...]}`` object per line.
        max_examples: When truthy, stop once the zero-based line index
            reaches this value (blank and over-length lines count toward
            it). ``None`` or ``0`` reads the whole file.

    Returns:
        A ``datasets.Dataset`` with a single ``"text"`` column.

    Raises:
        json.JSONDecodeError: A non-blank line is not valid JSON.
        KeyError: A row has no ``"messages"`` key.
    """
    from datasets import Dataset

    # Use the wrapped ``.tokenizer`` for length counting when the loaded
    # object exposes one (presumably a processor wrapper — confirm).
    _enc = tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer
    texts = []
    skipped = 0
    # Explicit UTF-8: the dataset builder writes raw non-ASCII characters
    # (ensure_ascii=False), so decoding must not depend on the locale.
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if max_examples and i >= max_examples:
                break
            line = line.strip()
            if not line:
                continue
            row = json.loads(line)
            messages = fix_tool_calls(row["messages"])
            try:
                text = tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=False,
                    enable_thinking=True,
                )
            except TypeError:
                # Retry without enable_thinking for tokenizers that reject
                # the extra keyword.
                text = tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=False,
                )
            tok_len = len(_enc.encode(text))
            if tok_len <= MAX_SEQ:
                texts.append(text)
            else:
                skipped += 1
                print(f" Skipped example {i}: {tok_len} tokens > {MAX_SEQ}")
    if skipped:
        print(f" Filtered {skipped} examples exceeding {MAX_SEQ} tokens")
    return Dataset.from_dict({"text": texts})
|
||||||
|
|
||||||
|
|
||||||
|
print(f"\nLoading dataset: {DATA}")
ds = load_and_format(DATA, max_examples=MAX_EXAMPLES)

# Rough optimiser-step estimate: total samples seen divided by the number of
# samples consumed per optimiser step.
effective_batch = BATCH * GRAD_ACCUM
steps = (len(ds) * EPOCHS) // effective_batch
print(f" Loaded: {len(ds)} examples")
print(f" Epochs: {EPOCHS}, effective batch: {effective_batch}")
print(f" Estimated steps: {steps}")

# ── Train ───────────────────────────────────────────────────────────
print("\nTraining...")
print("=" * 60)

sft_args = SFTConfig(
    output_dir=OUT,
    # Batch / schedule
    per_device_train_batch_size=BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_ratio=WARMUP_RATIO,
    weight_decay=0.01,
    # Plain torch AdamW: adamw_8bit is avoided because of a bitsandbytes
    # cu132 issue.
    optim="adamw_torch",
    bf16=True,
    # Data handling
    max_seq_length=MAX_SEQ,
    dataset_num_proc=1,
    # Logging / checkpoints
    logging_steps=1,
    save_steps=999999,  # step interval set very high: save only at end
    report_to="none",
    seed=42,
)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=ds,
    args=sft_args,
)

result = trainer.train()

peak_vram_gb = torch.cuda.max_memory_allocated() / 1e9
print("=" * 60)
print(f"Training complete — loss: {result.training_loss:.4f}")
print(f" Steps: {result.global_step}")
print(f" Runtime: {result.metrics['train_runtime']:.0f}s")
print(f" Peak VRAM: {peak_vram_gb:.2f} GB")
|
||||||
|
|
||||||
|
# ── Save adapter ────────────────────────────────────────────────────
print(f"\nSaving adapter to {OUT}/")
model.save_pretrained(OUT)
tokenizer.save_pretrained(OUT)

# Sanity check: report the adapter size, or flag that the file is missing.
adapter_path = os.path.join(OUT, "adapter_model.safetensors")
if not os.path.exists(adapter_path):
    print(" ERROR: adapter_model.safetensors not found")
else:
    size_mb = os.path.getsize(adapter_path) / 1e6
    print(f" Adapter: {size_mb:.1f} MB")

# ── Merge LoRA into base weights ────────────────────────────────────
MERGED = "/workspace/substrate-qwen36-27b-merged"
print(f"\nMerging LoRA into base weights → {MERGED}/")
model.save_pretrained_merged(MERGED, tokenizer, save_method="merged_16bit")

# Verify merged model: count the safetensors shards and add up their sizes.
merged_files = [
    name for name in os.listdir(MERGED) if name.endswith(".safetensors")
]
merged_bytes = sum(
    os.path.getsize(os.path.join(MERGED, name)) for name in merged_files
)
merged_size_gb = merged_bytes / 1e9
print(f" Merged: {len(merged_files)} safetensor files, {merged_size_gb:.1f} GB")

# Final summary of the run.
rule = "=" * 60
print(f"\n{rule}")
print("DONE — TRAIN + MERGE")
print(f"{rule}")
print(f"\n Model: {MODEL}")
print(f" Examples: {len(ds)}")
print(f" LoRA: r={RANK}, alpha={ALPHA}")
print(f" Peak VRAM: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
print(f" Adapter: {OUT}/")
print(f" Merged: {MERGED}/")
print("\nNext: python3 quantize_awq.py")
|
||||||
Reference in New Issue
Block a user