feat: bt7274 LoRA v4 — Hermes format, think blocks, 802 examples
This commit is contained in:
+195
@@ -0,0 +1,195 @@
|
||||
"""BT-7274 LoRA v4 — Qwen3.5-27B, bf16 LoRA (NOT QLoRA).
|
||||
|
||||
Key differences from v3 train script:
|
||||
- Uses BASE Qwen3.5 tokenizer (Hermes tool format, NOT Coder XML)
|
||||
- Dataset includes <think> blocks (enable_thinking in template)
|
||||
- Combined dataset: persona + agent tools + reformatted v3
|
||||
- No custom chat_template override — base model template produces
|
||||
Hermes-format tool calls that vLLM's hermes parser can decode
|
||||
|
||||
vLLM serving flags for v4:
|
||||
--tool-call-parser hermes
|
||||
--reasoning-parser deepseek_r1
|
||||
--enable-reasoning (or --enable-thinking via Qwen3 alias)
|
||||
|
||||
Usage:
|
||||
pip install --upgrade unsloth unsloth_zoo
|
||||
python train_v4.py
|
||||
"""
|
||||
|
||||
from unsloth import FastLanguageModel
|
||||
from trl import SFTTrainer, SFTConfig
|
||||
from datasets import load_dataset
|
||||
import torch
|
||||
import json
|
||||
|
||||
# ── Config ───────────────────────────────────────────────────────────
|
||||
MODEL = "Qwen/Qwen3.5-27B"
|
||||
MAX_SEQ = 8192 # bumped from 4096 — multi-turn conversations are longer now
|
||||
RANK = 16
|
||||
ALPHA = 16
|
||||
DATA = "./bt7274_v4.jsonl"
|
||||
OUT = "./bt7274-qwen35-27b-lora-v4"
|
||||
EPOCHS = 3
|
||||
BATCH = 1
|
||||
GRAD_ACCUM = 8
|
||||
LR = 5e-5 # lowered from 1e-4 — larger dataset benefits from gentler lr
|
||||
WARMUP_RATIO = 0.05 # 5% warmup instead of fixed steps
|
||||
|
||||
# ── Load model (bf16, NOT 4-bit) ────────────────────────────────────
|
||||
model, tokenizer = FastLanguageModel.from_pretrained(
|
||||
model_name=MODEL,
|
||||
max_seq_length=MAX_SEQ,
|
||||
load_in_4bit=False, # QLoRA not recommended for Qwen3.5
|
||||
load_in_16bit=True, # bf16 LoRA
|
||||
full_finetuning=False,
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
|
||||
# CRITICAL: Verify we're using the BASE tokenizer, not a LoRA override.
|
||||
# The base Qwen3.5 template produces Hermes-format tool calls:
|
||||
# <tool_call>{"name":"...","arguments":{...}}</tool_call>
|
||||
# NOT the Coder XML format that v3 used.
|
||||
print(f"Chat template source: {tokenizer.chat_template[:80] if tokenizer.chat_template else 'NONE'}...")
|
||||
|
||||
# ── LoRA adapter ────────────────────────────────────────────────────
|
||||
model = FastLanguageModel.get_peft_model(
|
||||
model,
|
||||
r=RANK,
|
||||
lora_alpha=ALPHA,
|
||||
lora_dropout=0,
|
||||
target_modules=[
|
||||
"q_proj", "k_proj", "v_proj", "o_proj",
|
||||
"gate_proj", "up_proj", "down_proj",
|
||||
],
|
||||
bias="none",
|
||||
use_gradient_checkpointing="unsloth",
|
||||
random_state=42,
|
||||
max_seq_length=MAX_SEQ,
|
||||
)
|
||||
|
||||
# ── Dataset ─────────────────────────────────────────────────────────
|
||||
ds = load_dataset("json", data_files=DATA, split="train")
|
||||
|
||||
|
||||
def fix_tool_calls(messages):
|
||||
"""Parse tool_call arguments from JSON strings to dicts for Qwen3.5 template."""
|
||||
fixed = []
|
||||
for msg in messages:
|
||||
msg = dict(msg)
|
||||
if msg.get("tool_calls"):
|
||||
new_tcs = []
|
||||
for tc in msg["tool_calls"]:
|
||||
tc = dict(tc)
|
||||
if "function" in tc:
|
||||
fn = dict(tc["function"])
|
||||
if isinstance(fn.get("arguments"), str):
|
||||
try:
|
||||
fn["arguments"] = json.loads(fn["arguments"])
|
||||
except (ValueError, TypeError):
|
||||
fn["arguments"] = {"raw": fn["arguments"]}
|
||||
tc["function"] = fn
|
||||
new_tcs.append(tc)
|
||||
msg["tool_calls"] = new_tcs
|
||||
fixed.append(msg)
|
||||
return fixed
|
||||
|
||||
|
||||
def to_chatml(ex):
|
||||
"""Apply Qwen3.5 base chat template with thinking enabled."""
|
||||
messages = fix_tool_calls(ex["messages"])
|
||||
try:
|
||||
text = tokenizer.apply_chat_template(
|
||||
messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=False,
|
||||
# Enable thinking mode so <think> blocks are properly formatted
|
||||
enable_thinking=True,
|
||||
)
|
||||
except TypeError:
|
||||
# Fallback if enable_thinking not supported in this template version
|
||||
text = tokenizer.apply_chat_template(
|
||||
messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=False,
|
||||
)
|
||||
return {"text": text}
|
||||
|
||||
|
||||
ds = ds.map(to_chatml)
|
||||
|
||||
# Filter out examples that exceed max sequence length
|
||||
orig_len = len(ds)
|
||||
ds = ds.filter(lambda ex: len(tokenizer.encode(ex["text"])) <= MAX_SEQ)
|
||||
filtered = orig_len - len(ds)
|
||||
if filtered > 0:
|
||||
print(f"⚠ Filtered {filtered} examples exceeding {MAX_SEQ} tokens")
|
||||
|
||||
steps = (len(ds) * EPOCHS) // (BATCH * GRAD_ACCUM)
|
||||
print(f"Dataset: {len(ds)} examples")
|
||||
print(f"Epochs: {EPOCHS}, effective batch: {BATCH * GRAD_ACCUM}")
|
||||
print(f"Estimated steps: {steps}")
|
||||
print(f"LoRA: r={RANK}, alpha={ALPHA}")
|
||||
print(f"Max seq: {MAX_SEQ}")
|
||||
print(f"Model: {MODEL}")
|
||||
print(f"Learning rate: {LR}")
|
||||
print(f"Output: {OUT}")
|
||||
|
||||
# ── Train ───────────────────────────────────────────────────────────
|
||||
trainer = SFTTrainer(
|
||||
model=model,
|
||||
tokenizer=tokenizer,
|
||||
train_dataset=ds,
|
||||
args=SFTConfig(
|
||||
output_dir=OUT,
|
||||
per_device_train_batch_size=BATCH,
|
||||
gradient_accumulation_steps=GRAD_ACCUM,
|
||||
num_train_epochs=EPOCHS,
|
||||
learning_rate=LR,
|
||||
bf16=True,
|
||||
logging_steps=5,
|
||||
save_steps=100,
|
||||
save_total_limit=2,
|
||||
warmup_ratio=WARMUP_RATIO,
|
||||
optim="adamw_8bit",
|
||||
seed=42,
|
||||
report_to="none",
|
||||
max_seq_length=MAX_SEQ,
|
||||
dataset_num_proc=1,
|
||||
lr_scheduler_type="cosine", # cosine decay for smoother convergence
|
||||
weight_decay=0.01, # light regularization
|
||||
),
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# ── Save LoRA adapter ──────────────────────────────────────────────
|
||||
# IMPORTANT: Do NOT save a custom chat_template.
|
||||
# The base Qwen3.5 template is correct for Hermes format.
|
||||
# v3's mistake was saving a Coder XML template with the adapter.
|
||||
model.save_pretrained(OUT)
|
||||
|
||||
# Save tokenizer WITHOUT overriding the chat template
|
||||
# This ensures the base model's template is used at inference time
|
||||
tokenizer.save_pretrained(OUT)
|
||||
|
||||
# Verify no chat_template.jinja was saved (or if it was, it's the base one)
|
||||
import os
|
||||
template_path = os.path.join(OUT, "chat_template.jinja")
|
||||
if os.path.exists(template_path):
|
||||
with open(template_path) as f:
|
||||
content = f.read()
|
||||
if "function=" in content or "<parameter=" in content:
|
||||
print("⚠ WARNING: Saved template contains Coder XML format!")
|
||||
print(" This will cause hermes parser failures at inference.")
|
||||
print(" Delete chat_template.jinja from the adapter directory.")
|
||||
else:
|
||||
print("✓ Saved template uses Hermes JSON format (correct)")
|
||||
else:
|
||||
print("✓ No chat_template.jinja saved — will use base model template")
|
||||
|
||||
print(f"\nSaved LoRA adapter to {OUT}/")
|
||||
print(f"\nDeploy with:")
|
||||
print(f" --lora-modules bt7274={OUT}")
|
||||
print(f" --tool-call-parser hermes")
|
||||
print(f" --reasoning-parser deepseek_r1")
|
||||
Reference in New Issue
Block a user