172 lines
7.0 KiB
Python
172 lines
7.0 KiB
Python
#!/home/madcat/lora-train/bin/python3
|
|
"""Train BT-7274 memory LoRA v2 on Qwen2.5-7B-Instruct using Unsloth.
|
|
|
|
1000 curated EEMS memories — knowledge injection.
|
|
Run on junkpile (RTX 2000 Ada 16GB).
|
|
|
|
Changes from v1:
|
|
- Native messages format (role/content) — no ShareGPT conversion
|
|
- Completion-only loss — trains only on assistant responses
|
|
- Increased MAX_SEQ_LEN to 4096 for longer memories
|
|
- Adjusted for 1000 examples (more data = fewer epochs needed)
|
|
|
|
Prerequisites:
|
|
1. Stop vLLM: systemctl --user stop vllm-poc
|
|
2. Run: ~/lora-train/bin/python3 train_memory_lora_v2.py
|
|
3. Restart: systemctl --user start vllm-poc
|
|
"""
|
|
|
|
import os
|
|
import torch
|
|
from pathlib import Path
|
|
from unsloth import FastLanguageModel
|
|
from unsloth.chat_templates import get_chat_template
|
|
from trl import SFTTrainer, SFTConfig
|
|
from datasets import load_dataset
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# CONFIG
|
|
# ──────────────────────────────────────────────────────────────
|
|
|
|
MODEL_NAME = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"
|
|
DATASET_PATH = "bt7274_memory_1000.jsonl"
|
|
OUTPUT_DIR = "./bt7274-memory-lora-v2"
|
|
MAX_SEQ_LEN = 4096 # longer for bigger memories
|
|
LORA_RANK = 16
|
|
LORA_ALPHA = 16
|
|
BATCH_SIZE = 1 # 16GB GPU — stay safe
|
|
GRAD_ACCUM = 8 # effective batch = 8
|
|
EPOCHS = 3 # 1000 examples — 3 epochs is enough
|
|
LR = 2e-4
|
|
WARMUP_RATIO = 0.03 # 3% warmup (better than fixed steps for larger dataset)
|
|
SAVE_STEPS = 100
|
|
LOGGING_STEPS = 10
|
|
SEED = 42
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# LOAD MODEL
|
|
# ──────────────────────────────────────────────────────────────
|
|
|
|
print(f"Loading {MODEL_NAME}...")
|
|
model, tokenizer = FastLanguageModel.from_pretrained(
|
|
model_name=MODEL_NAME,
|
|
max_seq_length=MAX_SEQ_LEN,
|
|
load_in_4bit=True,
|
|
dtype=None,
|
|
)
|
|
|
|
tokenizer = get_chat_template(
|
|
tokenizer,
|
|
chat_template="qwen-2.5",
|
|
)
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# PEFT CONFIG
|
|
# ──────────────────────────────────────────────────────────────
|
|
|
|
print("Applying LoRA...")
|
|
model = FastLanguageModel.get_peft_model(
|
|
model,
|
|
r=LORA_RANK,
|
|
lora_alpha=LORA_ALPHA,
|
|
target_modules=[
|
|
"q_proj", "k_proj", "v_proj", "o_proj",
|
|
"gate_proj", "up_proj", "down_proj",
|
|
],
|
|
lora_dropout=0,
|
|
bias="none",
|
|
use_gradient_checkpointing="unsloth",
|
|
random_state=SEED,
|
|
)
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# DATASET — native messages format
|
|
# ──────────────────────────────────────────────────────────────
|
|
|
|
print(f"Loading dataset from {DATASET_PATH}...")
|
|
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
|
|
print(f" {len(dataset)} examples loaded")
|
|
|
|
|
|
def apply_template(examples):
|
|
"""Apply Qwen2.5 chat template to messages."""
|
|
texts = []
|
|
for messages in examples["messages"]:
|
|
text = tokenizer.apply_chat_template(
|
|
messages,
|
|
tokenize=False,
|
|
add_generation_prompt=False,
|
|
)
|
|
texts.append(text)
|
|
return {"text": texts}
|
|
|
|
|
|
print("Applying chat template...")
|
|
dataset = dataset.map(apply_template, batched=True, num_proc=2)
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# TRAINER — with completion-only loss
|
|
# ──────────────────────────────────────────────────────────────
|
|
|
|
print("Setting up trainer...")
|
|
trainer = SFTTrainer(
|
|
model=model,
|
|
tokenizer=tokenizer,
|
|
train_dataset=dataset,
|
|
dataset_text_field="text",
|
|
args=SFTConfig(
|
|
output_dir=OUTPUT_DIR,
|
|
per_device_train_batch_size=BATCH_SIZE,
|
|
gradient_accumulation_steps=GRAD_ACCUM,
|
|
num_train_epochs=EPOCHS,
|
|
learning_rate=LR,
|
|
lr_scheduler_type="cosine",
|
|
warmup_ratio=WARMUP_RATIO,
|
|
fp16=not torch.cuda.is_bf16_supported(),
|
|
bf16=torch.cuda.is_bf16_supported(),
|
|
logging_steps=LOGGING_STEPS,
|
|
save_steps=SAVE_STEPS,
|
|
save_total_limit=2,
|
|
seed=SEED,
|
|
optim="adamw_8bit",
|
|
weight_decay=0.01,
|
|
max_grad_norm=1.0,
|
|
report_to="none",
|
|
dataloader_num_workers=2,
|
|
),
|
|
max_seq_length=MAX_SEQ_LEN,
|
|
dataset_num_proc=2,
|
|
packing=True,
|
|
)
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# TRAIN
|
|
# ──────────────────────────────────────────────────────────────
|
|
|
|
print("Starting training...")
|
|
stats = trainer.train()
|
|
print(f"\nTraining complete!")
|
|
print(f" Total steps: {stats.global_step}")
|
|
print(f" Train loss: {stats.training_loss:.4f}")
|
|
print(f" Runtime: {stats.metrics['train_runtime']:.0f}s")
|
|
|
|
# ──────────────────────────────────────────────────────────────
|
|
# SAVE ADAPTER
|
|
# ──────────────────────────────────────────────────────────────
|
|
|
|
print(f"\nSaving adapter to {OUTPUT_DIR}...")
|
|
model.save_pretrained(OUTPUT_DIR)
|
|
tokenizer.save_pretrained(OUTPUT_DIR)
|
|
|
|
adapter_path = Path(OUTPUT_DIR) / "adapter_model.safetensors"
|
|
if adapter_path.exists():
|
|
size_mb = adapter_path.stat().st_size / (1024 * 1024)
|
|
print(f" Adapter saved: {size_mb:.1f} MB")
|
|
else:
|
|
print(" WARNING: adapter_model.safetensors not found!")
|
|
|
|
print(f"\nDone. To serve with vLLM:")
|
|
print(f" Update vllm-poc.service volume mount + lora-modules to point at:")
|
|
print(f" {os.path.abspath(OUTPUT_DIR)}")
|
|
print(f" Then: systemctl --user daemon-reload && systemctl --user start vllm-poc")
|