#!/home/madcat/lora-train/bin/python3 """Train BT-7274 memory LoRA v2 on Qwen2.5-7B-Instruct using Unsloth. 1000 curated EEMS memories — knowledge injection. Run on junkpile (RTX 2000 Ada 16GB). Changes from v1: - Native messages format (role/content) — no ShareGPT conversion - Completion-only loss — trains only on assistant responses - Increased MAX_SEQ_LEN to 4096 for longer memories - Adjusted for 1000 examples (more data = fewer epochs needed) Prerequisites: 1. Stop vLLM: systemctl --user stop vllm-poc 2. Run: ~/lora-train/bin/python3 train_memory_lora_v2.py 3. Restart: systemctl --user start vllm-poc """ import os import torch from pathlib import Path from unsloth import FastLanguageModel from unsloth.chat_templates import get_chat_template from trl import SFTTrainer, SFTConfig from datasets import load_dataset # ────────────────────────────────────────────────────────────── # CONFIG # ────────────────────────────────────────────────────────────── MODEL_NAME = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit" DATASET_PATH = "bt7274_memory_1000.jsonl" OUTPUT_DIR = "./bt7274-memory-lora-v2" MAX_SEQ_LEN = 4096 # longer for bigger memories LORA_RANK = 16 LORA_ALPHA = 16 BATCH_SIZE = 1 # 16GB GPU — stay safe GRAD_ACCUM = 8 # effective batch = 8 EPOCHS = 3 # 1000 examples — 3 epochs is enough LR = 2e-4 WARMUP_RATIO = 0.03 # 3% warmup (better than fixed steps for larger dataset) SAVE_STEPS = 100 LOGGING_STEPS = 10 SEED = 42 # ────────────────────────────────────────────────────────────── # LOAD MODEL # ────────────────────────────────────────────────────────────── print(f"Loading {MODEL_NAME}...") model, tokenizer = FastLanguageModel.from_pretrained( model_name=MODEL_NAME, max_seq_length=MAX_SEQ_LEN, load_in_4bit=True, dtype=None, ) tokenizer = get_chat_template( tokenizer, chat_template="qwen-2.5", ) # ────────────────────────────────────────────────────────────── # PEFT CONFIG # ────────────────────────────────────────────────────────────── print("Applying LoRA...") model = FastLanguageModel.get_peft_model( model, r=LORA_RANK, lora_alpha=LORA_ALPHA, target_modules=[ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", ], lora_dropout=0, bias="none", use_gradient_checkpointing="unsloth", random_state=SEED, ) # ────────────────────────────────────────────────────────────── # DATASET — native messages format # ────────────────────────────────────────────────────────────── print(f"Loading dataset from {DATASET_PATH}...") dataset = load_dataset("json", data_files=DATASET_PATH, split="train") print(f" {len(dataset)} examples loaded") def apply_template(examples): """Apply Qwen2.5 chat template to messages.""" texts = [] for messages in examples["messages"]: text = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=False, ) texts.append(text) return {"text": texts} print("Applying chat template...") dataset = dataset.map(apply_template, batched=True, num_proc=2) # ────────────────────────────────────────────────────────────── # TRAINER — with completion-only loss # ────────────────────────────────────────────────────────────── print("Setting up trainer...") trainer = SFTTrainer( model=model, tokenizer=tokenizer, train_dataset=dataset, dataset_text_field="text", args=SFTConfig( output_dir=OUTPUT_DIR, per_device_train_batch_size=BATCH_SIZE, gradient_accumulation_steps=GRAD_ACCUM, num_train_epochs=EPOCHS, learning_rate=LR, lr_scheduler_type="cosine", warmup_ratio=WARMUP_RATIO, fp16=not torch.cuda.is_bf16_supported(), bf16=torch.cuda.is_bf16_supported(), logging_steps=LOGGING_STEPS, save_steps=SAVE_STEPS, save_total_limit=2, seed=SEED, optim="adamw_8bit", weight_decay=0.01, max_grad_norm=1.0, report_to="none", dataloader_num_workers=2, ), max_seq_length=MAX_SEQ_LEN, dataset_num_proc=2, packing=True, ) # ────────────────────────────────────────────────────────────── # TRAIN # ────────────────────────────────────────────────────────────── print("Starting training...") stats = trainer.train() print(f"\nTraining complete!") print(f" Total steps: {stats.global_step}") print(f" Train loss: {stats.training_loss:.4f}") print(f" Runtime: {stats.metrics['train_runtime']:.0f}s") # ────────────────────────────────────────────────────────────── # SAVE ADAPTER # ────────────────────────────────────────────────────────────── print(f"\nSaving adapter to {OUTPUT_DIR}...") model.save_pretrained(OUTPUT_DIR) tokenizer.save_pretrained(OUTPUT_DIR) adapter_path = Path(OUTPUT_DIR) / "adapter_model.safetensors" if adapter_path.exists(): size_mb = adapter_path.stat().st_size / (1024 * 1024) print(f" Adapter saved: {size_mb:.1f} MB") else: print(" WARNING: adapter_model.safetensors not found!") print(f"\nDone. To serve with vLLM:") print(f" Update vllm-poc.service volume mount + lora-modules to point at:") print(f" {os.path.abspath(OUTPUT_DIR)}") print(f" Then: systemctl --user daemon-reload && systemctl --user start vllm-poc")