Files

25 lines
656 B
Python

"""Merge LoRA adapter into base model for vLLM serving."""
# Unsloth asks to be imported before peft/transformers so that its patches are
# applied first; keep this order rather than sorting alphabetically.
from unsloth import FastLanguageModel
from peft import PeftModel

# Base checkpoint the adapter is applied to (a bitsandbytes 4-bit build,
# going by its name and the load_in_4bit flag below).
MODEL = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"
# Location of the LoRA adapter to attach.
ADAPTER = "./bt7274-lora"
# Output directory for the merged weights and tokenizer.
MERGED = "./bt7274-merged"
# Forwarded to the loader as max_seq_length.
MAX_SEQ = 2048

print("Loading base model + adapter...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL,
    max_seq_length=MAX_SEQ,
    load_in_4bit=True,
    dtype=None,  # presumably lets Unsloth choose the compute dtype - TODO confirm
)

# Attach the LoRA adapter to the freshly loaded base model.
# NOTE(review): save_pretrained_merged is not a PEFT method; it is expected to
# come from Unsloth's patching of the loaded model. Unsloth's documented flow
# loads the adapter directory directly via
# FastLanguageModel.from_pretrained(model_name=ADAPTER). Confirm that calling
# save_pretrained_merged through this manually built PeftModel wrapper really
# merges the adapter with the installed unsloth/peft versions.
model = PeftModel.from_pretrained(model, ADAPTER)

print(f"Merging and saving to {MERGED}/ (16-bit, ~14GB)...")
model.save_pretrained_merged(MERGED, tokenizer, save_method="merged_16bit")
print("Done.")