# Viewer header (not part of the script): 25 lines, 656 B, Python
"""Merge LoRA adapter into base model for vLLM serving.

Steps performed when this script runs:

1. Load the 4-bit base checkpoint named by ``MODEL`` through unsloth.
2. Attach the LoRA adapter stored at ``ADAPTER`` with peft.
3. Write the merged model and tokenizer to ``MERGED`` using unsloth's
   ``merged_16bit`` save method.
"""

# NOTE: unsloth stays ahead of peft, matching the original import order;
# unsloth asks to be imported before peft/transformers.
from unsloth import FastLanguageModel
from peft import PeftModel

# Base checkpoint identifier handed to the unsloth loader.
MODEL = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"
# Directory holding the trained LoRA adapter.
ADAPTER = "./bt7274-lora"
# Output directory for the merged weights.
MERGED = "./bt7274-merged"
# Forwarded to the loader as ``max_seq_length``.
MAX_SEQ = 2048

print("Loading base model + adapter...")

# dtype=None defers the dtype choice to unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL,
    max_seq_length=MAX_SEQ,
    dtype=None,
    load_in_4bit=True,
)

# Wrap the freshly loaded base model with the LoRA adapter weights.
model = PeftModel.from_pretrained(model, ADAPTER)

print(f"Merging and saving to {MERGED}/ (16-bit, ~14GB)...")

# The tokenizer is passed along with the model to the merged save call.
model.save_pretrained_merged(
    MERGED,
    tokenizer,
    save_method="merged_16bit",
)

print("Done.")