switch quantization from llm-compressor to AutoRound
llm-compressor pins transformers<=4.57.6, which can't load Qwen3.6. AutoRound (Intel) works with transformers 5.x and is already installed as an llmcompressor dependency. It produces vLLM-compatible INT4 output.
This commit is contained in:
+21
-23
@@ -1,25 +1,22 @@
|
|||||||
"""AWQ quantization via llm-compressor.
|
"""INT4 quantization via AutoRound (Intel).
|
||||||
|
|
||||||
Run in .venv-quant (llmcompressor + transformers 4.57.6).
|
Run in .venv-quant with transformers 5.x.
|
||||||
|
llm-compressor is pinned to transformers <=4.57.6 and can't load Qwen3.6.
|
||||||
|
AutoRound works with transformers 5.x and produces vLLM-compatible output.
|
||||||
Uses domain calibration data from substrate_v5.jsonl.
|
Uses domain calibration data from substrate_v5.jsonl.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import torch
|
import torch
|
||||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
from llmcompressor.modifiers.quantization import QuantizationModifier
|
from auto_round import AutoRound
|
||||||
from llmcompressor import oneshot
|
|
||||||
|
|
||||||
MODEL = "/workspace/substrate-qwen36-27b-merged"
|
MODEL = "/workspace/substrate-qwen36-27b-merged"
|
||||||
OUTPUT = "/workspace/substrate-qwen36-27b-awq"
|
OUTPUT = "/workspace/substrate-qwen36-27b-int4"
|
||||||
CALIB_DATA = "/workspace/substrate_v5.jsonl"
|
CALIB_DATA = "/workspace/substrate_v5.jsonl"
|
||||||
# Merged model was saved with transformers 5.5.0 which writes
|
|
||||||
# tokenizer_class=TokenizersBackend — unknown to 4.57.6 in the quant venv.
|
|
||||||
# Load tokenizer from original repo (has standard class name).
|
|
||||||
TOKENIZER_SOURCE = "Qwen/Qwen3.6-27B"
|
|
||||||
|
|
||||||
print("Loading tokenizer...")
|
print("Loading tokenizer...")
|
||||||
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_SOURCE)
|
tokenizer = AutoTokenizer.from_pretrained(MODEL)
|
||||||
|
|
||||||
print("Loading model...")
|
print("Loading model...")
|
||||||
model = AutoModelForCausalLM.from_pretrained(
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
@@ -29,25 +26,26 @@ model = AutoModelForCausalLM.from_pretrained(
|
|||||||
# Domain calibration from training data
|
# Domain calibration from training data
|
||||||
print("Loading calibration data...")
|
print("Loading calibration data...")
|
||||||
with open(CALIB_DATA) as f:
|
with open(CALIB_DATA) as f:
|
||||||
calib = [json.loads(line)["text"] for line in f][:128]
|
calib = [json.loads(line)["text"] for line in f]
|
||||||
|
|
||||||
print(f"Calibration samples: {len(calib)}")
|
print(f"Calibration samples: {len(calib)}")
|
||||||
|
|
||||||
recipe = QuantizationModifier(
|
rounder = AutoRound(
|
||||||
targets="Linear",
|
|
||||||
scheme="W4A16",
|
|
||||||
ignore=["lm_head"],
|
|
||||||
)
|
|
||||||
|
|
||||||
print("Running AWQ quantization...")
|
|
||||||
oneshot(
|
|
||||||
model=model,
|
model=model,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
dataset=calib,
|
dataset=calib,
|
||||||
recipe=recipe,
|
bits=4,
|
||||||
output_dir=OUTPUT,
|
group_size=128,
|
||||||
max_seq_length=2048,
|
seqlen=2048,
|
||||||
num_calibration_samples=len(calib),
|
nsamples=min(128, len(calib)),
|
||||||
|
iters=200,
|
||||||
|
format="auto_round",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
print("Running AutoRound INT4 quantization...")
|
||||||
|
rounder.quantize()
|
||||||
|
|
||||||
|
print(f"Saving to {OUTPUT}...")
|
||||||
|
rounder.save_quantized(OUTPUT, format="auto_round")
|
||||||
|
|
||||||
print(f"Done! Output: {OUTPUT}")
|
print(f"Done! Output: {OUTPUT}")
|
||||||
|
|||||||
Reference in New Issue
Block a user