switch quantization from llm-compressor to AutoRound

llm-compressor pins transformers<=4.57.6, which can't load Qwen3.6.
AutoRound (Intel) works with transformers 5.x and is already
installed as an llmcompressor dependency. It produces vLLM-compatible
INT4 output.
This commit is contained in:
marauder-actual
2026-06-01 04:22:07 +02:00
parent 934be8ce48
commit 4edaeeb21b
+21 -23
View File
@@ -1,25 +1,22 @@
"""AWQ quantization via llm-compressor. """INT4 quantization via AutoRound (Intel).
Run in .venv-quant (llmcompressor + transformers 4.57.6). Run in .venv-quant with transformers 5.x.
llm-compressor is pinned to transformers <=4.57.6 and can't load Qwen3.6.
AutoRound works with transformers 5.x and produces vLLM-compatible output.
Uses domain calibration data from substrate_v5.jsonl. Uses domain calibration data from substrate_v5.jsonl.
""" """
import json import json
import torch import torch
from transformers import AutoModelForCausalLM, AutoTokenizer from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor.modifiers.quantization import QuantizationModifier from auto_round import AutoRound
from llmcompressor import oneshot
MODEL = "/workspace/substrate-qwen36-27b-merged" MODEL = "/workspace/substrate-qwen36-27b-merged"
OUTPUT = "/workspace/substrate-qwen36-27b-awq" OUTPUT = "/workspace/substrate-qwen36-27b-int4"
CALIB_DATA = "/workspace/substrate_v5.jsonl" CALIB_DATA = "/workspace/substrate_v5.jsonl"
# Merged model was saved with transformers 5.5.0 which writes
# tokenizer_class=TokenizersBackend — unknown to 4.57.6 in the quant venv.
# Load tokenizer from original repo (has standard class name).
TOKENIZER_SOURCE = "Qwen/Qwen3.6-27B"
print("Loading tokenizer...") print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_SOURCE) tokenizer = AutoTokenizer.from_pretrained(MODEL)
print("Loading model...") print("Loading model...")
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
@@ -29,25 +26,26 @@ model = AutoModelForCausalLM.from_pretrained(
# Domain calibration from training data # Domain calibration from training data
print("Loading calibration data...") print("Loading calibration data...")
with open(CALIB_DATA) as f: with open(CALIB_DATA) as f:
calib = [json.loads(line)["text"] for line in f][:128] calib = [json.loads(line)["text"] for line in f]
print(f"Calibration samples: {len(calib)}") print(f"Calibration samples: {len(calib)}")
recipe = QuantizationModifier( rounder = AutoRound(
targets="Linear",
scheme="W4A16",
ignore=["lm_head"],
)
print("Running AWQ quantization...")
oneshot(
model=model, model=model,
tokenizer=tokenizer, tokenizer=tokenizer,
dataset=calib, dataset=calib,
recipe=recipe, bits=4,
output_dir=OUTPUT, group_size=128,
max_seq_length=2048, seqlen=2048,
num_calibration_samples=len(calib), nsamples=min(128, len(calib)),
iters=200,
format="auto_round",
) )
print("Running AutoRound INT4 quantization...")
rounder.quantize()
print(f"Saving to {OUTPUT}...")
rounder.save_quantized(OUTPUT, format="auto_round")
print(f"Done! Output: {OUTPUT}") print(f"Done! Output: {OUTPUT}")