From 4edaeeb21b92e5f41552388bcd749ecac204d445 Mon Sep 17 00:00:00 2001 From: marauder-actual Date: Mon, 1 Jun 2026 04:22:07 +0200 Subject: [PATCH] switch quantization from llm-compressor to AutoRound llm-compressor pins transformers<=4.57.6, can't load Qwen3.6. AutoRound (Intel) works with transformers 5.x and is already installed as an llmcompressor dependency. Produces vLLM-compatible INT4 output. --- quantize_awq.py | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/quantize_awq.py b/quantize_awq.py index 77032d7..14a18e5 100644 --- a/quantize_awq.py +++ b/quantize_awq.py @@ -1,25 +1,22 @@ -"""AWQ quantization via llm-compressor. +"""INT4 quantization via AutoRound (Intel). -Run in .venv-quant (llmcompressor + transformers 4.57.6). +Run in .venv-quant with transformers 5.x. +llm-compressor is pinned to transformers <=4.57.6 and can't load Qwen3.6. +AutoRound works with transformers 5.x and produces vLLM-compatible output. Uses domain calibration data from substrate_v5.jsonl. """ import json import torch from transformers import AutoModelForCausalLM, AutoTokenizer -from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor import oneshot +from auto_round import AutoRound MODEL = "/workspace/substrate-qwen36-27b-merged" -OUTPUT = "/workspace/substrate-qwen36-27b-awq" +OUTPUT = "/workspace/substrate-qwen36-27b-int4" CALIB_DATA = "/workspace/substrate_v5.jsonl" -# Merged model was saved with transformers 5.5.0 which writes -# tokenizer_class=TokenizersBackend — unknown to 4.57.6 in the quant venv. -# Load tokenizer from original repo (has standard class name). -TOKENIZER_SOURCE = "Qwen/Qwen3.6-27B" print("Loading tokenizer...") -tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_SOURCE) +tokenizer = AutoTokenizer.from_pretrained(MODEL) print("Loading model...") model = AutoModelForCausalLM.from_pretrained( @@ -29,25 +26,26 @@ model = AutoModelForCausalLM.from_pretrained( # Domain calibration from training data print("Loading calibration data...") with open(CALIB_DATA) as f: - calib = [json.loads(line)["text"] for line in f][:128] + calib = [json.loads(line)["text"] for line in f] print(f"Calibration samples: {len(calib)}") -recipe = QuantizationModifier( - targets="Linear", - scheme="W4A16", - ignore=["lm_head"], -) - -print("Running AWQ quantization...") -oneshot( +rounder = AutoRound( model=model, tokenizer=tokenizer, dataset=calib, - recipe=recipe, - output_dir=OUTPUT, - max_seq_length=2048, - num_calibration_samples=len(calib), + bits=4, + group_size=128, + seqlen=2048, + nsamples=min(128, len(calib)), + iters=200, + format="auto_round", ) +print("Running AutoRound INT4 quantization...") +rounder.quantize() + +print(f"Saving to {OUTPUT}...") +rounder.save_quantized(OUTPUT, format="auto_round") + print(f"Done! Output: {OUTPUT}")