"""AWQ quantization via llm-compressor. Run in .venv-quant (llmcompressor + transformers 4.57.6). Uses domain calibration data from substrate_v5.jsonl. """ import json import torch from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor.modifiers.quantization import QuantizationModifier from llmcompressor import oneshot MODEL = "/workspace/substrate-qwen36-27b-merged" OUTPUT = "/workspace/substrate-qwen36-27b-awq" CALIB_DATA = "/workspace/substrate_v5.jsonl" # Merged model was saved with transformers 5.5.0 which writes # tokenizer_class=TokenizersBackend — unknown to 4.57.6 in the quant venv. # Load tokenizer from original repo (has standard class name). TOKENIZER_SOURCE = "Qwen/Qwen3.6-27B" print("Loading tokenizer...") tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_SOURCE) print("Loading model...") model = AutoModelForCausalLM.from_pretrained( MODEL, torch_dtype=torch.float16, device_map="auto" ) # Domain calibration from training data print("Loading calibration data...") with open(CALIB_DATA) as f: calib = [json.loads(line)["text"] for line in f][:128] print(f"Calibration samples: {len(calib)}") recipe = QuantizationModifier( targets="Linear", scheme="W4A16", ignore=["lm_head"], ) print("Running AWQ quantization...") oneshot( model=model, tokenizer=tokenizer, dataset=calib, recipe=recipe, output_dir=OUTPUT, max_seq_length=2048, num_calibration_samples=len(calib), ) print(f"Done! Output: {OUTPUT}")