"""INT4 quantization via AutoRound (Intel). Run in .venv-quant with transformers 5.x. llm-compressor is pinned to transformers <=4.57.6 and can't load Qwen3.6. AutoRound works with transformers 5.x and produces vLLM-compatible output. Uses domain calibration data from substrate_v5.jsonl. """ import json import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound MODEL = "/workspace/substrate-qwen36-27b-merged" OUTPUT = "/workspace/substrate-qwen36-27b-int4" CALIB_DATA = "/workspace/substrate_v5.jsonl" print("Loading tokenizer...") tokenizer = AutoTokenizer.from_pretrained(MODEL) print("Loading model...") model = AutoModelForCausalLM.from_pretrained( MODEL, torch_dtype=torch.float16, device_map="auto" ) # Domain calibration from training data (chat format → raw text) print("Loading calibration data...") with open(CALIB_DATA) as f: calib = [] for line in f: msgs = json.loads(line)["messages"] text = tokenizer.apply_chat_template(msgs, tokenize=False) calib.append(text) print(f"Calibration samples: {len(calib)}") rounder = AutoRound( model=model, tokenizer=tokenizer, dataset=calib, bits=4, group_size=128, seqlen=2048, nsamples=min(128, len(calib)), iters=200, format="auto_round", ) print("Running AutoRound INT4 quantization...") rounder.quantize() print(f"Saving to {OUTPUT}...") rounder.save_quantized(OUTPUT, format="auto_round") print(f"Done! Output: {OUTPUT}")