"""AWQ quantization via llm-compressor.

Run in .venv-quant (llmcompressor + transformers 4.57.6).
Uses domain calibration data from substrate_v5.jsonl.

The script reads up to ``NUM_CALIBRATION_SAMPLES`` records from the JSONL
calibration file, loads the merged fp16 model, and runs a one-shot AWQ pass
that writes the W4A16 checkpoint to ``OUTPUT``.
"""

import json
from itertools import islice

MODEL = "/workspace/substrate-qwen36-27b-merged"
OUTPUT = "/workspace/substrate-qwen36-27b-awq"
CALIB_DATA = "/workspace/substrate_v5.jsonl"

# Calibration budget: number of JSONL records used and the per-sample
# sequence-length cap handed to oneshot().
NUM_CALIBRATION_SAMPLES = 128
MAX_SEQ_LENGTH = 2048


def load_calibration_texts(path: str, limit: int = NUM_CALIBRATION_SAMPLES) -> list[str]:
    """Return the ``"text"`` field of the first *limit* records in a JSONL file.

    Blank lines are skipped, and reading stops as soon as *limit* records
    have been collected, so the rest of a large file is never parsed.

    Args:
        path: Path to a UTF-8 JSONL file with one JSON object per line.
        limit: Maximum number of records to return.

    Returns:
        The ``"text"`` values in file order (at most *limit* of them).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a consumed line is not valid JSON.
        KeyError: If a consumed record has no ``"text"`` key.
    """
    with open(path, encoding="utf-8") as f:
        # Lazy pipeline: only the lines that islice() actually pulls get parsed.
        records = (json.loads(line) for line in f if line.strip())
        return [record["text"] for record in islice(records, limit)]


def main() -> None:
    """Quantize ``MODEL`` to W4A16 with AWQ and save the result to ``OUTPUT``."""
    # The GPU stack is imported here rather than at module top so that the
    # calibration loader above can be imported (and tested) without it.
    import torch
    from llmcompressor import oneshot
    from llmcompressor.modifiers.awq import AWQModifier
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Domain calibration from training data. Loaded *before* the model so a
    # missing or malformed file fails fast instead of after a 27B model load.
    print("Loading calibration data...")
    calib = load_calibration_texts(CALIB_DATA)
    if not calib:
        raise ValueError(f"No calibration samples found in {CALIB_DATA}")
    print(f"Calibration samples: {len(calib)}")

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL)

    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL, torch_dtype=torch.float16, device_map="auto"
    )

    # AWQModifier performs the activation-aware scaling that makes this AWQ;
    # QuantizationModifier alone is plain round-to-nearest and would not use
    # the calibration data for weight-only W4A16.
    # NOTE(review): AWQ needs layer mappings for the model architecture --
    # confirm llm-compressor resolves them for this model, or pass `mappings=`.
    recipe = AWQModifier(
        targets=["Linear"],
        scheme="W4A16",
        ignore=["lm_head"],
    )

    print("Running AWQ quantization...")
    # NOTE(review): oneshot() documents `dataset` as a registered dataset name
    # or a `datasets.Dataset`; confirm a plain list of strings is accepted, or
    # wrap it as Dataset.from_dict({"text": calib}).
    oneshot(
        model=model,
        tokenizer=tokenizer,
        dataset=calib,
        recipe=recipe,
        output_dir=OUTPUT,
        max_seq_length=MAX_SEQ_LENGTH,
        num_calibration_samples=len(calib),
    )

    print(f"Done! Output: {OUTPUT}")


if __name__ == "__main__":
    main()