From cc7b5c88a32f382aff6e02fa9a667ce81f12c71d Mon Sep 17 00:00:00 2001
From: madcat
Date: Sun, 31 May 2026 11:39:08 +0200
Subject: [PATCH] add training scripts: v3, qwen3-8b, qlora-7b

---
 train_qlora_7b.py | 202 ++++++++++++++++++++++++++++++++++++++++++++++
 train_qwen3_8b.py |  95 ++++++++++++++++++++++
 train_v3.py       |  97 ++++++++++++++++++++++
 3 files changed, 394 insertions(+)
 create mode 100644 train_qlora_7b.py
 create mode 100644 train_qwen3_8b.py
 create mode 100644 train_v3.py

diff --git a/train_qlora_7b.py b/train_qlora_7b.py
new file mode 100644
index 0000000..1ee3bf6
--- /dev/null
+++ b/train_qlora_7b.py
@@ -0,0 +1,202 @@
+"""QLoRA smoke test — Qwen2.5-7B-Instruct on RTX 2000 Ada (16 GB).
+
+Adapted from train_v4.py for junkpile's 16 GB VRAM budget.
+Uses bitsandbytes 4-bit quantization (QLoRA) instead of bf16 LoRA.
+
+Key differences from train_v4.py:
+  - Model: Qwen2.5-7B-Instruct (not Qwen3.5-27B)
+  - QLoRA: load_in_4bit=True (bnb), not bf16
+  - Shorter sequences: 2048 (16 GB ceiling)
+  - Smaller batching: grad_accum 4 (fits memory)
+  - 100 examples only (pipeline test, not quality)
+  - 1 epoch (smoke test speed)
+
+Memory estimate:
+  Model (4-bit bnb) ~5 GB
+  LoRA adapters (r=16) ~100 MB
+  Optimizer (adamw_8bit) ~200 MB
+  Activations (grad ckpt, seq 2048) ~4-6 GB
+  Total ~10-12 GB
+
+Usage (inside madcat-ml container on junkpile):
+  cd /workspace/lora
+  python3 train_qlora_7b.py
+
+Expected runtime: <10 min
+Expected VRAM peak: ~10-12 GB
+"""
+
+from unsloth import FastLanguageModel
+from trl import SFTTrainer, SFTConfig
+import torch
+import json
+import os
+
+# ── Config ───────────────────────────────────────────────────────────
+MODEL = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"
+MAX_SEQ = 2048
+RANK = 16
+ALPHA = 16
+DATA = "./bt7274_v4.jsonl"
+OUT = "./qlora-qwen25-7b-smoke"
+EPOCHS = 1
+BATCH = 1
+GRAD_ACCUM = 4
+LR = 1e-4
+WARMUP_RATIO = 0.1
+MAX_EXAMPLES = 100
+
+# ── Load model (4-bit QLoRA) ────────────────────────────────────────
+print(f"Loading {MODEL}...")
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=MODEL,
+    max_seq_length=MAX_SEQ,
+    load_in_4bit=True,
+    dtype=torch.bfloat16,
+)
+
+print(f"Model loaded: {MODEL}")
+print(f" CUDA: {torch.cuda.get_device_name(0)}")
+print(f" VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
+print(f" Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
+
+# ── LoRA adapter ────────────────────────────────────────────────────
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=RANK,
+    lora_alpha=ALPHA,
+    lora_dropout=0,
+    target_modules=[
+        "q_proj", "k_proj", "v_proj", "o_proj",
+        "gate_proj", "up_proj", "down_proj",
+    ],
+    bias="none",
+    use_gradient_checkpointing="unsloth",
+    random_state=42,
+    max_seq_length=MAX_SEQ,
+)
+
+trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+total = sum(p.numel() for p in model.parameters())
+print(f"LoRA: r={RANK}, alpha={ALPHA}")
+print(f" Trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")
+
+
+# ── Dataset ─────────────────────────────────────────────────────────
+def fix_tool_calls(messages):
+    """Return shallow copies of messages with JSON-string tool_call arguments parsed to dicts."""
+    fixed = []
+    for msg in messages:
+        msg = dict(msg)  # copy so the caller's row is not mutated
+        if msg.get("tool_calls"):
+            new_tcs = []
+            for tc in msg["tool_calls"]:
+                tc = dict(tc)
+                if "function" in tc:
+                    fn = dict(tc["function"])
+                    if isinstance(fn.get("arguments"), str):
+                        try:
+                            fn["arguments"] = json.loads(fn["arguments"])
+                        except (ValueError, TypeError):
+                            fn["arguments"] = {"raw": fn["arguments"]}  # keep unparsable text
+                    tc["function"] = fn
+                new_tcs.append(tc)
+            msg["tool_calls"] = new_tcs
+        fixed.append(msg)
+    return fixed
+
+
+def load_and_format(path, max_examples=None):
+    """Load JSONL, render each row with the chat template, drop rows over MAX_SEQ tokens."""
+    from datasets import Dataset
+    _enc = tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer
+    texts = []
+    skipped = 0
+    with open(path, encoding="utf-8") as f:
+        for i, line in enumerate(f):
+            if max_examples and i >= max_examples:  # cap counts raw lines read, blank ones included
+                break
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            messages = fix_tool_calls(row["messages"])
+            text = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=False,
+            )
+            tok_len = len(_enc.encode(text))
+            if tok_len <= MAX_SEQ:
+                texts.append(text)
+            else:
+                skipped += 1
+    if skipped:
+        print(f" Filtered {skipped} examples exceeding {MAX_SEQ} tokens")
+    return Dataset.from_dict({"text": texts})
+
+
+print(f"\nLoading dataset: {DATA} (first {MAX_EXAMPLES} examples)")
+ds = load_and_format(DATA, max_examples=MAX_EXAMPLES)
+
+steps = (len(ds) * EPOCHS) // (BATCH * GRAD_ACCUM)
+print(f" Loaded: {len(ds)} examples")
+print(f" Epochs: {EPOCHS}, effective batch: {BATCH * GRAD_ACCUM}")
+print(f" Estimated steps: {steps}")
+
+# ── Train ───────────────────────────────────────────────────────────
+print("\nTraining...")
+print("=" * 60)
+
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=ds,
+    args=SFTConfig(
+        output_dir=OUT,
+        per_device_train_batch_size=BATCH,
+        gradient_accumulation_steps=GRAD_ACCUM,
+        num_train_epochs=EPOCHS,
+        learning_rate=LR,
+        bf16=True,
+        logging_steps=5,
+        save_steps=999999,  # no mid-training checkpoints (smoke test)
+        warmup_ratio=WARMUP_RATIO,
+        optim="adamw_8bit",  # 8-bit adam saves ~1 GB vs adamw_torch
+        seed=42,
+        report_to="none",
+        max_seq_length=MAX_SEQ,
+        dataset_num_proc=1,
+        lr_scheduler_type="cosine",
+        weight_decay=0.01,
+    ),
+)
+
+result = trainer.train()
+
+print("=" * 60)
+print(f"Training complete — loss: {result.training_loss:.4f}")
+print(f" Steps: {result.global_step}")
+print(f" Runtime: {result.metrics['train_runtime']:.0f}s")
+print(f" Peak VRAM: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
+
+# ── Save LoRA adapter ──────────────────────────────────────────────
+print(f"\nSaving adapter to {OUT}/")
+model.save_pretrained(OUT)
+tokenizer.save_pretrained(OUT)
+
+adapter_path = os.path.join(OUT, "adapter_model.safetensors")
+if os.path.exists(adapter_path):
+    size_mb = os.path.getsize(adapter_path) / 1e6
+    print(f" Adapter: {size_mb:.1f} MB")
+else:
+    raise SystemExit(" ERROR: adapter_model.safetensors not found")  # exit non-zero, skip PASSED banner
+
+print(f"\n{'=' * 60}")
+print("SMOKE TEST PASSED")
+print(f"{'=' * 60}")
+print(f"\n Model: {MODEL}")
+print(f" Examples: {len(ds)}")
+print(f" LoRA: r={RANK}, alpha={ALPHA}")
+print(f" Peak VRAM: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
+print(f" Adapter: {OUT}/")
diff --git a/train_qwen3_8b.py b/train_qwen3_8b.py
new file mode 100644
index 0000000..a8e4e6f
--- /dev/null
+++ b/train_qwen3_8b.py
@@ -0,0 +1,95 @@
+"""Qwen3-8B coding agent LoRA — trained on opencode tool-use + coding patterns.
+
+Usage (on RunPod A100/H100):
+    python train_qwen3_8b.py
+"""
+
+from unsloth import FastLanguageModel
+from trl import SFTTrainer
+from transformers import TrainingArguments
+from datasets import load_dataset
+
+# --- Config ---
+MODEL = "unsloth/Qwen3-8B-unsloth-bnb-4bit"
+MAX_SEQ = 4096
+RANK = 16
+ALPHA = 16
+DATA = "./bt7274_v3.jsonl"
+OUT = "./qwen3-8b-lora-v1"
+EPOCHS = 3
+BATCH = 4  # A100/H100 can handle bigger batches
+GRAD_ACCUM = 4  # effective batch = 16
+LR = 2e-4
+
+# --- Load model (4-bit quantized) ---
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=MODEL,
+    max_seq_length=MAX_SEQ,
+    load_in_4bit=True,
+    dtype=None,
+)
+
+# --- LoRA adapter ---
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=RANK,
+    lora_alpha=ALPHA,
+    lora_dropout=0,
+    target_modules=[
+        "q_proj", "k_proj", "v_proj", "o_proj",
+        "gate_proj", "up_proj", "down_proj",
+    ],
+    bias="none",
+    use_gradient_checkpointing="unsloth",
+    random_state=42,
+)
+
+# --- Dataset ---
+ds = load_dataset("json", data_files=DATA, split="train")
+
+def to_chatml(ex):  # render one row's "messages" via the chat template into a "text" column
+    text = tokenizer.apply_chat_template(
+        ex["messages"], tokenize=False, add_generation_prompt=False
+    )
+    return {"text": text}
+
+ds = ds.map(to_chatml)
+
+steps = (len(ds) * EPOCHS) // (BATCH * GRAD_ACCUM)
+print(f"Dataset: {len(ds)} examples")
+print(f"Epochs: {EPOCHS}, effective batch: {BATCH * GRAD_ACCUM}")
+print(f"Estimated steps: {steps}")
+print(f"LoRA: r={RANK}, alpha={ALPHA}")
+print(f"Max seq: {MAX_SEQ}")
+
+# --- Train ---
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=ds,
+    args=TrainingArguments(
+        output_dir=OUT,
+        per_device_train_batch_size=BATCH,
+        gradient_accumulation_steps=GRAD_ACCUM,
+        num_train_epochs=EPOCHS,
+        learning_rate=LR,
+        bf16=True,
+        logging_steps=5,
+        save_steps=50,
+        save_total_limit=2,
+        warmup_steps=10,
+        optim="adamw_8bit",
+        seed=42,
+        report_to="none",
+    ),
+    dataset_text_field="text",
+    max_seq_length=MAX_SEQ,
+    packing=False,
+)
+
+trainer.train()
+
+# --- Save LoRA adapter ---
+model.save_pretrained(OUT)
+tokenizer.save_pretrained(OUT)
+print(f"\nSaved LoRA adapter to {OUT}/")
diff --git a/train_v3.py b/train_v3.py
new file mode 100644
index 0000000..5a909fa
--- /dev/null
+++ b/train_v3.py
@@ -0,0 +1,97 @@
+"""BT-7274 LoRA v3 — balanced tool-call + direct-response.
+
+Usage:
+    source ~/lora-train/bin/activate
+    cd ~/Projects/lora
+    python train_v3.py
+"""
+
+from unsloth import FastLanguageModel
+from trl import SFTTrainer
+from transformers import TrainingArguments
+from datasets import load_dataset
+
+# --- Config ---
+MODEL = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"
+MAX_SEQ = 4096
+RANK = 16
+ALPHA = 16
+DATA = "./bt7274_v3.jsonl"
+OUT = "./bt7274-lora-v3"
+EPOCHS = 3
+BATCH = 1
+GRAD_ACCUM = 8
+LR = 2e-4
+
+# --- Load model (4-bit quantized) ---
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=MODEL,
+    max_seq_length=MAX_SEQ,
+    load_in_4bit=True,
+    dtype=None,
+)
+
+# --- LoRA adapter ---
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=RANK,
+    lora_alpha=ALPHA,
+    lora_dropout=0,
+    target_modules=[
+        "q_proj", "k_proj", "v_proj", "o_proj",
+        "gate_proj", "up_proj", "down_proj",
+    ],
+    bias="none",
+    use_gradient_checkpointing="unsloth",
+    random_state=42,
+)
+
+# --- Dataset ---
+ds = load_dataset("json", data_files=DATA, split="train")
+
+def to_chatml(ex):  # render one row's "messages" via the chat template into a "text" column
+    text = tokenizer.apply_chat_template(
+        ex["messages"], tokenize=False, add_generation_prompt=False
+    )
+    return {"text": text}
+
+ds = ds.map(to_chatml)
+
+steps = (len(ds) * EPOCHS) // (BATCH * GRAD_ACCUM)
+print(f"Dataset: {len(ds)} examples")
+print(f"Epochs: {EPOCHS}, effective batch: {BATCH * GRAD_ACCUM}")
+print(f"Estimated steps: {steps}")
+print(f"LoRA: r={RANK}, alpha={ALPHA}")
+print(f"Max seq: {MAX_SEQ}")
+
+# --- Train ---
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=ds,
+    args=TrainingArguments(
+        output_dir=OUT,
+        per_device_train_batch_size=BATCH,
+        gradient_accumulation_steps=GRAD_ACCUM,
+        num_train_epochs=EPOCHS,
+        learning_rate=LR,
+        bf16=True,
+        logging_steps=5,
+        save_steps=50,
+        save_total_limit=2,
+        warmup_steps=10,
+        optim="adamw_8bit",
+        seed=42,
+        report_to="none",
+    ),
+    dataset_text_field="text",
+    max_seq_length=MAX_SEQ,
+    packing=False,
+)
+
+trainer.train()
+
+# --- Save LoRA adapter ---
+model.save_pretrained(OUT)
+tokenizer.save_pretrained(OUT)
+print(f"\nSaved LoRA adapter to {OUT}/")