# lora/justfile

# ── Specialist LoRA Training Pipeline ──
# Base model: Qwen/Qwen3.5-27B
# Run on: RunPod H100 or sin GB10

# Dataset file kept at the top level rather than under data/, so the
# `clean-data` recipe (which only removes data/*.jsonl) leaves it alone.
default_data := "bt7274_v3.jsonl"

# ── Data Extraction ──────────────────────────────────────────────────

# Runs both extraction stages back to back, each writing into data/; just
# stops at the first command that fails, so mining only starts after the
# session extraction succeeded.
# Extract specialist training data from opencode session DB, then mine git repos
extract:
    @printf '%s\n' "── Extracting specialist data from opencode DB ──"
    python extract_specialists.py --outdir data/
    @printf '\n%s\n' "── Mining git repos ──"
    python mine_repos.py --repos repos.json --outdir data/
    @printf '\n%s\n' "Done. Review data/*.jsonl before training."

# Runs only extract_specialists.py, writing its output under data/.
# Extract session data only (no git mining)
extract-sessions:
    python extract_specialists.py --outdir data/

# Runs only mine_repos.py against the repo list in repos.json, writing its
# output under data/.
# Mine git repos only (no session extraction)
extract-git:
    python mine_repos.py --repos repos.json --outdir data/

# Passes `repo` and `lang` straight to mine_repos.py and writes into data/.
# Both values are double-quoted so a repo path containing spaces reaches the
# script as a single argument instead of being word-split by the shell.
# Mine a single repo
mine repo lang:
    python mine_repos.py --repo "{{repo}}" --lang "{{lang}}" --outdir data/

# ── Dataset Stats ────────────────────────────────────────────────────

# Prints one "<file>: <count> examples" line for every existing data/*.jsonl
# file plus the top-level dataset named by `default_data`, counting lines
# with wc -l. Shell variables use a single `$`: just hands `$` to the shell
# untouched, so a Make-style `$$f` would expand to the shell PID followed by
# "f" and `$$(…)` would be a shell syntax error.
# Show stats for all datasets
stats:
    @echo "── Dataset Statistics ──"
    @for f in data/*.jsonl "{{default_data}}"; do \
        if [ -f "$f" ]; then \
            count=$(wc -l < "$f" | tr -d ' '); \
            echo "  $f: $count examples"; \
        fi; \
    done

# Loads `file` as a JSON dataset and prints: example count, message count per
# role, average and maximum characters of message content per example, and
# how many examples contain at least one message with `tool_calls`.
# Each record is read as having a `messages` list whose items carry `role`
# and optionally `content` / `tool_calls`.
# The path is handed to Python through sys.argv rather than pasted into the
# source, so single quotes or backslashes in the file name are not parsed as
# Python code. The divisions and max() are guarded so a zero-length dataset
# does not raise.
# Detailed stats for a specific dataset
check file:
    python -c "\
    import sys; \
    from collections import Counter; \
    from datasets import load_dataset; \
    ds = load_dataset('json', data_files=sys.argv[1], split='train'); \
    print(f'Examples: {len(ds)}'); \
    roles = dict(Counter(m['role'] for ex in ds for m in ex['messages'])); \
    print(f'Roles: {roles}'); \
    lens = [sum(len(m.get('content','') or '') for m in ex['messages']) for ex in ds]; \
    print(f'Avg chars/example: {sum(lens)//max(len(lens),1)}'); \
    print(f'Max chars/example: {max(lens, default=0)}'); \
    tc = sum(1 for ex in ds if any(m.get('tool_calls') for m in ex['messages'])); \
    print(f'Tool-call examples: {tc} ({100*tc//max(len(ds),1)}%)'); \
    " "{{file}}"

# ── Training ─────────────────────────────────────────────────────────

# Runs train_v4.py with no arguments.
# Train bt7274 persona adapter v4 (Hermes format, <think> blocks, 802 examples)
train-bt7274:
    python train_v4.py

# Runs train_qwen35_27b.py with no arguments.
# Train bt7274 v3 (legacy)
train-bt7274-v3:
    python train_qwen35_27b.py

# Runs train_specialist.py for the adapter called `name`. The value is
# double-quoted so it always reaches the script as one argument.
# Train a specialist adapter
train name:
    python train_specialist.py --name "{{name}}"

# Trains the six specialist adapters one after another in the fixed order
# printed by the banner; just aborts the recipe at the first failing run.
# Train all specialists in sequence
train-all:
    @printf '%s\n' "── Training all specialist adapters ──" "Order: oxidizer → prism → serpent → forge → swiftblade → trace" ""
    python train_specialist.py --name oxidizer
    python train_specialist.py --name prism
    python train_specialist.py --name serpent
    python train_specialist.py --name forge
    python train_specialist.py --name swiftblade
    python train_specialist.py --name trace

# Same as `train`, but also passes an explicit dataset path via --data.
# Both values are double-quoted so a data path containing spaces is not
# word-split by the shell.
# Train with custom data path
train-custom name data:
    python train_specialist.py --name "{{name}}" --data "{{data}}"

# ── Serving ──────────────────────────────────────────────────────────

# Walks every directory under adapters/ and marks whether it contains
# adapter_model.safetensors, showing that file's size (du -sh) when present.
# Shell variables use a single `$` because just, unlike Make, does not treat
# `$$` as an escape (the shell would read it as its PID). The -d guard skips
# the unexpanded glob pattern when adapters/ is missing or empty.
# List trained adapters
adapters:
    @echo "── Trained Adapters ──"
    @for d in adapters/*/; do \
        [ -d "$d" ] || continue; \
        name=$(basename "$d"); \
        if [ -f "$d/adapter_model.safetensors" ]; then \
            size=$(du -sh "$d/adapter_model.safetensors" | cut -f1); \
            echo "  ✓ $name ($size)"; \
        else \
            echo "  ✗ $name (no adapter_model.safetensors)"; \
        fi; \
    done

# Creates ~/models/loras/<name> on the remote host over ssh, then rsyncs the
# contents of adapters/<name>/ into it. Aborts with an error message before
# touching the network when the local adapter directory does not exist.
# Transfer adapter to sin
transfer name:
    @echo "── Transferring {{name}} to sin ──"
    @if ! test -d "adapters/{{name}}"; then echo "ERROR: adapters/{{name}} not found"; exit 1; fi
    ssh madcat@192.168.88.108 "mkdir -p ~/models/loras/{{name}}"
    rsync -avP "adapters/{{name}}/" "madcat@192.168.88.108:~/models/loras/{{name}}/"
    @echo "✓ Transferred to sin:~/models/loras/{{name}}/"

# For every adapters/<name>/ directory that contains adapter_model.safetensors,
# creates ~/models/loras/<name> on the remote host and rsyncs the directory
# contents into it; directories without that file are skipped.
# Shell variables use a single `$` because just passes `$` to the shell
# untouched (Make-style `$$` would be read as the shell PID). Each ssh/rsync
# is followed by `|| exit 1` so a failed transfer stops the loop, matching
# the fail-fast behaviour of the single-adapter `transfer` recipe.
# Transfer all adapters to sin
transfer-all:
    @for d in adapters/*/; do \
        [ -f "$d/adapter_model.safetensors" ] || continue; \
        name=$(basename "$d"); \
        echo "── Transferring $name ──"; \
        ssh madcat@192.168.88.108 "mkdir -p ~/models/loras/$name" || exit 1; \
        rsync -avP "$d" "madcat@192.168.88.108:~/models/loras/$name/" || exit 1; \
    done

# ── Utilities ────────────────────────────────────────────────────────

# Removes everything matching data/*.jsonl; files outside data/ are not touched.
# Clean generated data (keeps hand-crafted datasets)
clean-data:
    rm -rf data/*.jsonl
    @echo "Cleaned data/*.jsonl"

# Deletes the entire adapters/ directory, including every adapter inside it,
# without asking for confirmation.
# Clean trained adapters
clean-adapters:
    rm -rf adapters/
    @echo "Cleaned adapters/"

# Has no body of its own: it only runs clean-data and clean-adapters as
# dependencies, in that order.
# Full clean
clean: clean-data clean-adapters