Files
lora/mine_repos.py

336 lines
10 KiB
Python

#!/usr/bin/env python3
"""Mine git repos for code training pairs.
Extracts commit-level diffs and converts them to training examples:
user: "implement/fix/refactor X" (from commit message)
assistant: "edit" tool_calls reconstructed from the diff
Usage:
python mine_repos.py --repo /path/to/repo --lang rust --out data/oxidizer_git.jsonl
python mine_repos.py --repos repos.json --outdir data/
"""
import argparse
import json
import re
import subprocess
from pathlib import Path
from typing import Any
# File extension -> language name. Insertion order matters: it determines the
# order of the pathspecs handed to git when filtering by language.
EXT_TO_LANG = {
    ".rs": "rust",
    ".ts": "typescript",
    ".tsx": "typescript",
    ".mts": "typescript",
    ".py": "python",
    ".pyi": "python",
    ".rb": "ruby",
    ".erb": "ruby",
    ".swift": "swift",
}

# Upper bound, in characters, on the diff text accepted for a single commit.
MAX_DIFF_SIZE = 10_000

# Regular expressions (used with re.search) for paths that must never be
# turned into training data: lock files, minified/generated output, caches
# and vendored or build directories.
SKIP_PATTERNS = [
    r"\.lock$",
    r"\.min\.",
    r"node_modules/",
    r"target/",
    r"\.generated\.",
    r"__pycache__/",
    r"\.pyc$",
    r"Pods/",
    r"\.build/",
    r"vendor/",
]
def run_git(repo: Path, *args: str, timeout: float = 30.0) -> str:
    """Run a git command inside *repo* and return its stdout.

    Args:
        repo: Directory used as the working directory for the git process.
        *args: Arguments passed to ``git`` verbatim (argv list, no shell).
        timeout: Seconds to wait before abandoning the command.

    Returns:
        The command's stdout as text. Output is decoded as UTF-8 and bytes
        that are not valid UTF-8 are replaced instead of raising, so a single
        oddly-encoded file in a diff cannot abort the run. An empty string is
        returned when the command times out.

    The exit status is intentionally not checked: the callers in this file
    treat empty output as "nothing to process".
    """
    try:
        result = subprocess.run(
            ["git", *args],
            cwd=repo,
            capture_output=True,
            # Decode explicitly rather than relying on the locale encoding.
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        # One slow command should not discard everything mined so far.
        print(f" WARN: git {' '.join(args)} timed out after {timeout}s in {repo}")
        return ""
    return result.stdout
def get_commits(repo: Path, lang: str, max_commits: int = 500) -> list[dict]:
    """Return commits that modify files of the target language.

    Args:
        repo: Path of the git repository to inspect.
        lang: Language name; must appear as a value in ``EXT_TO_LANG``.
        max_commits: Upper bound passed to ``git log --max-count``.

    Returns:
        A list of ``{"sha", "subject", "body"}`` dicts in the order
        ``git log`` prints them. Empty when *lang* has no known extension
        or git produced no output.
    """
    patterns = [
        f"*{ext}" for ext, ext_lang in EXT_TO_LANG.items() if ext_lang == lang
    ]
    if not patterns:
        return []

    log = run_git(
        repo, "log",
        f"--max-count={max_commits}",
        "--no-merges",
        "--diff-filter=M",  # Modified files only
        # Terminate every field with a NUL byte. Unlike a textual sentinel,
        # NUL cannot collide with text in a commit subject or body.
        "--format=%H%x00%s%x00%b%x00",
        "--", *patterns,
    )

    # Layout is "sha\0subject\0body\0\n" per commit, so splitting on NUL
    # gives groups of three fields plus one trailing leftover element.
    fields = log.split("\x00")
    commits = []
    for start in range(0, len(fields) - 2, 3):
        sha, subject, body = (part.strip() for part in fields[start:start + 3])
        if not sha or not subject:
            continue
        commits.append({
            "sha": sha,
            "subject": subject,
            "body": body,
        })
    return commits
def get_diff(repo: Path, sha: str, lang: str) -> list[dict]:
    """Return per-file diffs for a commit, filtered by language.

    Args:
        repo: Path of the git repository.
        sha: Commit to diff against its first parent (``sha~1..sha``).
        lang: Language name; only files whose extension maps to it in
            ``EXT_TO_LANG`` are kept.

    Returns:
        A list of ``{"file", "diff"}`` dicts, one per kept file, where
        ``diff`` is every line following that file's ``diff --git`` header.
        Empty when git printed nothing or the whole diff exceeds
        ``MAX_DIFF_SIZE`` characters.
    """
    extensions = {ext for ext, ext_lang in EXT_TO_LANG.items() if ext_lang == lang}
    diff = run_git(repo, "diff", f"{sha}~1..{sha}", "--unified=3")
    if not diff or len(diff) > MAX_DIFF_SIZE:
        return []

    # Anchor on the full "diff --git a/<old> b/<new>" header. Searching for a
    # bare "b/" matches inside directory names such as "lib/" or "db/" and
    # yields garbled paths like "foo.rb b/lib/foo.rb". Headers with quoted
    # paths do not match and are skipped.
    header_re = re.compile(r"^diff --git a/.*? b/(.+)$")

    files: list[dict] = []
    current_file: str | None = None
    current_lines: list[str] = []
    for line in diff.split("\n"):
        if line.startswith("diff --git"):
            # A new file section starts: store the previous one, if kept.
            if current_file and current_lines:
                files.append({"file": current_file, "diff": "\n".join(current_lines)})
            current_file = None
            current_lines = []
            match = header_re.match(line)
            if match:
                fname = match.group(1)
                # Keep only target-language files outside generated/vendored paths.
                if Path(fname).suffix in extensions and not any(
                    re.search(pattern, fname) for pattern in SKIP_PATTERNS
                ):
                    current_file = fname
        elif current_file is not None:
            current_lines.append(line)

    if current_file and current_lines:
        files.append({"file": current_file, "diff": "\n".join(current_lines)})
    return files
def commit_to_example(
commit: dict,
file_diffs: list[dict],
system_prompt: str,
) -> dict | None:
"""Convert a commit + diffs to a training example."""
if not file_diffs:
return None
# Build user message from commit message
user_msg = commit["subject"]
if commit["body"]:
user_msg += "\n\n" + commit["body"]
# Build assistant tool calls: read each file, then edit
messages: list[dict[str, Any]] = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_msg},
]
for fd in file_diffs:
# Parse diff into old/new hunks for edit tool calls
old_lines = []
new_lines = []
for line in fd["diff"].split("\n"):
if line.startswith("-") and not line.startswith("---"):
old_lines.append(line[1:])
elif line.startswith("+") and not line.startswith("+++"):
new_lines.append(line[1:])
if not old_lines and not new_lines:
continue
old_text = "\n".join(old_lines)
new_text = "\n".join(new_lines)
if old_text and new_text:
# Edit operation
messages.append({
"role": "assistant",
"content": None,
"tool_calls": [{
"type": "function",
"function": {
"name": "edit",
"arguments": {
"filePath": fd["file"],
"oldString": old_text,
"newString": new_text,
},
},
}],
})
messages.append({
"role": "tool",
"content": "Edit applied successfully.",
})
elif new_text and not old_text:
# New content added
messages.append({
"role": "assistant",
"content": None,
"tool_calls": [{
"type": "function",
"function": {
"name": "edit",
"arguments": {
"filePath": fd["file"],
"oldString": "",
"newString": new_text,
},
},
}],
})
messages.append({
"role": "tool",
"content": "Edit applied successfully.",
})
# Add summary response
files_touched = [fd["file"] for fd in file_diffs]
messages.append({
"role": "assistant",
"content": f"Applied changes to {', '.join(files_touched)}.",
})
if len(messages) < 4: # system + user + at least one tool call + summary
return None
return {"messages": messages, "metadata": {"sha": commit["sha"]}}
def mine_repo(
    repo: Path,
    lang: str,
    system_prompt: str,
    max_commits: int = 500,
) -> list[dict]:
    """Build training examples from one repository.

    Looks up the commits touching *lang* files, converts each one together
    with its diff into an example, and keeps the conversions that succeed.
    Progress is reported on stdout.
    """
    print(f" Mining {repo} for {lang}...")
    commits = get_commits(repo, lang, max_commits)
    print(f" Found {len(commits)} relevant commits")

    # Lazily convert commit by commit; commits that cannot be converted
    # come back falsy and are filtered out below.
    candidates = (
        commit_to_example(entry, get_diff(repo, entry["sha"], lang), system_prompt)
        for entry in commits
    )
    examples = [candidate for candidate in candidates if candidate]

    print(f" Generated {len(examples)} training examples")
    return examples
def _write_jsonl(out: Path, examples: list[dict]) -> None:
    """Write *examples* to *out* as UTF-8 JSON Lines and report the count."""
    out.parent.mkdir(parents=True, exist_ok=True)
    # Records are dumped with ensure_ascii=False, so the file encoding must be
    # pinned to UTF-8 instead of depending on the platform default.
    with open(out, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in examples)
    print(f"Wrote {len(examples)} examples to {out}")


def main() -> None:
    """Command-line entry point.

    Two modes are supported:

    * ``--repo`` + ``--lang``: mine a single repository and write the result
      to ``--out`` (default ``<outdir>/<lang>_git.jsonl``).
    * ``--repos``: read a JSON object mapping language -> list of repo paths
      and write one ``<name>_git.jsonl`` per language into ``--outdir``.

    Exits through ``parser.error`` when neither mode is selected or when the
    single ``--repo`` path does not exist.
    """
    parser = argparse.ArgumentParser(description="Mine git repos for training data")
    parser.add_argument("--repo", type=Path, help="Single repo path")
    parser.add_argument("--lang", help="Language: rust, typescript, python, ruby, swift")
    parser.add_argument("--out", type=Path, help="Output JSONL file")
    parser.add_argument(
        "--repos",
        type=Path,
        help="JSON file mapping lang → list of repo paths",
    )
    parser.add_argument("--outdir", type=Path, default=Path("data"), help="Output dir")
    parser.add_argument(
        "--agents-dir",
        type=Path,
        default=Path.home() / ".config/opencode/agents",
        help="Agent system prompt directory",
    )
    parser.add_argument("--max-commits", type=int, default=500)
    args = parser.parse_args()

    # Load system prompts, falling back to a generic one per language when
    # the agent file is missing.
    prompt_files = {
        "rust": "build-rust.md",
        "typescript": "build-ts.md",
        "python": "build-python.md",
        "ruby": "build-ruby.md",
        "swift": "build-swift.md",
    }
    prompts = {}
    for lang, fname in prompt_files.items():
        path = args.agents_dir / fname
        if path.exists():
            prompts[lang] = path.read_text(encoding="utf-8").strip()
        else:
            prompts[lang] = f"You are a {lang} coding agent."

    if args.repo and args.lang:
        # Single repo mode
        if not args.repo.exists():
            # Fail with a clear message instead of a traceback from subprocess.
            parser.error(f"--repo {args.repo} does not exist")
        prompt = prompts.get(args.lang, f"You are a {args.lang} coding agent.")
        examples = mine_repo(args.repo, args.lang, prompt, args.max_commits)
        out = args.out or args.outdir / f"{args.lang}_git.jsonl"
        _write_jsonl(out, examples)
    elif args.repos:
        # Multi-repo mode from config file
        with open(args.repos, encoding="utf-8") as f:
            repo_config = json.load(f)
        # Output file stem used for each language.
        lang_to_name = {
            "rust": "oxidizer",
            "typescript": "prism",
            "python": "serpent",
            "ruby": "forge",
            "swift": "swiftblade",
        }
        for lang, repos in repo_config.items():
            all_examples = []
            prompt = prompts.get(lang, f"You are a {lang} coding agent.")
            for repo_path in repos:
                repo = Path(repo_path).expanduser()
                if not repo.exists():
                    print(f" SKIP: {repo} does not exist")
                    continue
                all_examples.extend(mine_repo(repo, lang, prompt, args.max_commits))
            name = lang_to_name.get(lang, lang)
            _write_jsonl(args.outdir / f"{name}_git.jsonl", all_examples)
    else:
        parser.error("Provide --repo + --lang, or --repos config file")


if __name__ == "__main__":
    main()