#!/usr/bin/env python3
|
|
"""Mine git repos for code training pairs.

Extracts commit-level diffs and converts them to training examples:
    user: "implement/fix/refactor X" (from commit message)
    assistant: tool_calls to edit files (from diff)

Usage:
    python mine_repos.py --repo /path/to/repo --lang rust --out data/oxidizer_git.jsonl
    python mine_repos.py --repos repos.json --outdir data/
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import subprocess
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
# File extension -> language name. The language names are the values accepted
# by --lang and the keys expected in the --repos config file.
EXT_TO_LANG = {
    ".rs": "rust",
    ".ts": "typescript",
    ".tsx": "typescript",
    ".mts": "typescript",
    ".py": "python",
    ".pyi": "python",
    ".rb": "ruby",
    ".erb": "ruby",
    ".swift": "swift",
}

# Upper bound, in characters, on the full diff text of a single commit.
MAX_DIFF_SIZE = 10_000

# Regexes searched against a file path; a match anywhere excludes the file
# (lock files, minified/generated output, caches, build and vendored trees).
SKIP_PATTERNS = [
    r"\.lock$",
    r"\.min\.",
    r"node_modules/",
    r"target/",
    r"\.generated\.",
    r"__pycache__/",
    r"\.pyc$",
    r"Pods/",
    r"\.build/",
    r"vendor/",
]
|
|
|
|
|
|
def run_git(repo: Path, *args: str) -> str:
    """Run ``git <args>`` with *repo* as working directory and return stdout.

    This is deliberately best-effort: the exit status is not checked, so a
    failing command simply yields whatever it printed to stdout (normally an
    empty string), and a command that exceeds the 30 second timeout is
    reported and treated as having produced no output.

    Args:
        repo: Directory in which git is executed.
        *args: Arguments passed to git verbatim (no shell is involved).

    Returns:
        The command's stdout decoded as UTF-8; undecodable bytes are replaced
        rather than raising.
    """
    try:
        result = subprocess.run(
            ["git", *args],
            cwd=repo,
            capture_output=True,
            text=True,
            # Do not depend on the locale codec: diffs may contain arbitrary
            # bytes and must never abort the run with a UnicodeDecodeError.
            encoding="utf-8",
            errors="replace",
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        # One slow command should not kill the whole mining run.
        print(f" WARN: git {' '.join(args[:1])} timed out in {repo}")
        return ""
    return result.stdout
|
|
|
|
|
|
def get_commits(repo: Path, lang: str, max_commits: int = 500) -> list[dict]:
    """List commits that modify files of the target language.

    Args:
        repo: Path of the git repository to inspect.
        lang: Language name; must be one of the values of ``EXT_TO_LANG``.
        max_commits: Value passed to ``git log --max-count``.

    Returns:
        One dict per commit with the keys ``"sha"``, ``"subject"`` and
        ``"body"`` (body may be empty). Commits without a subject are
        dropped. An unknown language yields an empty list.
    """
    extensions = [ext for ext, ext_lang in EXT_TO_LANG.items() if ext_lang == lang]
    if not extensions:
        return []

    # Fields are separated by the ASCII unit separator (0x1f) and records by
    # the record separator (0x1e). Unlike a textual sentinel or a newline
    # split, these cannot realistically collide with commit message content.
    field_sep, record_sep = "\x1f", "\x1e"
    log = run_git(
        repo, "log",
        f"--max-count={max_commits}",
        "--no-merges",
        "--diff-filter=M",  # Modified files only
        "--format=%H%x1f%s%x1f%b%x1e",
        "--", *[f"*{ext}" for ext in extensions],
    )

    commits = []
    for record in log.split(record_sep):
        # Only trim the newline git emits between records here; a bare
        # strip() would also eat the field separators.
        record = record.strip("\r\n")
        if not record:
            continue
        sha, _, rest = record.partition(field_sep)
        subject, _, body = rest.partition(field_sep)
        sha, subject, body = sha.strip(), subject.strip(), body.strip()

        if not sha or not subject:
            continue

        commits.append({
            "sha": sha,
            "subject": subject,
            "body": body,
        })

    return commits
|
|
|
|
|
|
def _path_from_diff_header(header: str) -> str | None:
    """Return the post-image path named by a ``diff --git a/X b/Y`` line."""
    spec = header[len("diff --git "):]
    # Unrenamed file: "a/<path> b/<path>" is symmetric around its middle
    # space, which stays unambiguous even when <path> itself contains "b/".
    mid = len(spec) // 2
    if len(spec) % 2 == 1 and spec[mid] == " ":
        left, right = spec[:mid], spec[mid + 1:]
        if left.startswith("a/") and right.startswith("b/") and left[2:] == right[2:]:
            return right[2:]
    # Rename/copy: the two paths differ, so take what follows the last " b/".
    match = re.match(r"a/.+ b/(.+)$", spec)
    return match.group(1) if match else None


def get_diff(repo: Path, sha: str, lang: str) -> list[dict]:
    """Get per-file diffs for a commit, filtered by language.

    Runs ``git diff <sha>~1..<sha>`` and splits the output at each
    ``diff --git`` header.

    Args:
        repo: Path of the git repository.
        sha: Commit to diff against its first parent.
        lang: Language name; only files whose extension maps to it in
            ``EXT_TO_LANG`` are kept.

    Returns:
        A list of ``{"file": path, "diff": text}`` dicts, where ``text`` is
        everything after the file's ``diff --git`` line. The list is empty
        when the language is unknown, git produced no output, or the whole
        diff is longer than ``MAX_DIFF_SIZE`` characters. Files matching any
        of ``SKIP_PATTERNS`` are omitted.
    """
    extensions = {ext for ext, ext_lang in EXT_TO_LANG.items() if ext_lang == lang}
    if not extensions:
        # Nothing can match, so skip the git call entirely.
        return []

    diff = run_git(repo, "diff", f"{sha}~1..{sha}", "--unified=3")
    if not diff or len(diff) > MAX_DIFF_SIZE:
        return []

    # Parse into per-file hunks
    files = []
    current_file = None
    current_hunks: list[str] = []

    for line in diff.split("\n"):
        if line.startswith("diff --git"):
            # A new file section starts: store the previous one if it was kept.
            if current_file and current_hunks:
                files.append({"file": current_file, "diff": "\n".join(current_hunks)})
            current_hunks = []

            fname = _path_from_diff_header(line)
            # Skip non-target and generated files
            wanted = (
                fname is not None
                and Path(fname).suffix in extensions
                and not any(re.search(p, fname) for p in SKIP_PATTERNS)
            )
            current_file = fname if wanted else None
        elif current_file is not None:
            current_hunks.append(line)

    if current_file and current_hunks:
        files.append({"file": current_file, "diff": "\n".join(current_hunks)})

    return files
|
|
|
|
|
|
def commit_to_example(
|
|
commit: dict,
|
|
file_diffs: list[dict],
|
|
system_prompt: str,
|
|
) -> dict | None:
|
|
"""Convert a commit + diffs to a training example."""
|
|
if not file_diffs:
|
|
return None
|
|
|
|
# Build user message from commit message
|
|
user_msg = commit["subject"]
|
|
if commit["body"]:
|
|
user_msg += "\n\n" + commit["body"]
|
|
|
|
# Build assistant tool calls: read each file, then edit
|
|
messages: list[dict[str, Any]] = [
|
|
{"role": "system", "content": system_prompt},
|
|
{"role": "user", "content": user_msg},
|
|
]
|
|
|
|
for fd in file_diffs:
|
|
# Parse diff into old/new hunks for edit tool calls
|
|
old_lines = []
|
|
new_lines = []
|
|
for line in fd["diff"].split("\n"):
|
|
if line.startswith("-") and not line.startswith("---"):
|
|
old_lines.append(line[1:])
|
|
elif line.startswith("+") and not line.startswith("+++"):
|
|
new_lines.append(line[1:])
|
|
|
|
if not old_lines and not new_lines:
|
|
continue
|
|
|
|
old_text = "\n".join(old_lines)
|
|
new_text = "\n".join(new_lines)
|
|
|
|
if old_text and new_text:
|
|
# Edit operation
|
|
messages.append({
|
|
"role": "assistant",
|
|
"content": None,
|
|
"tool_calls": [{
|
|
"type": "function",
|
|
"function": {
|
|
"name": "edit",
|
|
"arguments": {
|
|
"filePath": fd["file"],
|
|
"oldString": old_text,
|
|
"newString": new_text,
|
|
},
|
|
},
|
|
}],
|
|
})
|
|
messages.append({
|
|
"role": "tool",
|
|
"content": "Edit applied successfully.",
|
|
})
|
|
elif new_text and not old_text:
|
|
# New content added
|
|
messages.append({
|
|
"role": "assistant",
|
|
"content": None,
|
|
"tool_calls": [{
|
|
"type": "function",
|
|
"function": {
|
|
"name": "edit",
|
|
"arguments": {
|
|
"filePath": fd["file"],
|
|
"oldString": "",
|
|
"newString": new_text,
|
|
},
|
|
},
|
|
}],
|
|
})
|
|
messages.append({
|
|
"role": "tool",
|
|
"content": "Edit applied successfully.",
|
|
})
|
|
|
|
# Add summary response
|
|
files_touched = [fd["file"] for fd in file_diffs]
|
|
messages.append({
|
|
"role": "assistant",
|
|
"content": f"Applied changes to {', '.join(files_touched)}.",
|
|
})
|
|
|
|
if len(messages) < 4: # system + user + at least one tool call + summary
|
|
return None
|
|
|
|
return {"messages": messages, "metadata": {"sha": commit["sha"]}}
|
|
|
|
|
|
def mine_repo(
    repo: Path,
    lang: str,
    system_prompt: str,
    max_commits: int = 500,
) -> list[dict]:
    """Build training examples from one repository.

    Collects the relevant commits, fetches each commit's per-file diffs and
    converts them; commits that produce no example are dropped. Progress is
    printed to stdout.

    Args:
        repo: Path of the git repository.
        lang: Target language name.
        system_prompt: System message placed at the start of every example.
        max_commits: Limit forwarded to ``get_commits``.

    Returns:
        The generated examples, in commit-log order.
    """
    print(f" Mining {repo} for {lang}...")

    candidates = get_commits(repo, lang, max_commits)
    print(f" Found {len(candidates)} relevant commits")

    # Lazily convert each commit, then keep only the ones that yielded data.
    built = (
        commit_to_example(entry, get_diff(repo, entry["sha"], lang), system_prompt)
        for entry in candidates
    )
    examples = [example for example in built if example]

    print(f" Generated {len(examples)} training examples")
    return examples
|
|
|
|
|
|
def _write_jsonl(out: Path, examples: list[dict]) -> None:
    """Write *examples* to *out* as UTF-8 JSON Lines, creating parent dirs."""
    out.parent.mkdir(parents=True, exist_ok=True)
    # The records are dumped with ensure_ascii=False, so the file encoding
    # must be pinned instead of inherited from the locale.
    with open(out, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in examples)
    print(f"Wrote {len(examples)} examples to {out}")


def main() -> None:
    """Command-line entry point.

    Two modes are supported:

    * ``--repo`` + ``--lang``: mine a single repository and write the result
      to ``--out`` (default ``<outdir>/<lang>_git.jsonl``).
    * ``--repos``: read a JSON object mapping language -> list of repo paths
      and write one ``<name>_git.jsonl`` file per language into ``--outdir``.

    System prompts are read from ``--agents-dir``; a generic one-line prompt
    is used for any language whose prompt file is missing. Exits through
    ``parser.error`` when neither mode is selected.
    """
    parser = argparse.ArgumentParser(description="Mine git repos for training data")
    parser.add_argument("--repo", type=Path, help="Single repo path")
    parser.add_argument("--lang", help="Language: rust, typescript, python, ruby, swift")
    parser.add_argument("--out", type=Path, help="Output JSONL file")
    parser.add_argument(
        "--repos",
        type=Path,
        help="JSON file mapping lang → list of repo paths",
    )
    parser.add_argument("--outdir", type=Path, default=Path("data"), help="Output dir")
    parser.add_argument(
        "--agents-dir",
        type=Path,
        default=Path.home() / ".config/opencode/agents",
        help="Agent system prompt directory",
    )
    parser.add_argument("--max-commits", type=int, default=500)
    args = parser.parse_args()

    # Load system prompts
    prompt_files = {
        "rust": "build-rust.md",
        "typescript": "build-ts.md",
        "python": "build-python.md",
        "ruby": "build-ruby.md",
        "swift": "build-swift.md",
    }
    prompts = {}
    for lang, fname in prompt_files.items():
        path = args.agents_dir / fname
        if path.exists():
            prompts[lang] = path.read_text(encoding="utf-8").strip()
        else:
            prompts[lang] = f"You are a {lang} coding agent."

    if args.repo and args.lang:
        # Single repo mode
        prompt = prompts.get(args.lang, f"You are a {args.lang} coding agent.")
        examples = mine_repo(args.repo, args.lang, prompt, args.max_commits)
        out = args.out or args.outdir / f"{args.lang}_git.jsonl"
        _write_jsonl(out, examples)

    elif args.repos:
        # Multi-repo mode from config file
        with open(args.repos, encoding="utf-8") as f:
            repo_config = json.load(f)

        # Output file stem per language; unknown languages use their own name.
        lang_to_name = {
            "rust": "oxidizer",
            "typescript": "prism",
            "python": "serpent",
            "ruby": "forge",
            "swift": "swiftblade",
        }

        for lang, repos in repo_config.items():
            all_examples = []
            prompt = prompts.get(lang, f"You are a {lang} coding agent.")
            for repo_path in repos:
                repo = Path(repo_path).expanduser()
                if not repo.exists():
                    print(f" SKIP: {repo} does not exist")
                    continue
                all_examples.extend(mine_repo(repo, lang, prompt, args.max_commits))

            name = lang_to_name.get(lang, lang)
            _write_jsonl(args.outdir / f"{name}_git.jsonl", all_examples)

    else:
        parser.error("Provide --repo + --lang, or --repos config file")
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|