#!/usr/bin/env python3 """Mine git repos for code training pairs. Extracts commit-level diffs and converts them to training examples: user: "implement/fix/refactor X" (from commit message) assistant: tool_calls to read/edit files (from diff) Usage: python mine_repos.py --repo /path/to/repo --lang rust --out data/oxidizer_git.jsonl python mine_repos.py --repos repos.json --outdir data/ """ import argparse import json import re import subprocess from pathlib import Path from typing import Any # Extension to language mapping EXT_TO_LANG = { ".rs": "rust", ".ts": "typescript", ".tsx": "typescript", ".mts": "typescript", ".py": "python", ".pyi": "python", ".rb": "ruby", ".erb": "ruby", ".swift": "swift", } # Max diff size per commit (chars) MAX_DIFF_SIZE = 10_000 # Skip files matching these patterns SKIP_PATTERNS = [ r"\.lock$", r"\.min\.", r"node_modules/", r"target/", r"\.generated\.", r"__pycache__/", r"\.pyc$", r"Pods/", r"\.build/", r"vendor/", ] def run_git(repo: Path, *args: str) -> str: """Run a git command and return stdout.""" result = subprocess.run( ["git", *args], cwd=repo, capture_output=True, text=True, timeout=30, ) return result.stdout def get_commits(repo: Path, lang: str, max_commits: int = 500) -> list[dict]: """Get commits that touch files of the target language.""" extensions = [ext for ext, l in EXT_TO_LANG.items() if l == lang] if not extensions: return [] # Get commit log with stats log = run_git( repo, "log", f"--max-count={max_commits}", "--no-merges", "--diff-filter=M", # Modified files only "--format=%H%n%s%n%b%n---END---", "--", *[f"*{ext}" for ext in extensions], ) commits = [] for block in log.split("---END---"): block = block.strip() if not block: continue lines = block.split("\n", 2) if len(lines) < 2: continue sha = lines[0].strip() subject = lines[1].strip() body = lines[2].strip() if len(lines) > 2 else "" if not sha or not subject: continue commits.append({ "sha": sha, "subject": subject, "body": body, }) return commits def get_diff(repo: Path, sha: str, lang: str) -> list[dict]: """Get per-file diffs for a commit, filtered by language.""" extensions = {ext for ext, l in EXT_TO_LANG.items() if l == lang} diff = run_git(repo, "diff", f"{sha}~1..{sha}", "--unified=3") if not diff or len(diff) > MAX_DIFF_SIZE: return [] # Parse into per-file hunks files = [] current_file = None current_hunks: list[str] = [] for line in diff.split("\n"): if line.startswith("diff --git"): if current_file and current_hunks: files.append({"file": current_file, "diff": "\n".join(current_hunks)}) # Extract filename match = re.search(r"b/(.+)$", line) if match: fname = match.group(1) ext = Path(fname).suffix # Skip non-target and generated files if ext not in extensions: current_file = None current_hunks = [] continue if any(re.search(p, fname) for p in SKIP_PATTERNS): current_file = None current_hunks = [] continue current_file = fname current_hunks = [] else: current_file = None current_hunks = [] elif current_file is not None: current_hunks.append(line) if current_file and current_hunks: files.append({"file": current_file, "diff": "\n".join(current_hunks)}) return files def commit_to_example( commit: dict, file_diffs: list[dict], system_prompt: str, ) -> dict | None: """Convert a commit + diffs to a training example.""" if not file_diffs: return None # Build user message from commit message user_msg = commit["subject"] if commit["body"]: user_msg += "\n\n" + commit["body"] # Build assistant tool calls: read each file, then edit messages: list[dict[str, Any]] = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_msg}, ] for fd in file_diffs: # Parse diff into old/new hunks for edit tool calls old_lines = [] new_lines = [] for line in fd["diff"].split("\n"): if line.startswith("-") and not line.startswith("---"): old_lines.append(line[1:]) elif line.startswith("+") and not line.startswith("+++"): new_lines.append(line[1:]) if not old_lines and not new_lines: continue old_text = "\n".join(old_lines) new_text = "\n".join(new_lines) if old_text and new_text: # Edit operation messages.append({ "role": "assistant", "content": None, "tool_calls": [{ "type": "function", "function": { "name": "edit", "arguments": { "filePath": fd["file"], "oldString": old_text, "newString": new_text, }, }, }], }) messages.append({ "role": "tool", "content": "Edit applied successfully.", }) elif new_text and not old_text: # New content added messages.append({ "role": "assistant", "content": None, "tool_calls": [{ "type": "function", "function": { "name": "edit", "arguments": { "filePath": fd["file"], "oldString": "", "newString": new_text, }, }, }], }) messages.append({ "role": "tool", "content": "Edit applied successfully.", }) # Add summary response files_touched = [fd["file"] for fd in file_diffs] messages.append({ "role": "assistant", "content": f"Applied changes to {', '.join(files_touched)}.", }) if len(messages) < 4: # system + user + at least one tool call + summary return None return {"messages": messages, "metadata": {"sha": commit["sha"]}} def mine_repo( repo: Path, lang: str, system_prompt: str, max_commits: int = 500, ) -> list[dict]: """Mine a single repo for training examples.""" print(f" Mining {repo} for {lang}...") commits = get_commits(repo, lang, max_commits) print(f" Found {len(commits)} relevant commits") examples = [] for commit in commits: diffs = get_diff(repo, commit["sha"], lang) example = commit_to_example(commit, diffs, system_prompt) if example: examples.append(example) print(f" Generated {len(examples)} training examples") return examples def main() -> None: parser = argparse.ArgumentParser(description="Mine git repos for training data") parser.add_argument("--repo", type=Path, help="Single repo path") parser.add_argument("--lang", help="Language: rust, typescript, python, ruby, swift") parser.add_argument("--out", type=Path, help="Output JSONL file") parser.add_argument( "--repos", type=Path, help="JSON file mapping lang → list of repo paths", ) parser.add_argument("--outdir", type=Path, default=Path("data"), help="Output dir") parser.add_argument( "--agents-dir", type=Path, default=Path.home() / ".config/opencode/agents", help="Agent system prompt directory", ) parser.add_argument("--max-commits", type=int, default=500) args = parser.parse_args() # Load system prompts prompt_files = { "rust": "build-rust.md", "typescript": "build-ts.md", "python": "build-python.md", "ruby": "build-ruby.md", "swift": "build-swift.md", } prompts = {} for lang, fname in prompt_files.items(): path = args.agents_dir / fname if path.exists(): prompts[lang] = path.read_text().strip() else: prompts[lang] = f"You are a {lang} coding agent." if args.repo and args.lang: # Single repo mode prompt = prompts.get(args.lang, f"You are a {args.lang} coding agent.") examples = mine_repo(args.repo, args.lang, prompt, args.max_commits) out = args.out or args.outdir / f"{args.lang}_git.jsonl" out.parent.mkdir(parents=True, exist_ok=True) with open(out, "w") as f: for ex in examples: f.write(json.dumps(ex, ensure_ascii=False) + "\n") print(f"Wrote {len(examples)} examples to {out}") elif args.repos: # Multi-repo mode from config file with open(args.repos) as f: repo_config = json.load(f) lang_to_name = { "rust": "oxidizer", "typescript": "prism", "python": "serpent", "ruby": "forge", "swift": "swiftblade", } for lang, repos in repo_config.items(): all_examples = [] prompt = prompts.get(lang, f"You are a {lang} coding agent.") for repo_path in repos: repo = Path(repo_path).expanduser() if not repo.exists(): print(f" SKIP: {repo} does not exist") continue examples = mine_repo(repo, lang, prompt, args.max_commits) all_examples.extend(examples) name = lang_to_name.get(lang, lang) out = args.outdir / f"{name}_git.jsonl" out.parent.mkdir(parents=True, exist_ok=True) with open(out, "w") as f: for ex in all_examples: f.write(json.dumps(ex, ensure_ascii=False) + "\n") print(f"Wrote {len(all_examples)} examples to {out}") else: parser.error("Provide --repo + --lang, or --repos config file") if __name__ == "__main__": main()