add training scripts: memory, specialist, mining, smoke test
This commit is contained in:
+335
@@ -0,0 +1,335 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Mine git repos for code training pairs.
|
||||
|
||||
Extracts commit-level diffs and converts them to training examples:
|
||||
user: "implement/fix/refactor X" (from commit message)
|
||||
assistant: "edit" tool_calls reconstructed from the diff (one per changed file)
|
||||
|
||||
Usage:
|
||||
python mine_repos.py --repo /path/to/repo --lang rust --out data/oxidizer_git.jsonl
|
||||
python mine_repos.py --repos repos.json --outdir data/
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Extension to language mapping; a file is mined for a language only when its
# suffix appears here with that language as the value.
EXT_TO_LANG = {
    ".rs": "rust",
    ".ts": "typescript", ".tsx": "typescript", ".mts": "typescript",
    ".py": "python", ".pyi": "python",
    ".rb": "ruby", ".erb": "ruby",
    ".swift": "swift",
}

# Max diff size per commit (chars)
MAX_DIFF_SIZE = 10_000

# Skip files matching these patterns (regexes searched against the file path).
# Directory names are anchored at a path-component boundary ("^" or "/") so
# that e.g. "target/" skips "target/..." and "crates/x/target/..." but not an
# unrelated directory such as "src/retarget/".
SKIP_PATTERNS = [
    r"\.lock$", r"\.min\.", r"(?:^|/)node_modules/", r"(?:^|/)target/",
    r"\.generated\.", r"(?:^|/)__pycache__/", r"\.pyc$",
    r"(?:^|/)Pods/", r"\.build/", r"(?:^|/)vendor/",
]
|
||||
|
||||
|
||||
def run_git(repo: Path, *args: str) -> str:
    """Run a git command inside *repo* and return its stdout.

    Args:
        repo: Directory used as the working directory for git.
        *args: Arguments passed to ``git`` verbatim (no shell is involved).

    Returns:
        The command's standard output as text. The exit status is not
        checked, so a failing command yields whatever it wrote to stdout
        (usually an empty string). A command that exceeds the 30 second
        timeout is treated as having produced no output.
    """
    try:
        result = subprocess.run(
            ["git", *args],
            cwd=repo,
            capture_output=True,
            text=True,
            # Diffs may contain binary or legacy-encoded bytes; decode
            # leniently instead of raising UnicodeDecodeError mid-run.
            encoding="utf-8",
            errors="replace",
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        # Callers already treat empty output as "nothing to mine", so one
        # slow commit should not abort the whole mining run.
        return ""
    return result.stdout
|
||||
|
||||
|
||||
def get_commits(repo: Path, lang: str, max_commits: int = 500) -> list[dict]:
    """List non-merge commits that modified files of the given language.

    Args:
        repo: Repository to inspect.
        lang: Language name as used in the values of ``EXT_TO_LANG``.
        max_commits: Value passed to ``git log --max-count``.

    Returns:
        One dict per commit with the keys ``"sha"``, ``"subject"`` and
        ``"body"``. An empty list is returned when no extension maps to
        *lang*.
    """
    pathspecs = [
        f"*{suffix}" for suffix, language in EXT_TO_LANG.items() if language == lang
    ]
    if not pathspecs:
        return []

    # Each record is "<sha>\n<subject>\n<body>\n---END---".
    raw_log = run_git(
        repo,
        "log",
        f"--max-count={max_commits}",
        "--no-merges",
        "--diff-filter=M",  # Modified files only
        "--format=%H%n%s%n%b%n---END---",
        "--",
        *pathspecs,
    )

    found = []
    for chunk in raw_log.split("---END---"):
        record = chunk.strip()
        if not record:
            continue

        # At most three fields: sha, subject, and the (possibly multi-line) body.
        fields = record.split("\n", 2)
        if len(fields) < 2:
            continue

        sha, subject, *rest = (field.strip() for field in fields)
        if not (sha and subject):
            continue

        found.append({
            "sha": sha,
            "subject": subject,
            "body": rest[0] if rest else "",
        })

    return found
|
||||
|
||||
|
||||
def _path_from_diff_header(line: str) -> str | None:
    """Return the post-image path from a ``diff --git a/X b/Y`` header line.

    Returns ``None`` when the header cannot be parsed (for example when git
    quoted the paths).
    """
    paths = line[len("diff --git "):]

    # Without a rename the header is exactly "a/X b/X", so splitting in the
    # middle is unambiguous even if X itself contains "b/" or spaces.
    mid = len(paths) // 2
    left, right = paths[:mid], paths[mid + 1:]
    if (
        paths[mid:mid + 1] == " "
        and left.startswith("a/")
        and right.startswith("b/")
        and left[2:] == right[2:]
    ):
        return right[2:]

    # Renames/copies have different names on each side; take the first
    # " b/" separator as the boundary.
    match = re.match(r"a/.+? b/(.+)$", paths)
    return match.group(1) if match else None


def get_diff(repo: Path, sha: str, lang: str) -> list[dict]:
    """Get per-file diffs for a commit, filtered by language.

    Runs ``git diff <sha>~1..<sha> --unified=3`` and splits the output at
    each ``diff --git`` header.

    Args:
        repo: Repository to inspect.
        sha: Commit whose changes against its first parent are extracted.
        lang: Language name as used in the values of ``EXT_TO_LANG``.

    Returns:
        A list of ``{"file": path, "diff": text}`` dicts, where ``text`` is
        every line git printed after that file's ``diff --git`` header,
        joined with newlines. Files whose suffix does not map to *lang* or
        whose path matches ``SKIP_PATTERNS`` are omitted. An empty list is
        returned when the diff is empty or longer than ``MAX_DIFF_SIZE``
        characters.
    """
    extensions = {ext for ext, name in EXT_TO_LANG.items() if name == lang}

    diff = run_git(repo, "diff", f"{sha}~1..{sha}", "--unified=3")
    if not diff or len(diff) > MAX_DIFF_SIZE:
        return []

    # Parse into per-file hunks
    files: list[dict] = []
    current_file: str | None = None
    current_hunks: list[str] = []

    for line in diff.split("\n"):
        if not line.startswith("diff --git"):
            # Body line: keep it only while inside a file we want.
            if current_file is not None:
                current_hunks.append(line)
            continue

        # A new file section starts: store the previous one first.
        if current_file and current_hunks:
            files.append({"file": current_file, "diff": "\n".join(current_hunks)})
        current_hunks = []

        fname = _path_from_diff_header(line)
        # Skip non-target and generated files
        wanted = (
            fname is not None
            and Path(fname).suffix in extensions
            and not any(re.search(p, fname) for p in SKIP_PATTERNS)
        )
        current_file = fname if wanted else None

    if current_file and current_hunks:
        files.append({"file": current_file, "diff": "\n".join(current_hunks)})

    return files
|
||||
|
||||
|
||||
def commit_to_example(
|
||||
commit: dict,
|
||||
file_diffs: list[dict],
|
||||
system_prompt: str,
|
||||
) -> dict | None:
|
||||
"""Convert a commit + diffs to a training example."""
|
||||
if not file_diffs:
|
||||
return None
|
||||
|
||||
# Build user message from commit message
|
||||
user_msg = commit["subject"]
|
||||
if commit["body"]:
|
||||
user_msg += "\n\n" + commit["body"]
|
||||
|
||||
# Build assistant tool calls: read each file, then edit
|
||||
messages: list[dict[str, Any]] = [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_msg},
|
||||
]
|
||||
|
||||
for fd in file_diffs:
|
||||
# Parse diff into old/new hunks for edit tool calls
|
||||
old_lines = []
|
||||
new_lines = []
|
||||
for line in fd["diff"].split("\n"):
|
||||
if line.startswith("-") and not line.startswith("---"):
|
||||
old_lines.append(line[1:])
|
||||
elif line.startswith("+") and not line.startswith("+++"):
|
||||
new_lines.append(line[1:])
|
||||
|
||||
if not old_lines and not new_lines:
|
||||
continue
|
||||
|
||||
old_text = "\n".join(old_lines)
|
||||
new_text = "\n".join(new_lines)
|
||||
|
||||
if old_text and new_text:
|
||||
# Edit operation
|
||||
messages.append({
|
||||
"role": "assistant",
|
||||
"content": None,
|
||||
"tool_calls": [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "edit",
|
||||
"arguments": {
|
||||
"filePath": fd["file"],
|
||||
"oldString": old_text,
|
||||
"newString": new_text,
|
||||
},
|
||||
},
|
||||
}],
|
||||
})
|
||||
messages.append({
|
||||
"role": "tool",
|
||||
"content": "Edit applied successfully.",
|
||||
})
|
||||
elif new_text and not old_text:
|
||||
# New content added
|
||||
messages.append({
|
||||
"role": "assistant",
|
||||
"content": None,
|
||||
"tool_calls": [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "edit",
|
||||
"arguments": {
|
||||
"filePath": fd["file"],
|
||||
"oldString": "",
|
||||
"newString": new_text,
|
||||
},
|
||||
},
|
||||
}],
|
||||
})
|
||||
messages.append({
|
||||
"role": "tool",
|
||||
"content": "Edit applied successfully.",
|
||||
})
|
||||
|
||||
# Add summary response
|
||||
files_touched = [fd["file"] for fd in file_diffs]
|
||||
messages.append({
|
||||
"role": "assistant",
|
||||
"content": f"Applied changes to {', '.join(files_touched)}.",
|
||||
})
|
||||
|
||||
if len(messages) < 4: # system + user + at least one tool call + summary
|
||||
return None
|
||||
|
||||
return {"messages": messages, "metadata": {"sha": commit["sha"]}}
|
||||
|
||||
|
||||
def mine_repo(
    repo: Path,
    lang: str,
    system_prompt: str,
    max_commits: int = 500,
) -> list[dict]:
    """Build training examples from one repository.

    Looks up the relevant commits, fetches each commit's per-file diffs and
    converts them to examples; commits that yield no example are dropped.
    Progress is reported on stdout.
    """
    print(f" Mining {repo} for {lang}...")

    candidates = get_commits(repo, lang, max_commits)
    print(f" Found {len(candidates)} relevant commits")

    mined = [
        built
        for entry in candidates
        if (
            built := commit_to_example(
                entry, get_diff(repo, entry["sha"], lang), system_prompt
            )
        )
    ]

    print(f" Generated {len(mined)} training examples")
    return mined
|
||||
|
||||
|
||||
def _write_jsonl(out: Path, examples: list[dict]) -> None:
    """Write *examples* to *out* as UTF-8 JSON Lines, creating parent dirs."""
    out.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: the records are dumped with ensure_ascii=False, which
    # a non-UTF-8 locale default encoding may be unable to represent.
    with open(out, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in examples)
    print(f"Wrote {len(examples)} examples to {out}")


def main() -> None:
    """Command-line entry point.

    Two modes are supported:

    * ``--repo`` + ``--lang``: mine one repository and write the examples to
      ``--out`` (default ``<outdir>/<lang>_git.jsonl``).
    * ``--repos``: read a JSON object mapping language to a list of repo
      paths and write one JSONL file per language into ``--outdir``.

    Exits through ``parser.error`` when neither mode is selected or when the
    single ``--repo`` path does not exist.
    """
    parser = argparse.ArgumentParser(description="Mine git repos for training data")
    parser.add_argument("--repo", type=Path, help="Single repo path")
    parser.add_argument("--lang", help="Language: rust, typescript, python, ruby, swift")
    parser.add_argument("--out", type=Path, help="Output JSONL file")
    parser.add_argument(
        "--repos",
        type=Path,
        help="JSON file mapping lang → list of repo paths",
    )
    parser.add_argument("--outdir", type=Path, default=Path("data"), help="Output dir")
    parser.add_argument(
        "--agents-dir",
        type=Path,
        default=Path.home() / ".config/opencode/agents",
        help="Agent system prompt directory",
    )
    parser.add_argument("--max-commits", type=int, default=500)
    args = parser.parse_args()

    # Load system prompts, falling back to a generic one when the agent file
    # for a language is missing.
    prompt_files = {
        "rust": "build-rust.md",
        "typescript": "build-ts.md",
        "python": "build-python.md",
        "ruby": "build-ruby.md",
        "swift": "build-swift.md",
    }
    prompts = {}
    for lang, fname in prompt_files.items():
        path = args.agents_dir / fname
        if path.exists():
            prompts[lang] = path.read_text(encoding="utf-8").strip()
        else:
            prompts[lang] = f"You are a {lang} coding agent."

    if args.repo and args.lang:
        # Single repo mode
        if not args.repo.exists():
            # Fail with a clear message instead of an OSError raised from
            # subprocess when git is started in a missing directory.
            parser.error(f"--repo path does not exist: {args.repo}")
        prompt = prompts.get(args.lang, f"You are a {args.lang} coding agent.")
        examples = mine_repo(args.repo, args.lang, prompt, args.max_commits)
        out = args.out or args.outdir / f"{args.lang}_git.jsonl"
        _write_jsonl(out, examples)

    elif args.repos:
        # Multi-repo mode from config file
        with open(args.repos, encoding="utf-8") as f:
            repo_config = json.load(f)

        # Output files are named after the per-language dataset name;
        # languages without an entry fall back to the language name itself.
        lang_to_name = {
            "rust": "oxidizer",
            "typescript": "prism",
            "python": "serpent",
            "ruby": "forge",
            "swift": "swiftblade",
        }

        for lang, repos in repo_config.items():
            all_examples = []
            prompt = prompts.get(lang, f"You are a {lang} coding agent.")
            for repo_path in repos:
                repo = Path(repo_path).expanduser()
                if not repo.exists():
                    print(f" SKIP: {repo} does not exist")
                    continue
                all_examples.extend(mine_repo(repo, lang, prompt, args.max_commits))

            name = lang_to_name.get(lang, lang)
            _write_jsonl(args.outdir / f"{name}_git.jsonl", all_examples)

    else:
        parser.error("Provide --repo + --lang, or --repos config file")
|
||||
|
||||
|
||||
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user