Skip to content

Commit 406323a

Browse files
authored
Merge pull request #3 from wandb/git-background-tracking
Git background tracking
2 parents 2e57759 + 4da1e34 commit 406323a

File tree

6 files changed

+206
-87
lines changed

6 files changed

+206
-87
lines changed

docs/design/git_integration_rework.md

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,29 @@
1+
# Design Document: Git Integration Rework for Programmer
2+
3+
## Overview
4+
This document outlines the proposed changes to the Git integration feature of the `programmer` tool. The goal is to let the tool's edits appear in the user's working directory in real time while a separate commit history is maintained in a `programmer-<session>` branch, without changing the user's checked-out branch, HEAD, or staging area.
5+
6+
## Objectives
7+
- Allow users to see changes in real-time using tools like VSCode's Git view.
8+
- Maintain a separate commit history in the background in a `programmer-<session>` branch.
9+
- Avoid using stash or switching the HEAD, ensuring the user's working state remains unchanged.
10+
11+
## Proposed Solution
12+
To achieve the above objectives, we propose the following solution:
13+
14+
### Session Branch Management
15+
1. **Initialization**: At the start of a session, initialize a `programmer-<session>` branch based on the current state of the user's branch.
16+
17+
2. **Change Tracking**: Monitor file changes in the working directory and record them as commits on the session branch. Recording a commit must not switch the user's current branch or modify the files in the working directory.
18+
19+
3. **Commit History**: Maintain a separate commit history in the `programmer-<session>` branch to allow for session management and review without interfering with the user's workflow.
20+
21+
## Benefits
22+
- Users can continue using their preferred development tools and see live changes.
23+
- Maintains a clean separation of session history, enabling better session management and review.
24+
25+
## Challenges
26+
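- Requires careful handling of Git's internal mechanisms (for example, staging through a temporary index file and creating commits with `git write-tree`, `git commit-tree`, and `git update-ref`) so that commits can be written to the session branch without touching the user's HEAD or index.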
27+
28+
## Conclusion
29+
By leveraging a separate session branch, we can achieve seamless integration of the `programmer` tool with the user's workflow, providing real-time feedback and maintaining a comprehensive session history without disrupting the user's development environment.

programmer/environment.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -3,10 +3,8 @@
33
from contextvars import ContextVar
44
from contextlib import contextmanager
55

6-
76
from .git import GitRepo
87

9-
108
@dataclass
119
class EnvironmentSnapshotKey:
1210
env_id: str
@@ -49,15 +47,17 @@ def start_session(self, session_id: str):
4947
self.original_git_ref = self.repo.get_current_head()
5048
self.programmer_branch = f"programmer-{session_id}"
5149
print("programmer_branch:", self.programmer_branch)
52-
self.repo.checkout_new(self.programmer_branch)
50+
# Create the programmer branch based on the current state
51+
self.repo.create_branch(self.programmer_branch)
5352

5453
def finish_session(self):
5554
if self.original_git_ref is None or self.programmer_branch is None:
5655
raise ValueError("Session not started")
57-
self.repo.checkout_and_copy(self.original_git_ref)
56+
# No need to check out the original ref again: the session never switched branches
5857

5958
def make_snapshot(self, message: str) -> EnvironmentSnapshotKey:
60-
commit_hash = self.repo.add_all_and_commit(message)
59+
# Commit directly to the programmer branch using new method
60+
commit_hash = self.repo.commit_directly_to_branch(self.programmer_branch, message)
6161
return EnvironmentSnapshotKey(
6262
"git", {"origin": self.repo.get_origin_url(), "commit": commit_hash}
6363
)

programmer/git.py

Lines changed: 48 additions & 80 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,7 @@
1-
from git import Repo, InvalidGitRepositoryError, GitCommandError
21
import os
2+
from git import Repo, InvalidGitRepositoryError, GitCommandError
33
from typing import Optional
44
import tempfile
5-
import shutil
65

76

87
class GitRepo:
@@ -11,101 +10,70 @@ def __init__(self, repo: Repo):
1110

1211
@classmethod
1312
def from_current_dir(cls) -> Optional["GitRepo"]:
14-
"""
15-
Create a GitRepo instance from the current working directory or its parent directories.
16-
17-
Returns:
18-
GitRepo instance if in a Git repository, None otherwise.
19-
"""
2013
try:
2114
repo = Repo(os.getcwd(), search_parent_directories=True)
2215
return cls(repo)
2316
except InvalidGitRepositoryError:
2417
return None
2518

2619
def get_origin_url(self) -> Optional[str]:
27-
"""
28-
Get the remote URL (e.g., GitHub URL) for this repository.
29-
30-
Returns:
31-
The remote URL as a string if it exists, None otherwise.
32-
"""
3320
try:
3421
remote_url = self.repo.remotes.origin.url
3522
return remote_url if remote_url else None
3623
except AttributeError:
37-
# No remote named 'origin' exists
3824
return None
3925

40-
def checkout_existing(self, ref: str) -> None:
41-
self.repo.git.checkout(ref)
26+
def create_branch(self, branch_name: str) -> None:
27+
if branch_name not in self.repo.heads:
28+
# Create a new branch from the current HEAD
29+
self.repo.git.branch(branch_name, self.repo.head.commit.hexsha)
4230

43-
def checkout_new(self, branch_name: str) -> None:
44-
self.repo.git.checkout(b=branch_name)
31+
def commit_directly_to_branch(self, branch_name: str, message: str) -> str:
32+
# Ensure the branch is initialized
33+
self.create_branch(branch_name)
4534

46-
def get_current_head(self) -> str:
47-
"""
48-
Get the current HEAD of the repository, which is either the branch name or the commit SHA.
35+
# Use a temporary index file to stage files without affecting the actual index.
36+
with tempfile.TemporaryDirectory() as temp_dir:
37+
temp_index_file = os.path.join(temp_dir, "index")
38+
env = os.environ.copy()
39+
env["GIT_INDEX_FILE"] = temp_index_file
40+
41+
# Add all files from the working directory to the temporary index
42+
self.repo.git.add(A=True, env=env)
43+
44+
# Determine the parent commit
45+
parent_commit = self.repo.commit(branch_name)
46+
47+
# Check for changes between parent_commit and the temporary index
48+
diff_output = self.repo.git.diff(parent_commit.hexsha, "--cached", env=env)
49+
50+
if not diff_output.strip():
51+
# No changes to commit
52+
return parent_commit.hexsha
53+
54+
# Write the tree from the temporary index
55+
tree = self.repo.git.write_tree(env=env)
4956

50-
Returns:
51-
The branch name if on a branch, otherwise the commit SHA.
52-
"""
57+
# print(
58+
# f"Committing to branch {branch_name}, parent commit: {parent_commit.hexsha}"
59+
# )
60+
61+
# Set author information using environment variables
62+
env["GIT_AUTHOR_NAME"] = "programmer"
63+
env["GIT_AUTHOR_EMAIL"] = "[email protected]"
64+
65+
# Use the Repo's git command interface to run `git commit-tree`, which creates the commit object
66+
commit_hash = self.repo.git.commit_tree(
67+
tree, "-p", parent_commit.hexsha, "-m", message, env=env
68+
)
69+
70+
# Update the branch reference to point to the new commit
71+
self.repo.git.update_ref(f"refs/heads/{branch_name}", commit_hash)
72+
73+
return commit_hash
74+
75+
def get_current_head(self) -> str:
5376
if self.repo.head.is_detached:
5477
return str(self.repo.head.commit.hexsha)
5578
else:
5679
return str(self.repo.active_branch.name)
57-
58-
def checkout_and_copy(self, to_ref: str) -> None:
59-
current_branch = self.get_current_head()
60-
61-
with tempfile.TemporaryDirectory() as temp_dir:
62-
# Get all changed files between current branch and to_ref
63-
changed_files = self.repo.git.diff(
64-
"--name-only", current_branch, to_ref
65-
).splitlines()
66-
67-
# Copy changed files to the temp directory
68-
for file in changed_files:
69-
src_path = os.path.join(self.repo.working_tree_dir, file)
70-
dst_path = os.path.join(temp_dir, file)
71-
os.makedirs(os.path.dirname(dst_path), exist_ok=True)
72-
if os.path.exists(src_path):
73-
shutil.copy2(src_path, dst_path)
74-
75-
# Checkout the target branch
76-
self.repo.git.checkout(to_ref)
77-
78-
# Copy files from temp directory to the working directory
79-
for root, _, files in os.walk(temp_dir):
80-
for file in files:
81-
src_path = os.path.join(root, file)
82-
rel_path = os.path.relpath(src_path, temp_dir)
83-
dst_path = os.path.join(self.repo.working_tree_dir, rel_path)
84-
os.makedirs(os.path.dirname(dst_path), exist_ok=True)
85-
shutil.copy2(src_path, dst_path)
86-
87-
def add_all_and_commit(self, message: str) -> str:
88-
"""
89-
Add all files (including untracked ones) and commit them.
90-
91-
Args:
92-
message: The commit message.
93-
94-
Returns:
95-
The SHA of the new commit if changes were committed,
96-
or the current HEAD's SHA if no changes were made.
97-
98-
Raises:
99-
GitCommandError: If there are issues with Git operations.
100-
"""
101-
# Add all files, including untracked ones
102-
self.repo.git.add(A=True)
103-
104-
# Check if there are changes to commit
105-
if self.repo.is_dirty(untracked_files=True):
106-
# Commit the changes
107-
commit = self.repo.index.commit(message)
108-
return commit.hexsha
109-
else:
110-
# If no changes, return the current HEAD's SHA
111-
return self.repo.head.commit.hexsha

programmer/programmer.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -99,7 +99,7 @@ def main():
9999
logging_mode = SettingsManager.get_setting("weave_logging")
100100
if logging_mode == "cloud":
101101
curdir = os.path.basename(os.path.abspath(os.curdir))
102-
weave.init(f"programmerdev1-{curdir}")
102+
weave.init(f"programmer-{curdir}")
103103
elif logging_mode == "local":
104104
weave.init_local_client()
105105

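[File name missing from this capture; the contents below are a new pytest module that tests `programmer.git.GitRepo`.]

Lines changed: 122 additions & 0 deletions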
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,122 @@
1+
import pytest
2+
import tempfile
3+
import shutil
4+
import os
5+
from git import Repo
6+
from programmer.git import GitRepo
7+
8+
9+
@pytest.fixture
10+
def setup_repo():
11+
# Create a temporary directory for the repository
12+
test_dir = tempfile.mkdtemp()
13+
repo = Repo.init(test_dir)
14+
git_repo = GitRepo(repo)
15+
16+
# Set up user config for the test repo
17+
with repo.config_writer() as config:
18+
config.set_value("user", "name", "Test User")
19+
config.set_value("user", "email", "[email protected]")
20+
21+
# Create an initial commit so HEAD exists
22+
initial_file_path = os.path.join(test_dir, "initial.txt")
23+
with open(initial_file_path, "w") as f:
24+
f.write("Initial content")
25+
repo.index.add([initial_file_path])
26+
repo.index.commit("Initial commit")
27+
28+
# Create and checkout the main branch
29+
main_branch = repo.create_head('main')
30+
main_branch.checkout()
31+
32+
yield repo, git_repo, test_dir
33+
34+
# Remove the temporary directory after the test
35+
shutil.rmtree(test_dir)
36+
37+
38+
def test_commit_directly_to_branch(setup_repo):
39+
repo, git_repo, test_dir = setup_repo
40+
41+
# Create and commit a file in the main branch
42+
file_path = os.path.join(test_dir, "test_file.py")
43+
with open(file_path, "w") as f:
44+
f.write("print('Hello, world!')\n")
45+
46+
repo.index.add([file_path])
47+
repo.index.commit("Initial commit on main")
48+
49+
# Modify the file
50+
with open(file_path, "a") as f:
51+
f.write("print('Another line')\n")
52+
53+
# Commit changes to the programmer-<session> branch
54+
session_branch_name = "programmer-session"
55+
git_repo.create_branch(session_branch_name)
56+
commit_message = "Commit from programmer session"
57+
git_repo.commit_directly_to_branch(session_branch_name, commit_message)
58+
59+
# Verify the commit in the programmer-<session> branch
60+
session_branch_commit = repo.commit(session_branch_name)
61+
tree_files = session_branch_commit.tree.traverse()
62+
file_names = [item.path for item in tree_files]
63+
64+
assert "test_file.py" in file_names
65+
66+
# Verify the content of the file in the commit
67+
blob_data = (
68+
session_branch_commit.tree["test_file.py"].data_stream.read().decode("utf-8")
69+
)
70+
assert "print('Another line')" in blob_data
71+
72+
# Verify that the main branch is unaffected
73+
main_branch_commit = repo.commit("main")
74+
assert main_branch_commit.hexsha != session_branch_commit.hexsha
75+
76+
77+
def test_no_empty_commit(setup_repo):
78+
repo, git_repo, _ = setup_repo
79+
80+
# Create the programmer-<session> branch (it is not checked out)
81+
session_branch_name = "programmer-session"
82+
git_repo.create_branch(session_branch_name)
83+
84+
# Commit changes to the programmer-<session> branch
85+
commit_message = "Commit from programmer session"
86+
initial_commit_sha = repo.commit(session_branch_name).hexsha
87+
commit_sha = git_repo.commit_directly_to_branch(session_branch_name, commit_message)
88+
89+
# Verify that the SHA returned is equal to the initial commit SHA
90+
assert initial_commit_sha == commit_sha
91+
92+
# Verify no new commit is created
93+
new_commit_sha = repo.commit(session_branch_name).hexsha
94+
assert initial_commit_sha == new_commit_sha
95+
96+
97+
def test_multiple_commits_with_empty(setup_repo):
98+
repo, git_repo, test_dir = setup_repo
99+
100+
# Create the programmer-<session> branch (it is not checked out)
101+
session_branch_name = "programmer-session"
102+
git_repo.create_branch(session_branch_name)
103+
initial_commit_sha = repo.commit(session_branch_name).hexsha
104+
105+
# First commit (empty)
106+
commit_message = "First empty commit"
107+
commit_sha_1 = git_repo.commit_directly_to_branch(session_branch_name, commit_message)
108+
assert commit_sha_1 == initial_commit_sha
109+
110+
# Second commit (non-empty)
111+
file_path = os.path.join(test_dir, "test_file.py")
112+
with open(file_path, "w") as f:
113+
f.write("print('Hello, world!')\n")
114+
115+
git_repo.commit_directly_to_branch(session_branch_name, "Second commit with changes")
116+
commit_sha_2 = repo.commit(session_branch_name).hexsha
117+
assert commit_sha_2 != initial_commit_sha
118+
119+
# Third commit (empty)
120+
commit_sha_3 = git_repo.commit_directly_to_branch(session_branch_name, "Third empty commit")
121+
assert commit_sha_3 == commit_sha_2
122+
assert commit_sha_3 != initial_commit_sha

programmer/tools.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
import subprocess
55
import weave
66

7-
LENGTH_LIMIT = 10000
7+
LENGTH_LIMIT = 30000
88

99

1010
def read_image_as_base64(path: str):

0 commit comments

Comments
 (0)