Skip to content

Commit 12da515

Browse files
committed
backfill: basic functionality and tests
The default behavior of 'git backfill' is to fetch all missing blobs that are reachable from HEAD. Document and test this behavior. The implementation is a very simple use of the path-walk API, initializing the revision walk at HEAD to start the path-walk from all commits reachable from HEAD. Ignore the object arrays that correspond to tree entries, assuming that they are all present already. Signed-off-by: Derrick Stolee <[email protected]>
1 parent e872c9a commit 12da515

File tree

4 files changed

+221
-4
lines changed

4 files changed

+221
-4
lines changed

Documentation/git-backfill.txt

+24
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,30 @@ SYNOPSIS
1414
DESCRIPTION
1515
-----------
1616

17+
Blobless partial clones are created using `git clone --filter=blob:none`
18+
which configures the local repository such that the Git client avoids
19+
downloading blob objects unless they are required for a local operation.
20+
This initially means that the clone and later fetches download reachable
21+
commits and trees but no blobs. Later operations that change the `HEAD`
22+
pointer, such as `git checkout` or `git merge`, may need to download
23+
missing blobs in order to complete their operation.
24+
25+
In the worst cases, commands that compute blob diffs, such as `git blame`,
26+
become very slow as they download the missing blobs in single-blob
27+
requests to satisfy the missing object as the Git command needs it. This
28+
leads to multiple download requests and no ability for the Git server to
29+
provide delta compression across those objects.
30+
31+
The `git backfill` command provides a way for the user to request that
32+
Git downloads the missing blobs (with optional filters) such that the
33+
missing blobs representing historical versions of files can be downloaded
34+
in batches. The `backfill` command attempts to optimize the request by
35+
grouping blobs that appear at the same path, hopefully leading to good
36+
delta compression in the packfile sent by the server.
37+
38+
By default, `git backfill` downloads all blobs reachable from the `HEAD`
39+
commit. This set can be restricted or expanded using various options.
40+
1741
SEE ALSO
1842
--------
1943
linkgit:git-clone[1].

Documentation/technical/api-path-walk.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -60,4 +60,5 @@ Examples
6060
--------
6161

6262
See example usages in:
63-
`t/helper/test-path-walk.c`
63+
`t/helper/test-path-walk.c`,
64+
`builtin/backfill.c`

builtin/backfill.c

+101-3
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,116 @@
11
#include "builtin.h"
2+
#include "git-compat-util.h"
23
#include "config.h"
34
#include "parse-options.h"
45
#include "repository.h"
6+
#include "commit.h"
7+
#include "hex.h"
8+
#include "tree.h"
9+
#include "tree-walk.h"
510
#include "object.h"
11+
#include "object-store-ll.h"
12+
#include "oid-array.h"
13+
#include "oidset.h"
14+
#include "promisor-remote.h"
15+
#include "strmap.h"
16+
#include "string-list.h"
17+
#include "revision.h"
18+
#include "trace2.h"
19+
#include "progress.h"
20+
#include "packfile.h"
21+
#include "path-walk.h"
622

723
static const char * const builtin_backfill_usage[] = {
824
N_("git backfill [<options>]"),
925
NULL
1026
};
1127

28+
struct backfill_context {
29+
struct repository *repo;
30+
struct oid_array current_batch;
31+
size_t batch_size;
32+
};
33+
34+
static void clear_backfill_context(struct backfill_context *ctx)
35+
{
36+
oid_array_clear(&ctx->current_batch);
37+
}
38+
39+
static void download_batch(struct backfill_context *ctx)
40+
{
41+
promisor_remote_get_direct(ctx->repo,
42+
ctx->current_batch.oid,
43+
ctx->current_batch.nr);
44+
oid_array_clear(&ctx->current_batch);
45+
46+
/*
47+
* We likely have a new packfile. Add it to the packed list to
48+
* avoid possible duplicate downloads of the same objects.
49+
*/
50+
reprepare_packed_git(ctx->repo);
51+
}
52+
53+
/*
 * Path-walk callback: queue the blobs from 'list' that are not available
 * locally and download the queue once it has grown large enough.
 *
 *   path - path the objects were found at (unused)
 *   list - object IDs reported for this path
 *   type - object type shared by every entry in 'list'
 *   data - the struct backfill_context for this run
 *
 * Lists of any type other than OBJ_BLOB are ignored. Always returns 0.
 */
static int fill_missing_blobs(const char *path UNUSED,
			      struct oid_array *list,
			      enum object_type type,
			      void *data)
{
	struct backfill_context *ctx = data;

	if (type != OBJ_BLOB)
		return 0;

	for (size_t idx = 0; idx < list->nr; idx++) {
		const struct object_id *oid = &list->oid[idx];
		off_t disk_size = 0;
		struct object_info oi = OBJECT_INFO_INIT;

		oi.disk_sizep = &disk_size;

		/*
		 * A successful lookup that reports a non-zero on-disk size
		 * means the blob is already here; anything else is queued.
		 * NOTE(review): OBJECT_INFO_FOR_PREFETCH presumably keeps
		 * this lookup from triggering a fetch of its own -- confirm.
		 */
		if (!oid_object_info_extended(ctx->repo, oid, &oi,
					      OBJECT_INFO_FOR_PREFETCH) &&
		    disk_size)
			continue;

		oid_array_append(&ctx->current_batch, oid);
	}

	/*
	 * The threshold is only checked after the whole list has been
	 * processed, so a batch may exceed batch_size.
	 */
	if (ctx->current_batch.nr >= ctx->batch_size)
		download_batch(ctx);

	return 0;
}
80+
81+
static int do_backfill(struct backfill_context *ctx)
82+
{
83+
struct rev_info revs;
84+
struct path_walk_info info = PATH_WALK_INFO_INIT;
85+
int ret;
86+
87+
repo_init_revisions(ctx->repo, &revs, "");
88+
handle_revision_arg("HEAD", &revs, 0, 0);
89+
90+
info.blobs = 1;
91+
info.tags = info.commits = info.trees = 0;
92+
93+
info.revs = &revs;
94+
info.path_fn = fill_missing_blobs;
95+
info.path_fn_data = ctx;
96+
97+
ret = walk_objects_by_path(&info);
98+
99+
/* Download the objects that did not fill a batch. */
100+
if (!ret)
101+
download_batch(ctx);
102+
103+
clear_backfill_context(ctx);
104+
return ret;
105+
}
106+
12107
int cmd_backfill(int argc, const char **argv, const char *prefix, struct repository *repo)
13108
{
109+
struct backfill_context ctx = {
110+
.repo = repo,
111+
.current_batch = OID_ARRAY_INIT,
112+
.batch_size = 16000,
113+
};
14114
struct option options[] = {
15115
OPT_END(),
16116
};
@@ -23,7 +123,5 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit
23123

24124
repo_config(repo, git_default_config, NULL);
25125

26-
die(_("not implemented"));
27-
28-
return 0;
126+
return do_backfill(&ctx);
29127
}

t/t5620-backfill.sh

+94
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
#!/bin/sh
2+
3+
test_description='git backfill on partial clones'
4+
5+
GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
6+
export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
7+
8+
. ./test-lib.sh
9+
10+
# We create objects in the 'src' repo.
11+
test_expect_success 'setup repo for object creation' '
	echo "{print \$1}" >print_1.awk &&
	echo "{print \$2}" >print_2.awk &&

	git init src &&
	mkdir -p src/a/b/c src/d/e &&

	# Each (i, n) step rewrites file.$n.txt in all six directories and
	# commits the result: 6 * 4 * 2 = 48 distinct blobs overall.
	for i in 1 2
	do
		for n in 1 2 3 4
		do
			for dir in "" a/ a/b/ a/b/c/ d/ d/e/
			do
				echo "Version $i of file $dir$n" \
					>"src/${dir}file.$n.txt" ||
				return 1
			done &&
			git -C src add . &&
			git -C src commit -m "Iteration $n" || return 1
		done
	done
'
35+
36+
# Clone 'src' into 'srv.bare' so we have a bare repo to be our origin
37+
# server for the partial clone.
38+
test_expect_success 'setup bare clone for server' '
	git clone --bare "file://$(pwd)/src" srv.bare &&

	# Enable the two upload-pack settings on the bare server repository.
	for key in allowfilter allowanysha1inwant
	do
		git -C srv.bare config --local uploadpack.$key 1 || return 1
	done
'
43+
44+
# do basic partial clone from "srv.bare"
45+
test_expect_success 'do partial clone 1, backfill gets all objects' '
	git clone --no-checkout --filter=blob:none \
		--single-branch --branch=main \
		"file://$(pwd)/srv.bare" backfill1 &&

	# With no options, backfill fetches everything reachable from HEAD.
	trace="$(pwd)/backfill-file-trace" &&
	GIT_TRACE2_EVENT="$trace" git -C backfill1 backfill &&

	# The trace must record a promisor fetch_count of 48.
	test_trace2_data promisor fetch_count 48 <"$trace" &&

	# Nothing reachable from HEAD may be reported as missing any more.
	git -C backfill1 rev-list --quiet --objects --missing=print HEAD \
		>revs2 &&
	test_line_count = 0 revs2
'
61+
62+
. "$TEST_DIRECTORY"/lib-httpd.sh
63+
start_httpd
64+
65+
test_expect_success 'create a partial clone over HTTP' '
	SERVER="$HTTPD_DOCUMENT_ROOT_PATH/server" &&
	rm -rf "$SERVER" repo &&
	git clone --bare "file://$(pwd)/src" "$SERVER" &&

	# Set both upload-pack options on the HTTP-served repository.
	for key in allowfilter allowanysha1inwant
	do
		test_config -C "$SERVER" uploadpack.$key 1 || return 1
	done &&

	git clone --no-checkout --filter=blob:none \
		"$HTTPD_URL/smart/server" backfill-http
'
75+
76+
test_expect_success 'backfilling over HTTP succeeds' '
	trace="$(pwd)/backfill-http-trace" &&
	GIT_TRACE2_EVENT="$trace" git -C backfill-http backfill &&

	# The trace must record a promisor fetch_count of 48.
	test_trace2_data promisor fetch_count 48 <"$trace" &&

	# List every object reachable from any ref and verify that
	# cat-file reports none of them as missing.
	git -C backfill-http rev-list --objects --all >rev-list-out &&
	awk "{print \$1;}" <rev-list-out >oids &&
	GIT_TRACE2_EVENT="$(pwd)/walk-trace" git -C backfill-http \
		cat-file --batch-check <oids >batch-out &&
	! grep missing batch-out
'
90+
91+
# DO NOT add non-httpd-specific tests here, because the last part of this
92+
# test script is only executed when httpd is available and enabled.
93+
94+
test_done

0 commit comments

Comments
 (0)