Skip to content

Commit 0bbac43

Browse files
dschoKevin Willford
authored and
Kevin Willford
committed
Merge 'gvfs/midx-sort-by-1st-byte'
This is part 3 of 3. This PR replays an earlier attempt without the two bugs it introduced (fanout problem for byte `00`; dropping large offsets). The previous PRs protect us from writing a bad MIDX if other bugs are introduced, and test the MIDX feature more rigorously. If we ever do see a problematic MIDX in the wild, we can use `git midx --verify` to inspect it and see what is wrong. With the creation of this PR, I will run the gamut of GVFS tests and add a `git midx --verify` step to the OS Repo Tests (once the GitForWindows update is in master). --- The multi-pack index requires a sorted list of objects to create a binary-searchable index of objects across multiple pack-files. If an object appears in multiple packs, then we select a single copy based on the most-recent mtime among packs containing that object. In the case of many duplicate objects, or simply many objects, we can speed up the de-duplication by processing objects in batches. Using the first byte of the object ID is a natural way to batch because the MIDX and IDX files have a fanout table based on the first byte. This gives us a way to navigate directly to the objects of each batch in each source. To process a batch, create an array of MIDX entries for each object matching that first byte value, then sort by OID (breaking ties by recently-modified packs first). Then copy the first instance of an object to the final object list. Since the pack-by-pack loading happens in builtin/midx.c, move this de-duplication to that file and add an expectation to write_midx_file() that the input object list is sorted and de-duplicated. Note: this commit includes fixes for the bugs introduced by a previous version ("midx: batch object sort by first byte"). Signed-off-by: Derrick Stolee <[email protected]>
2 parents b08a9c2 + 1b5be2f commit 0bbac43

File tree

3 files changed

+181
-108
lines changed

3 files changed

+181
-108
lines changed

builtin/midx.c

Lines changed: 145 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -31,19 +31,157 @@ static struct opts_midx {
3131
struct object_id old_midx_oid;
3232
} opts;
3333

34+
static int midx_oid_compare(const void *_a, const void *_b)
35+
{
36+
struct pack_midx_entry *a = (struct pack_midx_entry *)_a;
37+
struct pack_midx_entry *b = (struct pack_midx_entry *)_b;
38+
int cmp = oidcmp(&a->oid, &b->oid);
39+
40+
if (cmp)
41+
return cmp;
42+
43+
if (a->pack_mtime > b->pack_mtime)
44+
return -1;
45+
else if (a->pack_mtime < b->pack_mtime)
46+
return 1;
47+
48+
return a->pack_int_id - b->pack_int_id;
49+
}
50+
51+
static uint32_t get_pack_fanout(struct packed_git *p, uint32_t value)
52+
{
53+
const uint32_t *level1_ofs = p->index_data;
54+
55+
if (!level1_ofs) {
56+
if (open_pack_index(p))
57+
return 0;
58+
level1_ofs = p->index_data;
59+
}
60+
61+
if (p->index_version > 1) {
62+
level1_ofs += 2;
63+
}
64+
65+
return ntohl(level1_ofs[value]);
66+
}
67+
68+
/*
69+
* It is possible to artificially get into a state where there are many
70+
* duplicate copies of objects. That can create high memory pressure if
71+
* we are to create a list of all objects before de-duplication. To reduce
72+
* this memory pressure without a significant performance drop, automatically
73+
* group objects by the first byte of their object id. Use the IDX fanout
74+
* tables to group the data, copy to a local array, then sort.
75+
*
76+
* Copy only the de-duplicated entries (selected by most-recent modified time
77+
* of a packfile containing the object).
78+
*/
79+
static void dedupe_and_sort_entries(
80+
struct packed_git **packs, uint32_t nr_packs,
81+
struct midxed_git *midx,
82+
struct pack_midx_entry **objects, uint32_t *nr_objects)
83+
{
84+
uint32_t first_byte, i;
85+
struct pack_midx_entry *objects_batch = NULL;
86+
uint32_t nr_objects_batch = 0;
87+
uint32_t alloc_objects_batch = 0;
88+
uint32_t alloc_objects;
89+
uint32_t pack_offset = 0;
90+
struct pack_midx_entry *local_objects = NULL;
91+
int nr_local_objects = 0;
92+
93+
if (midx) {
94+
nr_objects_batch = midx->num_objects;
95+
pack_offset = midx->num_packs;
96+
}
97+
98+
for (i = pack_offset; i < nr_packs; i++)
99+
nr_objects_batch += packs[i]->num_objects;
100+
101+
/*
102+
* Predict the size of the batches to be roughly ~1/256 the total
103+
* count, but give some slack as they will not be equally sized.
104+
*/
105+
alloc_objects_batch = nr_objects_batch / 200;
106+
ALLOC_ARRAY(objects_batch, alloc_objects_batch);
107+
108+
*nr_objects = 0;
109+
alloc_objects = alloc_objects_batch;
110+
ALLOC_ARRAY(local_objects, alloc_objects);
111+
112+
for (first_byte = 0; first_byte < 256; first_byte++) {
113+
nr_objects_batch = 0;
114+
115+
if (midx) {
116+
uint32_t start, end;
117+
if (first_byte)
118+
start = get_be32(midx->chunk_oid_fanout + 4 * (first_byte - 1));
119+
else
120+
start = 0;
121+
122+
end = get_be32(midx->chunk_oid_fanout + 4 * first_byte);
123+
124+
while (start < end) {
125+
ALLOC_GROW(objects_batch, nr_objects_batch + 1, alloc_objects_batch);
126+
nth_midxed_object_entry(midx, start, &objects_batch[nr_objects_batch]);
127+
nr_objects_batch++;
128+
start++;
129+
}
130+
}
131+
132+
for (i = pack_offset; i < nr_packs; i++) {
133+
uint32_t start, end;
134+
135+
if (first_byte)
136+
start = get_pack_fanout(packs[i], first_byte - 1);
137+
else
138+
start = 0;
139+
end = get_pack_fanout(packs[i], first_byte);
140+
141+
while (start < end) {
142+
struct pack_midx_entry *entry;
143+
ALLOC_GROW(objects_batch, nr_objects_batch + 1, alloc_objects_batch);
144+
entry = &objects_batch[nr_objects_batch++];
145+
146+
if (!nth_packed_object_oid(&entry->oid, packs[i], start))
147+
die("unable to get sha1 of object %u in %s",
148+
start, packs[i]->pack_name);
149+
150+
entry->pack_int_id = i;
151+
entry->offset = nth_packed_object_offset(packs[i], start);
152+
entry->pack_mtime = packs[i]->mtime;
153+
start++;
154+
}
155+
}
156+
157+
QSORT(objects_batch, nr_objects_batch, midx_oid_compare);
158+
159+
/* de-dupe as we copy from the batch in-order */
160+
for (i = 0; i < nr_objects_batch; i++) {
161+
if (i > 0 && !oidcmp(&objects_batch[i - 1].oid, &objects_batch[i].oid))
162+
continue;
163+
164+
ALLOC_GROW(local_objects, nr_local_objects + 1, alloc_objects);
165+
memcpy(&local_objects[nr_local_objects], &objects_batch[i], sizeof(struct pack_midx_entry));
166+
nr_local_objects++;
167+
}
168+
}
169+
170+
*nr_objects = nr_local_objects;
171+
*objects = local_objects;
172+
}
173+
34174
static int build_midx_from_packs(
35175
const char *pack_dir,
36176
const char **pack_names, uint32_t nr_packs,
37177
const char **midx_id, struct midxed_git *midx)
38178
{
39179
struct packed_git **packs;
40180
const char **installed_pack_names;
41-
uint32_t i, j, nr_installed_packs = 0;
181+
uint32_t i, nr_installed_packs = 0;
42182
uint32_t nr_objects = 0;
43-
struct pack_midx_entry *objects;
44-
struct pack_midx_entry **obj_ptrs;
183+
struct pack_midx_entry *objects = NULL;
45184
uint32_t nr_total_packs = nr_packs;
46-
uint32_t pack_offset = 0;
47185
struct strbuf pack_path = STRBUF_INIT;
48186
int baselen;
49187

@@ -56,7 +194,6 @@ static int build_midx_from_packs(
56194
if (midx) {
57195
for (i = 0; i < midx->num_packs; i++)
58196
installed_pack_names[nr_installed_packs++] = midx->pack_names[i];
59-
pack_offset = midx->num_packs;
60197
}
61198

62199
strbuf_addstr(&pack_path, pack_dir);
@@ -95,44 +232,14 @@ static int build_midx_from_packs(
95232
return 0;
96233
}
97234

98-
if (midx)
99-
nr_objects += midx->num_objects;
100-
101-
ALLOC_ARRAY(objects, nr_objects);
102-
nr_objects = 0;
103-
104-
for (i = 0; midx && i < midx->num_objects; i++)
105-
nth_midxed_object_entry(midx, i, &objects[nr_objects++]);
106-
107-
for (i = pack_offset; i < nr_installed_packs; i++) {
108-
struct packed_git *p = packs[i];
109-
110-
for (j = 0; j < p->num_objects; j++) {
111-
struct pack_midx_entry entry;
112-
113-
if (!nth_packed_object_oid(&entry.oid, p, j))
114-
die("unable to get sha1 of object %u in %s",
115-
i, p->pack_name);
116-
117-
entry.pack_int_id = i;
118-
entry.offset = nth_packed_object_offset(p, j);
119-
entry.pack_mtime = p->mtime;
120-
121-
objects[nr_objects] = entry;
122-
nr_objects++;
123-
}
124-
}
125-
126-
ALLOC_ARRAY(obj_ptrs, nr_objects);
127-
for (i = 0; i < nr_objects; i++)
128-
obj_ptrs[i] = &objects[i];
235+
dedupe_and_sort_entries(packs, nr_installed_packs,
236+
midx, &objects, &nr_objects);
129237

130238
*midx_id = write_midx_file(pack_dir, NULL,
131239
installed_pack_names, nr_installed_packs,
132-
obj_ptrs, nr_objects);
240+
objects, nr_objects);
133241

134242
FREE_AND_NULL(installed_pack_names);
135-
FREE_AND_NULL(obj_ptrs);
136243
FREE_AND_NULL(objects);
137244

138245
return 0;

0 commit comments

Comments
 (0)