Skip to content

Commit 9035d75

Browse files
sprohaskagitster
authored andcommitted
convert: stream from fd to required clean filter to reduce used address space
The data is streamed to the filter process anyway. Better avoid mapping the file if possible. This is especially useful if a clean filter reduces the size, for example if it computes a sha1 for binary data, like git media. The file size that the previous implementation could handle was limited by the available address space; large files for example could not be handled with (32-bit) msysgit. The new implementation can filter files of any size as long as the filter output is small enough. The new code path is only taken if the filter is required. The filter consumes data directly from the fd. If it fails, the original data is not immediately available. The condition can easily be handled as a fatal error, which is expected for a required filter anyway. If the filter was not required, the condition would need to be handled in a different way, like seeking to 0 and reading the data. But this would require more restructuring of the code and is probably not worth it. The obvious approach of falling back to reading all data would not help achieving the main purpose of this patch, which is to handle large files with limited address space. If reading all data is an option, we can simply take the old code path right away and mmap the entire file. The environment variable GIT_MMAP_LIMIT, which has been introduced in a previous commit is used to test that the expected code path is taken. A related test that exercises required filters is modified to verify that the data actually has been modified on its way from the file system to the object store. Signed-off-by: Steffen Prohaska <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent b29763a commit 9035d75

File tree

4 files changed

+99
-12
lines changed

4 files changed

+99
-12
lines changed

convert.c

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -312,11 +312,12 @@ static int crlf_to_worktree(const char *path, const char *src, size_t len,
312312
struct filter_params {
313313
const char *src;
314314
unsigned long size;
315+
int fd;
315316
const char *cmd;
316317
const char *path;
317318
};
318319

319-
static int filter_buffer(int in, int out, void *data)
320+
static int filter_buffer_or_fd(int in, int out, void *data)
320321
{
321322
/*
322323
* Spawn cmd and feed the buffer contents through its stdin.
@@ -355,7 +356,12 @@ static int filter_buffer(int in, int out, void *data)
355356

356357
sigchain_push(SIGPIPE, SIG_IGN);
357358

358-
write_err = (write_in_full(child_process.in, params->src, params->size) < 0);
359+
if (params->src) {
360+
write_err = (write_in_full(child_process.in, params->src, params->size) < 0);
361+
} else {
362+
write_err = copy_fd(params->fd, child_process.in);
363+
}
364+
359365
if (close(child_process.in))
360366
write_err = 1;
361367
if (write_err)
@@ -371,7 +377,7 @@ static int filter_buffer(int in, int out, void *data)
371377
return (write_err || status);
372378
}
373379

374-
static int apply_filter(const char *path, const char *src, size_t len,
380+
static int apply_filter(const char *path, const char *src, size_t len, int fd,
375381
struct strbuf *dst, const char *cmd)
376382
{
377383
/*
@@ -392,11 +398,12 @@ static int apply_filter(const char *path, const char *src, size_t len,
392398
return 1;
393399

394400
memset(&async, 0, sizeof(async));
395-
async.proc = filter_buffer;
401+
async.proc = filter_buffer_or_fd;
396402
async.data = &params;
397403
async.out = -1;
398404
params.src = src;
399405
params.size = len;
406+
params.fd = fd;
400407
params.cmd = cmd;
401408
params.path = path;
402409

@@ -747,6 +754,25 @@ static void convert_attrs(struct conv_attrs *ca, const char *path)
747754
}
748755
}
749756

757+
int would_convert_to_git_filter_fd(const char *path)
758+
{
759+
struct conv_attrs ca;
760+
761+
convert_attrs(&ca, path);
762+
if (!ca.drv)
763+
return 0;
764+
765+
/*
766+
* Apply a filter to an fd only if the filter is required to succeed.
767+
* We must die if the filter fails, because the original data before
768+
* filtering is not available.
769+
*/
770+
if (!ca.drv->required)
771+
return 0;
772+
773+
return apply_filter(path, NULL, 0, -1, NULL, ca.drv->clean);
774+
}
775+
750776
int convert_to_git(const char *path, const char *src, size_t len,
751777
struct strbuf *dst, enum safe_crlf checksafe)
752778
{
@@ -761,7 +787,7 @@ int convert_to_git(const char *path, const char *src, size_t len,
761787
required = ca.drv->required;
762788
}
763789

764-
ret |= apply_filter(path, src, len, dst, filter);
790+
ret |= apply_filter(path, src, len, -1, dst, filter);
765791
if (!ret && required)
766792
die("%s: clean filter '%s' failed", path, ca.drv->name);
767793

@@ -778,6 +804,23 @@ int convert_to_git(const char *path, const char *src, size_t len,
778804
return ret | ident_to_git(path, src, len, dst, ca.ident);
779805
}
780806

807+
void convert_to_git_filter_fd(const char *path, int fd, struct strbuf *dst,
808+
enum safe_crlf checksafe)
809+
{
810+
struct conv_attrs ca;
811+
convert_attrs(&ca, path);
812+
813+
assert(ca.drv);
814+
assert(ca.drv->clean);
815+
816+
if (!apply_filter(path, NULL, 0, fd, dst, ca.drv->clean))
817+
die("%s: clean filter '%s' failed", path, ca.drv->name);
818+
819+
ca.crlf_action = input_crlf_action(ca.crlf_action, ca.eol_attr);
820+
crlf_to_git(path, dst->buf, dst->len, dst, ca.crlf_action, checksafe);
821+
ident_to_git(path, dst->buf, dst->len, dst, ca.ident);
822+
}
823+
781824
static int convert_to_working_tree_internal(const char *path, const char *src,
782825
size_t len, struct strbuf *dst,
783826
int normalizing)
@@ -811,7 +854,7 @@ static int convert_to_working_tree_internal(const char *path, const char *src,
811854
}
812855
}
813856

814-
ret_filter = apply_filter(path, src, len, dst, filter);
857+
ret_filter = apply_filter(path, src, len, -1, dst, filter);
815858
if (!ret_filter && required)
816859
die("%s: smudge filter %s failed", path, ca.drv->name);
817860

convert.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@ static inline int would_convert_to_git(const char *path)
4444
{
4545
return convert_to_git(path, NULL, 0, NULL, 0);
4646
}
47+
/* Precondition: would_convert_to_git_filter_fd(path) == true */
48+
extern void convert_to_git_filter_fd(const char *path, int fd,
49+
struct strbuf *dst,
50+
enum safe_crlf checksafe);
51+
extern int would_convert_to_git_filter_fd(const char *path);
4752

4853
/*****************************************************************
4954
*

sha1_file.c

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3090,6 +3090,29 @@ static int index_mem(unsigned char *sha1, void *buf, size_t size,
30903090
return ret;
30913091
}
30923092

3093+
static int index_stream_convert_blob(unsigned char *sha1, int fd,
3094+
const char *path, unsigned flags)
3095+
{
3096+
int ret;
3097+
const int write_object = flags & HASH_WRITE_OBJECT;
3098+
struct strbuf sbuf = STRBUF_INIT;
3099+
3100+
assert(path);
3101+
assert(would_convert_to_git_filter_fd(path));
3102+
3103+
convert_to_git_filter_fd(path, fd, &sbuf,
3104+
write_object ? safe_crlf : SAFE_CRLF_FALSE);
3105+
3106+
if (write_object)
3107+
ret = write_sha1_file(sbuf.buf, sbuf.len, typename(OBJ_BLOB),
3108+
sha1);
3109+
else
3110+
ret = hash_sha1_file(sbuf.buf, sbuf.len, typename(OBJ_BLOB),
3111+
sha1);
3112+
strbuf_release(&sbuf);
3113+
return ret;
3114+
}
3115+
30933116
static int index_pipe(unsigned char *sha1, int fd, enum object_type type,
30943117
const char *path, unsigned flags)
30953118
{
@@ -3157,7 +3180,9 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st,
31573180
int ret;
31583181
size_t size = xsize_t(st->st_size);
31593182

3160-
if (!S_ISREG(st->st_mode))
3183+
if (type == OBJ_BLOB && path && would_convert_to_git_filter_fd(path))
3184+
ret = index_stream_convert_blob(sha1, fd, path, flags);
3185+
else if (!S_ISREG(st->st_mode))
31613186
ret = index_pipe(sha1, fd, type, path, flags);
31623187
else if (size <= big_file_threshold || type != OBJ_BLOB ||
31633188
(path && would_convert_to_git(path)))

t/t0021-conversion.sh

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -153,17 +153,23 @@ test_expect_success 'filter shell-escaped filenames' '
153153
:
154154
'
155155

156-
test_expect_success 'required filter success' '
157-
git config filter.required.smudge cat &&
158-
git config filter.required.clean cat &&
156+
test_expect_success 'required filter should filter data' '
157+
git config filter.required.smudge ./rot13.sh &&
158+
git config filter.required.clean ./rot13.sh &&
159159
git config filter.required.required true &&
160160
161161
echo "*.r filter=required" >.gitattributes &&
162162
163-
echo test >test.r &&
163+
cat test.o >test.r &&
164164
git add test.r &&
165+
165166
rm -f test.r &&
166-
git checkout -- test.r
167+
git checkout -- test.r &&
168+
cmp test.o test.r &&
169+
170+
./rot13.sh <test.o >expected &&
171+
git cat-file blob :test.r >actual &&
172+
cmp expected actual
167173
'
168174

169175
test_expect_success 'required filter smudge failure' '
@@ -190,6 +196,14 @@ test_expect_success 'required filter clean failure' '
190196
test_must_fail git add test.fc
191197
'
192198

199+
test_expect_success 'filtering large input to small output should use little memory' '
200+
git config filter.devnull.clean "cat >/dev/null" &&
201+
git config filter.devnull.required true &&
202+
for i in $(test_seq 1 30); do printf "%1048576d" 1; done >30MB &&
203+
echo "30MB filter=devnull" >.gitattributes &&
204+
GIT_MMAP_LIMIT=1m GIT_ALLOC_LIMIT=1m git add 30MB
205+
'
206+
193207
test_expect_success EXPENSIVE 'filter large file' '
194208
git config filter.largefile.smudge cat &&
195209
git config filter.largefile.clean cat &&

0 commit comments

Comments
 (0)