Skip to content

Commit a0ce724

Browse files
committed
recursiveloader: use less memory in assert_directory_verifies
In order to avoid loading all manifest entries into memory at once, make assert_directory_verifies use a shared sort_key to iterate over directories and manifest entries in the same order. For verification of the entire Gentoo tree, my tests have shown that this change reduces the memory footprint by about 63%, while consuming about 20% more time.
1 parent 21a1d34 commit a0ce724

File tree

1 file changed

+161
-37
lines changed

1 file changed

+161
-37
lines changed

Diff for: gemato/recursiveloader.py

+161-37
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,7 @@ def _iter_unordered_manifests_for_path(self, path, recursive=False):
341341
elif recursive and gemato.util.path_starts_with(d, path):
342342
yield (k, d, v)
343343

344-
def _iter_manifests_for_path(self, path, recursive=False):
344+
def _iter_manifests_for_path(self, path, recursive=False, sort_key=lambda kdv: len(kdv[1])):
345345
"""
346346
Iterate over loaded Manifests that can apply to path.
347347
If @recursive is True, returns also Manifests for subdirectories
@@ -354,7 +354,7 @@ def _iter_manifests_for_path(self, path, recursive=False):
354354
return sorted(
355355
self._iter_unordered_manifests_for_path(
356356
path, recursive=recursive),
357-
key=lambda kdv: len(kdv[1]),
357+
key=sort_key,
358358
reverse=True)
359359

360360
def load_manifests_for_path(self, path, recursive=False, verify=True):
@@ -368,20 +368,38 @@ def load_manifests_for_path(self, path, recursive=False, verify=True):
368368
on mismatch. Otherwise, sub-Manifests will be loaded
369369
unconditionally of whether they match parent checksums.
370370
"""
371+
for curmpath, relpath, m in self._iter_load_manifests_for_path(
372+
path, recursive=recursive, verify=verify):
373+
self.loaded_manifests[curmpath] = m
371374

375+
def _iter_load_manifests_for_path(self, path, recursive=False, verify=True,
376+
sort_key=lambda kdv: kdv[1].split(os.sep)):
377+
"""
378+
Traverse manifests in depth-first order with directories sorted by
379+
name. Only hold references to a minimum number of ManifestFile
380+
instances, in order to conserve memory.
381+
382+
The caller can traverse manifest and directory iterators in unison,
383+
minimizing the amount of data in memory.
384+
"""
372385
pool = multiprocessing.Pool(processes=self.max_jobs)
373386

387+
# Manifests pop from the stack in depth-first order
388+
manifest_stack = list(self._iter_manifests_for_path(path,
389+
recursive=recursive, sort_key=sort_key))
390+
traversed = set(curmpath for curmpath, relpath, m in manifest_stack)
374391
try:
375392
# TODO: figure out how to avoid confusing uses of 'recursive'
376-
while True:
393+
while manifest_stack:
377394
to_load = []
378-
for curmpath, relpath, m in self._iter_manifests_for_path(
379-
path, recursive):
380-
for e in m.entries:
395+
curmpath, relpath, m = manifest_stack.pop()
396+
yield (curmpath, relpath, m)
397+
398+
for e in m.entries:
381399
if e.tag != 'MANIFEST':
382400
continue
383401
mpath = os.path.join(relpath, e.path)
384-
if curmpath == mpath or mpath in self.loaded_manifests:
402+
if curmpath == mpath or mpath in traversed:
385403
continue
386404
mdir = os.path.dirname(mpath)
387405
if not verify:
@@ -390,12 +408,19 @@ def load_manifests_for_path(self, path, recursive=False, verify=True):
390408
to_load.append((mpath, e))
391409
elif recursive and gemato.util.path_starts_with(mdir, path):
392410
to_load.append((mpath, e))
393-
if not to_load:
394-
break
395411

396412
manifests = pool.imap_unordered(self.manifest_loader, to_load,
397413
chunksize=16)
398-
self.loaded_manifests.update(manifests)
414+
415+
manifests = [(mpath, os.path.dirname(mpath), e)
416+
for mpath, e in manifests]
417+
418+
# Manifests pop from the stack in depth-first order
419+
manifests.sort(key=sort_key, reverse=True)
420+
for mpath, mdir, m in manifests:
421+
traversed.add(mpath)
422+
manifest_stack.append(
423+
(mpath, os.path.dirname(mpath), m))
399424

400425
pool.close()
401426
pool.join()
@@ -511,13 +536,54 @@ def get_file_entry_dict(self, path='', only_types=None,
511536
be verified against MANIFEST entries. Pass False only when
512537
doing updates.
513538
"""
539+
out = {}
540+
for dirpath, dirout in self._iter_file_entry_dict(
541+
path=path, only_types=only_types,
542+
verify_manifests=verify_manifests):
543+
other = out.get(dirpath)
544+
if other is None:
545+
out[dirpath] = dirout
546+
else:
547+
# This happens due to the relpath = '' setting
548+
# for all DIST entries.
549+
for filename, e in dirout.items():
550+
if filename in other:
551+
e = self._merge_entries(other[filename], e)
552+
other[filename] = e
553+
return out
514554

515-
self.load_manifests_for_path(path, recursive=True,
516-
verify=verify_manifests)
555+
@staticmethod
556+
def _merge_entries(e1, e2):
557+
# compare the two entries
558+
ret, diff = gemato.verify.verify_entry_compatibility(
559+
e1, e2)
560+
if not ret:
561+
raise gemato.exceptions.ManifestIncompatibleEntry(
562+
e1, e2, diff)
563+
# we need to construct a single entry with both checksums
564+
if diff:
565+
new_checksums = dict(e2.checksums)
566+
for k, d1, d2 in diff:
567+
if d2 is None:
568+
new_checksums[k] = d1
569+
e1 = type(e1)(e1.path, e1.size, new_checksums)
570+
return e1
571+
572+
def _iter_file_entry_dict(self, path='', only_types=None,
573+
verify_manifests=True,
574+
sort_key=lambda p: p.split(os.sep)):
517575
out = {}
518-
for mpath, relpath, m in self._iter_manifests_for_path(path,
519-
recursive=True):
520-
for e in m.entries:
576+
dir_stack = [path]
577+
iter_load = self._iter_load_manifests_for_path(path,
578+
recursive=True, verify=verify_manifests)
579+
mpath, mdir, m = next(iter_load, (None, None, None))
580+
581+
while dir_stack or mdir is not None:
582+
if not dir_stack or (mdir is not None and
583+
sort_key(mdir) <= sort_key(dir_stack[-1])):
584+
subdirs = []
585+
relpath = mdir
586+
for e in m.entries:
521587
if only_types is not None:
522588
if e.tag not in only_types:
523589
continue
@@ -533,23 +599,20 @@ def get_file_entry_dict(self, path='', only_types=None,
533599
if gemato.util.path_starts_with(fullpath, path):
534600
dirpath = os.path.dirname(fullpath)
535601
filename = os.path.basename(e.path)
602+
subdirs.append(dirpath)
536603
dirout = out.setdefault(dirpath, {})
537604
if filename in dirout:
538-
# compare the two entries
539-
ret, diff = gemato.verify.verify_entry_compatibility(
540-
dirout[filename], e)
541-
if not ret:
542-
raise gemato.exceptions.ManifestIncompatibleEntry(
543-
dirout[filename], e, diff)
544-
# we need to construct a single entry with both checksums
545-
if diff:
546-
new_checksums = dict(e.checksums)
547-
for k, d1, d2 in diff:
548-
if d2 is None:
549-
new_checksums[k] = d1
550-
e = type(e)(e.path, e.size, new_checksums)
605+
e = self._merge_entries(dirout[filename], e)
551606
dirout[filename] = e
552-
return out
607+
subdirs.sort(key=sort_key, reverse=True)
608+
dir_stack.extend(subdirs)
609+
mpath, mdir, m = next(iter_load, (None, None, None))
610+
else:
611+
dirpath = dir_stack.pop()
612+
try:
613+
yield dirpath, out.pop(dirpath)
614+
except KeyError:
615+
pass
553616

554617
def assert_directory_verifies(self, path='',
555618
fail_handler=gemato.util.throw_exception,
@@ -580,22 +643,83 @@ def assert_directory_verifies(self, path='',
580643
to None (the default), the number of system CPUs will be used.
581644
"""
582645

583-
entry_dict = self.get_file_entry_dict(path)
646+
remaining_entries = {}
647+
entry_iter = self._iter_file_entry_dict(path)
584648
it = os.walk(os.path.join(self.root_directory, path),
585649
onerror=gemato.util.throw_exception,
586650
followlinks=True)
651+
sort_key = lambda p: p.split(os.sep)
652+
dir_stack = []
587653

588654
def _walk_directory(it):
589655
"""
590656
Pre-process os.walk() result for verification. Yield objects
591657
suitable to passing to subprocesses.
592658
"""
593-
for dirpath, dirnames, filenames in it:
594-
relpath = os.path.relpath(dirpath, self.root_directory)
595-
# strip dot to avoid matching problems
596-
if relpath == '.':
597-
relpath = ''
598-
dirdict = entry_dict.pop(relpath, {})
659+
pop_until = None
660+
entry_dir, entry_dict = next(entry_iter, (None, None))
661+
while True:
662+
if pop_until is not None:
663+
dirpath, dirnames, filenames, relpath = dir_stack.pop()
664+
if pop_until is relpath:
665+
pop_until = None
666+
elif (dir_stack and entry_dir is not None and
667+
gemato.util.path_starts_with(dir_stack[-1][-1], entry_dir)):
668+
dirpath, dirnames, filenames, relpath = dir_stack.pop()
669+
else:
670+
try:
671+
dirpath, dirnames, filenames = next(it)
672+
except StopIteration:
673+
while entry_dir is not None:
674+
remaining_entries[entry_dir] = entry_dict
675+
entry_dir, entry_dict = next(entry_iter, (None, None))
676+
break
677+
678+
relpath = os.path.relpath(dirpath, self.root_directory)
679+
680+
# strip dot to avoid matching problems
681+
if relpath == '.':
682+
relpath = ''
683+
684+
dirnames.sort()
685+
686+
if relpath == entry_dir:
687+
dirdict = entry_dict
688+
entry_dir, entry_dict = next(entry_iter, (None, None))
689+
elif entry_dir is not None and gemato.util.path_starts_with(relpath, entry_dir):
690+
dirdict = {}
691+
else:
692+
relpath_key = sort_key(relpath)
693+
if dir_stack and entry_dir is not None:
694+
entry_dir_key = sort_key(entry_dir)
695+
if relpath_key > entry_dir_key and entry_dir_key <= sort_key(dir_stack[-1][-1]):
696+
# Try to insert it into the stack for later processing.
697+
for i, item in enumerate(dir_stack):
698+
if item[-1] and relpath_key > sort_key(item[-1]):
699+
dir_stack.insert(i, (dirpath, dirnames, filenames, relpath))
700+
dirpath = None
701+
break
702+
if dirpath is None:
703+
if pop_until is None:
704+
pop_until = relpath
705+
continue
706+
while entry_dir is not None and relpath_key > sort_key(entry_dir):
707+
remaining_entries[entry_dir] = entry_dict
708+
entry_dir, entry_dict = next(entry_iter, (None, None))
709+
710+
if relpath == entry_dir:
711+
dirdict = entry_dict
712+
entry_dir, entry_dict = next(entry_iter, (None, None))
713+
elif entry_dir is not None:
714+
relpath_key = sort_key(relpath)
715+
entry_dir_key = sort_key(entry_dir)
716+
if relpath_key < entry_dir_key and len(relpath_key) <= len(entry_dir_key):
717+
dir_stack.append((dirpath, dirnames, filenames, relpath))
718+
continue
719+
else:
720+
dirdict = {}
721+
else:
722+
dirdict = {}
599723

600724
skip_dirs = []
601725
for d in dirnames:
@@ -643,7 +767,7 @@ def _walk_directory(it):
643767
pool.close()
644768

645769
# check for missing directories
646-
for relpath, dirdict in entry_dict.items():
770+
for relpath, dirdict in remaining_entries.items():
647771
for f, e in dirdict.items():
648772
fpath = os.path.join(relpath, f)
649773
syspath = os.path.join(self.root_directory, fpath)

0 commit comments

Comments
 (0)