@@ -341,7 +341,7 @@ def _iter_unordered_manifests_for_path(self, path, recursive=False):
341
341
elif recursive and gemato .util .path_starts_with (d , path ):
342
342
yield (k , d , v )
343
343
344
- def _iter_manifests_for_path (self , path , recursive = False ):
344
+ def _iter_manifests_for_path (self , path , recursive = False , sort_key = lambda kdv : len ( kdv [ 1 ]) ):
345
345
"""
346
346
Iterate over loaded Manifests that can apply to path.
347
347
If @recursive is True, returns also Manifests for subdirectories
@@ -354,7 +354,7 @@ def _iter_manifests_for_path(self, path, recursive=False):
354
354
return sorted (
355
355
self ._iter_unordered_manifests_for_path (
356
356
path , recursive = recursive ),
357
- key = lambda kdv : len ( kdv [ 1 ]) ,
357
+ key = sort_key ,
358
358
reverse = True )
359
359
360
360
def load_manifests_for_path (self , path , recursive = False , verify = True ):
@@ -368,20 +368,38 @@ def load_manifests_for_path(self, path, recursive=False, verify=True):
368
368
on mismatch. Otherwise, sub-Manifests will be loaded
369
369
unconditionally of whether they match parent checksums.
370
370
"""
371
+ for curmpath , relpath , m in self ._iter_load_manifests_for_path (
372
+ path , recursive = recursive , verify = verify ):
373
+ self .loaded_manifests [curmpath ] = m
371
374
375
+ def _iter_load_manifests_for_path (self , path , recursive = False , verify = True ,
376
+ sort_key = lambda kdv : kdv [1 ].split (os .sep )):
377
+ """
378
+ Traverse manifests in depth-first order with directories sorted by
379
+ name. Only hold references to a minimum number of ManifestFile
380
+ instances, in order to conserve memory.
381
+
382
+ The caller can traverse manifest and directory iterators in unison,
383
+ minimizing the amount of data in memory.
384
+ """
372
385
pool = multiprocessing .Pool (processes = self .max_jobs )
373
386
387
+ # Manifests pop from the stack in depth-first order
388
+ manifest_stack = list (self ._iter_manifests_for_path (path ,
389
+ recursive = recursive , sort_key = sort_key ))
390
+ traversed = set (curmpath for curmpath , relpath , m in manifest_stack )
374
391
try :
375
392
# TODO: figure out how to avoid confusing uses of 'recursive'
376
- while True :
393
+ while manifest_stack :
377
394
to_load = []
378
- for curmpath , relpath , m in self ._iter_manifests_for_path (
379
- path , recursive ):
380
- for e in m .entries :
395
+ curmpath , relpath , m = manifest_stack .pop ()
396
+ yield (curmpath , relpath , m )
397
+
398
+ for e in m .entries :
381
399
if e .tag != 'MANIFEST' :
382
400
continue
383
401
mpath = os .path .join (relpath , e .path )
384
- if curmpath == mpath or mpath in self . loaded_manifests :
402
+ if curmpath == mpath or mpath in traversed :
385
403
continue
386
404
mdir = os .path .dirname (mpath )
387
405
if not verify :
@@ -390,12 +408,19 @@ def load_manifests_for_path(self, path, recursive=False, verify=True):
390
408
to_load .append ((mpath , e ))
391
409
elif recursive and gemato .util .path_starts_with (mdir , path ):
392
410
to_load .append ((mpath , e ))
393
- if not to_load :
394
- break
395
411
396
412
manifests = pool .imap_unordered (self .manifest_loader , to_load ,
397
413
chunksize = 16 )
398
- self .loaded_manifests .update (manifests )
414
+
415
+ manifests = [(mpath , os .path .dirname (mpath ), e )
416
+ for mpath , e in manifests ]
417
+
418
+ # Manifests pop from the stack in depth-first order
419
+ manifests .sort (key = sort_key , reverse = True )
420
+ for mpath , mdir , m in manifests :
421
+ traversed .add (mpath )
422
+ manifest_stack .append (
423
+ (mpath , os .path .dirname (mpath ), m ))
399
424
400
425
pool .close ()
401
426
pool .join ()
@@ -511,13 +536,54 @@ def get_file_entry_dict(self, path='', only_types=None,
511
536
be verified against MANIFEST entries. Pass False only when
512
537
doing updates.
513
538
"""
539
+ out = {}
540
+ for dirpath , dirout in self ._iter_file_entry_dict (
541
+ path = path , only_types = only_types ,
542
+ verify_manifests = verify_manifests ):
543
+ other = out .get (dirpath )
544
+ if other is None :
545
+ out [dirpath ] = dirout
546
+ else :
547
+ # This happens due to the relpath = '' setting
548
+ # for all DIST entries.
549
+ for filename , e in dirout .items ():
550
+ if filename in other :
551
+ e = self ._merge_entries (other [filename ], e )
552
+ other [filename ] = e
553
+ return out
514
554
515
- self .load_manifests_for_path (path , recursive = True ,
516
- verify = verify_manifests )
555
@staticmethod
def _merge_entries(e1, e2):
    """
    Combine two entries that describe the same file into a single entry.

    The pair is first checked with
    gemato.verify.verify_entry_compatibility(); when that reports the
    entries as incompatible, ManifestIncompatibleEntry is raised with
    both entries and the reported differences.

    If no differences are reported, e1 is returned unchanged. Otherwise
    a new entry of the same type as e1 is built from e1's path and size
    and a checksum mapping that starts as a copy of e2.checksums and is
    completed with the first value of every difference whose second
    value is None (presumably checksums only e1 carries -- confirm
    against verify_entry_compatibility()).
    """
    compatible, differences = gemato.verify.verify_entry_compatibility(
        e1, e2)
    if not compatible:
        raise gemato.exceptions.ManifestIncompatibleEntry(
            e1, e2, differences)

    # nothing to merge, keep the first entry as-is
    if not differences:
        return e1

    # build a single entry carrying the checksums of both entries
    merged = dict(e2.checksums)
    merged.update(
        (name, first)
        for name, first, second in differences
        if second is None)
    return type(e1)(e1.path, e1.size, merged)
571
+
572
+ def _iter_file_entry_dict (self , path = '' , only_types = None ,
573
+ verify_manifests = True ,
574
+ sort_key = lambda p : p .split (os .sep )):
517
575
out = {}
518
- for mpath , relpath , m in self ._iter_manifests_for_path (path ,
519
- recursive = True ):
520
- for e in m .entries :
576
+ dir_stack = [path ]
577
+ iter_load = self ._iter_load_manifests_for_path (path ,
578
+ recursive = True , verify = verify_manifests )
579
+ mpath , mdir , m = next (iter_load , (None , None , None ))
580
+
581
+ while dir_stack or mdir is not None :
582
+ if not dir_stack or (mdir is not None and
583
+ sort_key (mdir ) <= sort_key (dir_stack [- 1 ])):
584
+ subdirs = []
585
+ relpath = mdir
586
+ for e in m .entries :
521
587
if only_types is not None :
522
588
if e .tag not in only_types :
523
589
continue
@@ -533,23 +599,20 @@ def get_file_entry_dict(self, path='', only_types=None,
533
599
if gemato .util .path_starts_with (fullpath , path ):
534
600
dirpath = os .path .dirname (fullpath )
535
601
filename = os .path .basename (e .path )
602
+ subdirs .append (dirpath )
536
603
dirout = out .setdefault (dirpath , {})
537
604
if filename in dirout :
538
- # compare the two entries
539
- ret , diff = gemato .verify .verify_entry_compatibility (
540
- dirout [filename ], e )
541
- if not ret :
542
- raise gemato .exceptions .ManifestIncompatibleEntry (
543
- dirout [filename ], e , diff )
544
- # we need to construct a single entry with both checksums
545
- if diff :
546
- new_checksums = dict (e .checksums )
547
- for k , d1 , d2 in diff :
548
- if d2 is None :
549
- new_checksums [k ] = d1
550
- e = type (e )(e .path , e .size , new_checksums )
605
+ e = self ._merge_entries (dirout [filename ], e )
551
606
dirout [filename ] = e
552
- return out
607
+ subdirs .sort (key = sort_key , reverse = True )
608
+ dir_stack .extend (subdirs )
609
+ mpath , mdir , m = next (iter_load , (None , None , None ))
610
+ else :
611
+ dirpath = dir_stack .pop ()
612
+ try :
613
+ yield dirpath , out .pop (dirpath )
614
+ except KeyError :
615
+ pass
553
616
554
617
def assert_directory_verifies (self , path = '' ,
555
618
fail_handler = gemato .util .throw_exception ,
@@ -580,22 +643,83 @@ def assert_directory_verifies(self, path='',
580
643
to None (the default), the number of system CPUs will be used.
581
644
"""
582
645
583
- entry_dict = self .get_file_entry_dict (path )
646
+ remaining_entries = {}
647
+ entry_iter = self ._iter_file_entry_dict (path )
584
648
it = os .walk (os .path .join (self .root_directory , path ),
585
649
onerror = gemato .util .throw_exception ,
586
650
followlinks = True )
651
+ sort_key = lambda p : p .split (os .sep )
652
+ dir_stack = []
587
653
588
654
def _walk_directory (it ):
589
655
"""
590
656
Pre-process os.walk() result for verification. Yield objects
591
657
suitable to passing to subprocesses.
592
658
"""
593
- for dirpath , dirnames , filenames in it :
594
- relpath = os .path .relpath (dirpath , self .root_directory )
595
- # strip dot to avoid matching problems
596
- if relpath == '.' :
597
- relpath = ''
598
- dirdict = entry_dict .pop (relpath , {})
659
+ pop_until = None
660
+ entry_dir , entry_dict = next (entry_iter , (None , None ))
661
+ while True :
662
+ if pop_until is not None :
663
+ dirpath , dirnames , filenames , relpath = dir_stack .pop ()
664
+ if pop_until is relpath :
665
+ pop_until = None
666
+ elif (dir_stack and entry_dir is not None and
667
+ gemato .util .path_starts_with (dir_stack [- 1 ][- 1 ], entry_dir )):
668
+ dirpath , dirnames , filenames , relpath = dir_stack .pop ()
669
+ else :
670
+ try :
671
+ dirpath , dirnames , filenames = next (it )
672
+ except StopIteration :
673
+ while entry_dir is not None :
674
+ remaining_entries [entry_dir ] = entry_dict
675
+ entry_dir , entry_dict = next (entry_iter , (None , None ))
676
+ break
677
+
678
+ relpath = os .path .relpath (dirpath , self .root_directory )
679
+
680
+ # strip dot to avoid matching problems
681
+ if relpath == '.' :
682
+ relpath = ''
683
+
684
+ dirnames .sort ()
685
+
686
+ if relpath == entry_dir :
687
+ dirdict = entry_dict
688
+ entry_dir , entry_dict = next (entry_iter , (None , None ))
689
+ elif entry_dir is not None and gemato .util .path_starts_with (relpath , entry_dir ):
690
+ dirdict = {}
691
+ else :
692
+ relpath_key = sort_key (relpath )
693
+ if dir_stack and entry_dir is not None :
694
+ entry_dir_key = sort_key (entry_dir )
695
+ if relpath_key > entry_dir_key and entry_dir_key <= sort_key (dir_stack [- 1 ][- 1 ]):
696
+ # Try to insert it into the stack for later processing.
697
+ for i , item in enumerate (dir_stack ):
698
+ if item [- 1 ] and relpath_key > sort_key (item [- 1 ]):
699
+ dir_stack .insert (i , (dirpath , dirnames , filenames , relpath ))
700
+ dirpath = None
701
+ break
702
+ if dirpath is None :
703
+ if pop_until is None :
704
+ pop_until = relpath
705
+ continue
706
+ while entry_dir is not None and relpath_key > sort_key (entry_dir ):
707
+ remaining_entries [entry_dir ] = entry_dict
708
+ entry_dir , entry_dict = next (entry_iter , (None , None ))
709
+
710
+ if relpath == entry_dir :
711
+ dirdict = entry_dict
712
+ entry_dir , entry_dict = next (entry_iter , (None , None ))
713
+ elif entry_dir is not None :
714
+ relpath_key = sort_key (relpath )
715
+ entry_dir_key = sort_key (entry_dir )
716
+ if relpath_key < entry_dir_key and len (relpath_key ) <= len (entry_dir_key ):
717
+ dir_stack .append ((dirpath , dirnames , filenames , relpath ))
718
+ continue
719
+ else :
720
+ dirdict = {}
721
+ else :
722
+ dirdict = {}
599
723
600
724
skip_dirs = []
601
725
for d in dirnames :
@@ -643,7 +767,7 @@ def _walk_directory(it):
643
767
pool .close ()
644
768
645
769
# check for missing directories
646
- for relpath , dirdict in entry_dict .items ():
770
+ for relpath , dirdict in remaining_entries .items ():
647
771
for f , e in dirdict .items ():
648
772
fpath = os .path .join (relpath , f )
649
773
syspath = os .path .join (self .root_directory , fpath )
0 commit comments