Skip to content

Commit f85c28e

Browse files
authored
heal: large objects fix and avoid .healing.bin corner case premature exit (minio#20577)
xlStorage.Healing() returns nil if there is an error reading .healing.bin or if this latter is empty. healing.bin update() call returns early if .healing.bin is empty; hence, no further update of .healing.bin is possible. A .healing.bin can be empty if os.Open() with O_TRUNC is successful but the next Write returns an error. To avoid this weird situation, avoid making healingTracker.update() to return early if .healing.bin is empty, so write again. This commit also fixes wrong error log printing when an object is healed in another drive in the same erasure set but not in the drive that is actively healing by fresh drive healing code. Currently, it prints <nil> instead of a factual error. * heal: Scan .minio.sys metadata only during site-wide heal (minio#137) mc admin heal always invoke .minio.sys heal, but sometimes, this latter contains a lot of data, many service accounts, STS accounts etc, which makes mc admin heal command very slow. Only invoke .minio.sys healing when no bucket was specified in `mc admin heal` command.
1 parent f7e176d commit f85c28e

4 files changed

+25
-27
lines changed

cmd/admin-heal-ops.go

+10-9
Original file line numberDiff line numberDiff line change
@@ -806,18 +806,20 @@ func (h *healSequence) healDiskMeta(objAPI ObjectLayer) error {
806806
return h.healMinioSysMeta(objAPI, minioConfigPrefix)()
807807
}
808808

809-
func (h *healSequence) healItems(objAPI ObjectLayer, bucketsOnly bool) error {
809+
func (h *healSequence) healItems(objAPI ObjectLayer) error {
810810
if h.clientToken == bgHealingUUID {
811811
// For background heal do nothing.
812812
return nil
813813
}
814814

815-
if err := h.healDiskMeta(objAPI); err != nil {
816-
return err
815+
if h.bucket == "" { // heal internal meta only during a site-wide heal
816+
if err := h.healDiskMeta(objAPI); err != nil {
817+
return err
818+
}
817819
}
818820

819821
// Heal buckets and objects
820-
return h.healBuckets(objAPI, bucketsOnly)
822+
return h.healBuckets(objAPI)
821823
}
822824

823825
// traverseAndHeal - traverses on-disk data and performs healing
@@ -828,8 +830,7 @@ func (h *healSequence) healItems(objAPI ObjectLayer, bucketsOnly bool) error {
828830
// has to wait until a safe point is reached, such as between scanning
829831
// two objects.
830832
func (h *healSequence) traverseAndHeal(objAPI ObjectLayer) {
831-
bucketsOnly := false // Heals buckets and objects also.
832-
h.traverseAndHealDoneCh <- h.healItems(objAPI, bucketsOnly)
833+
h.traverseAndHealDoneCh <- h.healItems(objAPI)
833834
xioutil.SafeClose(h.traverseAndHealDoneCh)
834835
}
835836

@@ -856,14 +857,14 @@ func (h *healSequence) healMinioSysMeta(objAPI ObjectLayer, metaPrefix string) f
856857
}
857858

858859
// healBuckets - check for all buckets heal or just particular bucket.
859-
func (h *healSequence) healBuckets(objAPI ObjectLayer, bucketsOnly bool) error {
860+
func (h *healSequence) healBuckets(objAPI ObjectLayer) error {
860861
if h.isQuitting() {
861862
return errHealStopSignalled
862863
}
863864

864865
// 1. If a bucket was specified, heal only the bucket.
865866
if h.bucket != "" {
866-
return h.healBucket(objAPI, h.bucket, bucketsOnly)
867+
return h.healBucket(objAPI, h.bucket, false)
867868
}
868869

869870
buckets, err := objAPI.ListBuckets(h.ctx, BucketOptions{})
@@ -877,7 +878,7 @@ func (h *healSequence) healBuckets(objAPI ObjectLayer, bucketsOnly bool) error {
877878
})
878879

879880
for _, bucket := range buckets {
880-
if err = h.healBucket(objAPI, bucket.Name, bucketsOnly); err != nil {
881+
if err = h.healBucket(objAPI, bucket.Name, false); err != nil {
881882
return err
882883
}
883884
}

cmd/background-newdisks-heal-ops.go

-3
Original file line numberDiff line numberDiff line change
@@ -223,9 +223,6 @@ func (h *healingTracker) updateProgress(success, skipped bool, bytes uint64) {
223223
// update will update the tracker on the disk.
224224
// If the tracker has been deleted an error is returned.
225225
func (h *healingTracker) update(ctx context.Context) error {
226-
if h.disk.Healing() == nil {
227-
return fmt.Errorf("healingTracker: drive %q is not marked as healing", h.ID)
228-
}
229226
h.mu.Lock()
230227
if h.ID == "" || h.PoolIndex < 0 || h.SetIndex < 0 || h.DiskIndex < 0 {
231228
h.ID, _ = h.disk.GetDiskID()

cmd/global-heal.go

+7-7
Original file line numberDiff line numberDiff line change
@@ -459,8 +459,6 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
459459
continue
460460
}
461461

462-
var versionHealed bool
463-
464462
res, err := er.HealObject(ctx, bucket, encodedEntryName,
465463
version.VersionID, madmin.HealOpts{
466464
ScanMode: scanMode,
@@ -475,21 +473,23 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
475473
}
476474
} else {
477475
// Look for the healing results
478-
if res.After.Drives[tracker.DiskIndex].State == madmin.DriveStateOk {
479-
versionHealed = true
476+
if res.After.Drives[tracker.DiskIndex].State != madmin.DriveStateOk {
477+
err = fmt.Errorf("unexpected after heal state: %s", res.After.Drives[tracker.DiskIndex].State)
480478
}
481479
}
482480

483-
if versionHealed {
481+
if err == nil {
484482
bgSeq.countHealed(madmin.HealItemObject)
485483
result = healEntrySuccess(uint64(version.Size))
486484
} else {
487485
bgSeq.countFailed(madmin.HealItemObject)
488486
result = healEntryFailure(uint64(version.Size))
489487
if version.VersionID != "" {
490-
healingLogIf(ctx, fmt.Errorf("unable to heal object %s/%s-v(%s): %w", bucket, version.Name, version.VersionID, err))
488+
healingLogIf(ctx, fmt.Errorf("unable to heal object %s/%s (version-id=%s): %w",
489+
bucket, version.Name, version.VersionID, err))
491490
} else {
492-
healingLogIf(ctx, fmt.Errorf("unable to heal object %s/%s: %w", bucket, version.Name, err))
491+
healingLogIf(ctx, fmt.Errorf("unable to heal object %s/%s: %w",
492+
bucket, version.Name, err))
493493
}
494494
}
495495

cmd/xl-storage.go

+8-8
Original file line numberDiff line numberDiff line change
@@ -432,15 +432,19 @@ func (s *xlStorage) Healing() *healingTracker {
432432
bucketMetaPrefix, healingTrackerFilename)
433433
b, err := os.ReadFile(healingFile)
434434
if err != nil {
435+
if !errors.Is(err, os.ErrNotExist) {
436+
internalLogIf(GlobalContext, fmt.Errorf("unable to read %s: %w", healingFile, err))
437+
}
435438
return nil
436439
}
437440
if len(b) == 0 {
441+
internalLogIf(GlobalContext, fmt.Errorf("%s is empty", healingFile))
438442
// 'healing.bin' might be truncated
439443
return nil
440444
}
441445
h := newHealingTracker()
442446
_, err = h.UnmarshalMsg(b)
443-
bugLogIf(GlobalContext, err)
447+
internalLogIf(GlobalContext, err)
444448
return h
445449
}
446450

@@ -2246,6 +2250,7 @@ func (s *xlStorage) writeAllMeta(ctx context.Context, volume string, path string
22462250
return renameAll(tmpFilePath, filePath, volumeDir)
22472251
}
22482252

2253+
// Create or truncate an existing file before writing
22492254
func (s *xlStorage) writeAllInternal(ctx context.Context, filePath string, b []byte, sync bool, skipParent string) (err error) {
22502255
flags := os.O_CREATE | os.O_WRONLY | os.O_TRUNC
22512256

@@ -2268,16 +2273,11 @@ func (s *xlStorage) writeAllInternal(ctx context.Context, filePath string, b []b
22682273
return err
22692274
}
22702275

2271-
n, err := w.Write(b)
2276+
_, err = w.Write(b)
22722277
if err != nil {
2273-
w.Close()
2274-
return err
2275-
}
2276-
2277-
if n != len(b) {
22782278
w.Truncate(0) // to indicate that we did partial write.
22792279
w.Close()
2280-
return io.ErrShortWrite
2280+
return err
22812281
}
22822282

22832283
// Dealing with error returns from close() - 'man 2 close'

0 commit comments

Comments
 (0)