Skip to content

Commit e39dafe

Browse files
craig[bot]fqazi
andcommitted
Merge #119305
119305: sql: crdb_internal.leases could cause a deadlock with the lease manager r=fqazi a=fqazi Previously, a deadlock could occur between crdb_internal.leases and the lease manager when attempting to renew or release leases on the role_member inside crdb_internal.leases table. This deadlock occurs because lease manager lock is held while visting leases. The issue was easily reproducible when the allow_role_memberships_to_change_during_transaction setting was enabled. To resolve this, the patch introduces in-memory caching of all leases within crdb_internal.leases before privilege checks are performed, and any locks are subsequently released. Fixes: #119253 Fixes: #97280 Release note (bug fix): crdb_internal.leases could cause a node to become unavailable due to a deadlock in the leasing subsystem. Co-authored-by: Faizan Qazi <[email protected]>
2 parents a58e89a + 10618f3 commit e39dafe

File tree

2 files changed

+50
-17
lines changed

2 files changed

+50
-17
lines changed

pkg/sql/crdb_internal.go

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -856,9 +856,14 @@ CREATE TABLE crdb_internal.schema_changes (
856856
},
857857
}
858858

859+
type crdbInternalLeasesTableEntry struct {
860+
desc catalog.Descriptor
861+
takenOffline bool
862+
expiration tree.DTimestamp
863+
}
864+
859865
// TODO(tbg): prefix with node_.
860866
var crdbInternalLeasesTable = virtualSchemaTable{
861-
comment: `acquired table leases (RAM; local node only)`,
862867
schema: `
863868
CREATE TABLE crdb_internal.leases (
864869
node_id INT NOT NULL,
@@ -868,33 +873,45 @@ CREATE TABLE crdb_internal.leases (
868873
expiration TIMESTAMP NOT NULL,
869874
deleted BOOL NOT NULL
870875
)`,
876+
comment: `acquired table leases (RAM; local node only)`,
871877
populate: func(
872878
ctx context.Context, p *planner, _ catalog.DatabaseDescriptor, addRow func(...tree.Datum) error,
873879
) error {
874880
nodeID, _ := p.execCfg.NodeInfo.NodeID.OptionalNodeID() // zero if not available
875-
var iterErr error
881+
var leaseEntries []crdbInternalLeasesTableEntry
876882
p.LeaseMgr().VisitLeases(func(desc catalog.Descriptor, takenOffline bool, _ int, expiration tree.DTimestamp) (wantMore bool) {
877-
if ok, err := p.HasAnyPrivilege(ctx, desc); err != nil {
878-
iterErr = err
879-
return false
883+
// Because we have locked the lease manager while visiting every lease,
884+
// it not safe to *acquire* or *release* any leases in the process. As
885+
// a result, build an in memory cache of entries and process them after
886+
// all the locks have been released. Previously we would check privileges,
887+
// which would need to get the lease on the role_member table. Simply skipping
888+
// this check on that table will not be sufficient since any active lease
889+
// on it could expire or need an renewal and cause the sample problem.
890+
leaseEntries = append(leaseEntries, crdbInternalLeasesTableEntry{
891+
desc: desc,
892+
takenOffline: takenOffline,
893+
expiration: expiration,
894+
})
895+
return true
896+
})
897+
for _, entry := range leaseEntries {
898+
if ok, err := p.HasAnyPrivilege(ctx, entry.desc); err != nil {
899+
return err
880900
} else if !ok {
881-
return true
901+
continue
882902
}
883-
884903
if err := addRow(
885904
tree.NewDInt(tree.DInt(nodeID)),
886-
tree.NewDInt(tree.DInt(int64(desc.GetID()))),
887-
tree.NewDString(desc.GetName()),
888-
tree.NewDInt(tree.DInt(int64(desc.GetParentID()))),
889-
&expiration,
890-
tree.MakeDBool(tree.DBool(takenOffline)),
905+
tree.NewDInt(tree.DInt(int64(entry.desc.GetID()))),
906+
tree.NewDString(entry.desc.GetName()),
907+
tree.NewDInt(tree.DInt(int64(entry.desc.GetParentID()))),
908+
&entry.expiration,
909+
tree.MakeDBool(tree.DBool(entry.takenOffline)),
891910
); err != nil {
892-
iterErr = err
893-
return false
911+
return err
894912
}
895-
return true
896-
})
897-
return iterErr
913+
}
914+
return nil
898915
},
899916
}
900917

pkg/sql/logictest/testdata/logic_test/crdb_internal

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1647,3 +1647,19 @@ USE test;
16471647
DROP DATABASE test_cross_db;
16481648

16491649
subtest end
1650+
1651+
1652+
# In #119253 we observed a deadlock when visiting leases inside crdb_internal.lease
1653+
# between the lease manager and the authorization check for each descriptor. This
1654+
# test validates that this is no longer the case.
1655+
subtest allow_role_memberships_to_change_during_transaction
1656+
1657+
user testuser
1658+
1659+
statement ok
1660+
set allow_role_memberships_to_change_during_transaction=true
1661+
1662+
statement ok
1663+
SELECT * FROM crdb_internal.leases;
1664+
1665+
subtest end

0 commit comments

Comments
 (0)