
Commit dc91652

gthelen authored and jb-essential committed
writeback: safer lock nesting
commit 2e898e4c0a3897ccd434adac5abb8330194f527b upstream.

lock_page_memcg()/unlock_page_memcg() use spin_lock_irqsave/restore() if
the page's memcg is undergoing move accounting, which occurs when a
process leaves its memcg for a new one that has
memory.move_charge_at_immigrate set.

unlocked_inode_to_wb_begin,end() use spin_lock_irq/spin_unlock_irq() if
the given inode is switching writeback domains.  Switches occur when
enough writes are issued from a new domain.

This existing pattern is thus suspicious:

    lock_page_memcg(page);
    unlocked_inode_to_wb_begin(inode, &locked);
    ...
    unlocked_inode_to_wb_end(inode, locked);
    unlock_page_memcg(page);

If an inode switch and a process memcg migration are both in-flight, then
unlocked_inode_to_wb_end() will unconditionally enable interrupts while
still holding the lock_page_memcg() irq spinlock.  This suggests the
possibility of deadlock if an interrupt occurs before unlock_page_memcg():

    truncate
    __cancel_dirty_page
    lock_page_memcg
    unlocked_inode_to_wb_begin
    unlocked_inode_to_wb_end
    <interrupts mistakenly enabled>
                                    <interrupt>
                                    end_page_writeback
                                    test_clear_page_writeback
                                    lock_page_memcg
                                    <deadlock>
    unlock_page_memcg

Due to configuration limitations this deadlock is not currently possible
because we don't mix cgroup writeback (a cgroupv2 feature) and
memory.move_charge_at_immigrate (a cgroupv1 feature).

If the kernel is hacked to always claim inode switching and memcg
moving_account, then this script triggers lockup in less than a minute:

    cd /mnt/cgroup/memory
    mkdir a b
    echo 1 > a/memory.move_charge_at_immigrate
    echo 1 > b/memory.move_charge_at_immigrate
    (
        echo $BASHPID > a/cgroup.procs
        while true; do
            dd if=/dev/zero of=/mnt/big bs=1M count=256
        done
    ) &
    while true; do
        sync
    done &
    sleep 1h &
    SLEEP=$!
    while true; do
        echo $SLEEP > a/cgroup.procs
        echo $SLEEP > b/cgroup.procs
    done

Given those configuration limitations, the deadlock does not seem possible,
so it's debatable whether there is any reason to modify the kernel.  I
suggest we do so anyway, to prevent future surprises.  And Wang Long said
"this deadlock occurs three times in our environment", so there's more
reason to apply this, even to stable.  Stable 4.4 has minor conflicts
applying this patch.  For a clean 4.4 patch see "[PATCH for-4.4]
writeback: safer lock nesting" https://lkml.org/lkml/2018/4/11/146

Wang Long said "this deadlock occurs three times in our environment"

[[email protected]: v4]
Link: http://lkml.kernel.org/r/[email protected]
[[email protected]: comment tweaks, struct initialization simplification]
Change-Id: Ibb773e8045852978f6207074491d262f1b3fb613
Link: http://lkml.kernel.org/r/[email protected]
Fixes: 682aa8e ("writeback: implement unlocked_inode_to_wb transaction and use it for stat updates")
Signed-off-by: Greg Thelen <[email protected]>
Reported-by: Wang Long <[email protected]>
Acked-by: Wang Long <[email protected]>
Acked-by: Michal Hocko <[email protected]>
Reviewed-by: Andrew Morton <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: Nicholas Piggin <[email protected]>
Cc: <[email protected]> [v4.2+]
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
[natechancellor: Applied to 4.4 based on Greg's backport on lkml.org]
Signed-off-by: Nathan Chancellor <[email protected]>
Signed-off-by: Greg Kroah-Hartman <[email protected]>
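For clarity, here is a minimal sketch of the same nesting pattern after this patch (same shape as the suspicious pattern quoted above; note that in this 4.4 backport the memcg lock pair is mem_cgroup_begin_page_stat()/mem_cgroup_end_page_stat() rather than lock_page_memcg()/unlock_page_memcg()):

    struct wb_lock_cookie cookie = {};

    lock_page_memcg(page);                           /* may take an irqsave spinlock */
    wb = unlocked_inode_to_wb_begin(inode, &cookie); /* irqsave if the inode is switching wbs */
    ...
    unlocked_inode_to_wb_end(inode, &cookie);        /* irqrestore: IRQs stay as the outer lock left them */
    unlock_page_memcg(page);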
1 parent 39c575b commit dc91652

File tree

4 files changed: +35, -26 lines

fs/fs-writeback.c

Lines changed: 4 additions & 3 deletions
@@ -747,11 +747,12 @@ int inode_congested(struct inode *inode, int cong_bits)
 	 */
 	if (inode && inode_to_wb_is_valid(inode)) {
 		struct bdi_writeback *wb;
-		bool locked, congested;
+		struct wb_lock_cookie lock_cookie = {};
+		bool congested;
 
-		wb = unlocked_inode_to_wb_begin(inode, &locked);
+		wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
 		congested = wb_congested(wb, cong_bits);
-		unlocked_inode_to_wb_end(inode, locked);
+		unlocked_inode_to_wb_end(inode, &lock_cookie);
 		return congested;
 	}
 
include/linux/backing-dev-defs.h

Lines changed: 5 additions & 0 deletions
@@ -191,6 +191,11 @@ static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync)
 	set_wb_congested(bdi->wb.congested, sync);
 }
 
+struct wb_lock_cookie {
+	bool locked;
+	unsigned long flags;
+};
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 /**

include/linux/backing-dev.h

Lines changed: 17 additions & 14 deletions
@@ -366,31 +366,31 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 /**
  * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
  * @inode: target inode
- * @lockedp: temp bool output param, to be passed to the end function
+ * @cookie: output param, to be passed to the end function
  *
  * The caller wants to access the wb associated with @inode but isn't
  * holding inode->i_lock, mapping->tree_lock or wb->list_lock.  This
  * function determines the wb associated with @inode and ensures that the
  * association doesn't change until the transaction is finished with
  * unlocked_inode_to_wb_end().
  *
- * The caller must call unlocked_inode_to_wb_end() with *@lockdep
- * afterwards and can't sleep during transaction.  IRQ may or may not be
- * disabled on return.
+ * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and
+ * can't sleep during the transaction.  IRQs may or may not be disabled on
+ * return.
  */
 static inline struct bdi_writeback *
-unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
 {
 	rcu_read_lock();
 
 	/*
 	 * Paired with store_release in inode_switch_wb_work_fn() and
 	 * ensures that we see the new wb if we see cleared I_WB_SWITCH.
 	 */
-	*lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
+	cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
 
-	if (unlikely(*lockedp))
-		spin_lock_irq(&inode->i_mapping->tree_lock);
+	if (unlikely(cookie->locked))
+		spin_lock_irqsave(&inode->i_mapping->tree_lock, cookie->flags);
 
 	/*
 	 * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock.
@@ -402,12 +402,14 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
 /**
  * unlocked_inode_to_wb_end - end inode wb access transaction
  * @inode: target inode
- * @locked: *@lockedp from unlocked_inode_to_wb_begin()
+ * @cookie: @cookie from unlocked_inode_to_wb_begin()
  */
-static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+static inline void unlocked_inode_to_wb_end(struct inode *inode,
+					    struct wb_lock_cookie *cookie)
 {
-	if (unlikely(locked))
-		spin_unlock_irq(&inode->i_mapping->tree_lock);
+	if (unlikely(cookie->locked))
+		spin_unlock_irqrestore(&inode->i_mapping->tree_lock,
+				       cookie->flags);
 
 	rcu_read_unlock();
 }
@@ -454,12 +456,13 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 }
 
 static inline struct bdi_writeback *
-unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
 {
 	return inode_to_wb(inode);
 }
 
-static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+static inline void unlocked_inode_to_wb_end(struct inode *inode,
+					    struct wb_lock_cookie *cookie)
 {
 }
 
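For reference, a caller-side sketch of the updated API, assuming a caller that only needs to adjust wb statistics (this mirrors the mm/page-writeback.c hunks below; the cookie lives on the caller's stack and is zero-initialized so ->locked defaults to false):

	struct bdi_writeback *wb;
	struct wb_lock_cookie cookie = {};

	wb = unlocked_inode_to_wb_begin(inode, &cookie);
	/* the inode->wb association is stable here; the caller must not sleep */
	dec_wb_stat(wb, WB_DIRTIED);
	unlocked_inode_to_wb_end(inode, &cookie);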
mm/page-writeback.c

Lines changed: 9 additions & 9 deletions
@@ -2526,13 +2526,13 @@ void account_page_redirty(struct page *page)
 	if (mapping && mapping_cap_account_dirty(mapping)) {
 		struct inode *inode = mapping->host;
 		struct bdi_writeback *wb;
-		bool locked;
+		struct wb_lock_cookie cookie = {};
 
-		wb = unlocked_inode_to_wb_begin(inode, &locked);
+		wb = unlocked_inode_to_wb_begin(inode, &cookie);
 		current->nr_dirtied--;
 		dec_zone_page_state(page, NR_DIRTIED);
 		dec_wb_stat(wb, WB_DIRTIED);
-		unlocked_inode_to_wb_end(inode, locked);
+		unlocked_inode_to_wb_end(inode, &cookie);
 	}
 }
 EXPORT_SYMBOL(account_page_redirty);
@@ -2638,15 +2638,15 @@ void cancel_dirty_page(struct page *page)
 		struct inode *inode = mapping->host;
 		struct bdi_writeback *wb;
 		struct mem_cgroup *memcg;
-		bool locked;
+		struct wb_lock_cookie cookie = {};
 
 		memcg = mem_cgroup_begin_page_stat(page);
-		wb = unlocked_inode_to_wb_begin(inode, &locked);
+		wb = unlocked_inode_to_wb_begin(inode, &cookie);
 
 		if (TestClearPageDirty(page))
 			account_page_cleaned(page, mapping, memcg, wb);
 
-		unlocked_inode_to_wb_end(inode, locked);
+		unlocked_inode_to_wb_end(inode, &cookie);
 		mem_cgroup_end_page_stat(memcg);
 	} else {
 		ClearPageDirty(page);
@@ -2679,7 +2679,7 @@ int clear_page_dirty_for_io(struct page *page)
 		struct inode *inode = mapping->host;
 		struct bdi_writeback *wb;
 		struct mem_cgroup *memcg;
-		bool locked;
+		struct wb_lock_cookie cookie = {};
 
 		/*
 		 * Yes, Virginia, this is indeed insane.
@@ -2717,14 +2717,14 @@ int clear_page_dirty_for_io(struct page *page)
 		 * exclusion.
 		 */
 		memcg = mem_cgroup_begin_page_stat(page);
-		wb = unlocked_inode_to_wb_begin(inode, &locked);
+		wb = unlocked_inode_to_wb_begin(inode, &cookie);
 		if (TestClearPageDirty(page)) {
 			mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 			dec_zone_page_state(page, NR_FILE_DIRTY);
 			dec_wb_stat(wb, WB_RECLAIMABLE);
 			ret = 1;
 		}
-		unlocked_inode_to_wb_end(inode, locked);
+		unlocked_inode_to_wb_end(inode, &cookie);
 		mem_cgroup_end_page_stat(memcg);
 		return ret;
 	}