
Commit 06e8b42

Michal Hocko authored and jb-essential committed
mm: allow GFP_{FS,IO} for page_cache_read page cache allocation
commit c20cd45eb01748f0fba77a504f956b000df4ea73 upstream.

page_cache_read has been historically using page_cache_alloc_cold to allocate a new page. This means that mapping_gfp_mask is used as the base for the gfp_mask. Many filesystems are setting this mask to GFP_NOFS to prevent fs recursion issues. page_cache_read is called from the vm_operations_struct::fault() context during the page fault. This context doesn't normally need the reclaim protection.

ceph and ocfs2, which call filemap_fault from their fault handlers, seem to be OK because they are not taking any fs lock before invoking the generic implementation. xfs, which takes XFS_MMAPLOCK_SHARED, is safe from the reclaim recursion POV because this lock serializes truncate and punch hole with the page faults and it doesn't get involved in the reclaim.

There is simply no reason to deliberately use a weaker allocation context when __GFP_FS | __GFP_IO can be used. The GFP_NOFS protection might even be harmful. There is a push to fail GFP_NOFS allocations rather than loop within the allocator indefinitely with a very limited reclaim ability. Once we start failing those requests, the OOM killer might be triggered prematurely because the page cache allocation failure is propagated up the page fault path and ends up in pagefault_out_of_memory.

We cannot play with mapping_gfp_mask directly because that would be racy wrt. parallel page faults and it might interfere with other users who really rely on the NOFS semantic from the stored gfp_mask. The mask is also inode proper, so changing it there would even be a layering violation. What we can do instead is to push the gfp_mask into struct vm_fault and allow the fs layer to overwrite it should the callback need to be called with a different allocation context.

Initialize the default to (mapping_gfp_mask | __GFP_FS | __GFP_IO) because this should be safe from the page fault path normally. Why do we care about mapping_gfp_mask at all then? Because it doesn't hold only reclaim protection flags; it might also contain zone and movability restrictions (GFP_DMA32, __GFP_MOVABLE and others), so we have to respect those.

Signed-off-by: Michal Hocko <[email protected]>
Reported-by: Tetsuo Handa <[email protected]>
Acked-by: Jan Kara <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
Cc: Tetsuo Handa <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Dave Chinner <[email protected]>
Cc: Mark Fasheh <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
Signed-off-by: Greg Kroah-Hartman <[email protected]>
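To make the default composition concrete, here is a minimal sketch (not part of the patch; the GFP_NOFS | GFP_DMA32 mapping and the helper name are invented for illustration) of how the fault path's mask is derived while zone restrictions survive:

/*
 * Sketch only (not in this commit): the default fault-path mask for an
 * invented mapping that stores GFP_NOFS | GFP_DMA32 in mapping_gfp_mask.
 */
static gfp_t example_default_fault_mask(struct address_space *mapping)
{
        /* stored == GFP_NOFS | GFP_DMA32 in this example */
        gfp_t stored = mapping_gfp_mask(mapping);

        /* fs/io reclaim is re-enabled; the GFP_DMA32 zone bit survives */
        return stored | __GFP_FS | __GFP_IO;   /* == GFP_KERNEL | GFP_DMA32 */
}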
Parent: 7533d39

3 files changed, 25 insertions(+), 5 deletions(-)

include/linux/mm.h

Lines changed: 4 additions & 0 deletions
@@ -237,10 +237,14 @@ extern pgprot_t protection_map[16];
  * ->fault function. The vma's ->fault is responsible for returning a bitmask
  * of VM_FAULT_xxx flags that give details about how the fault was handled.
  *
+ * MM layer fills up gfp_mask for page allocations but fault handler might
+ * alter it if its implementation requires a different allocation context.
+ *
  * pgoff should be used in favour of virtual_address, if possible.
  */
 struct vm_fault {
 	unsigned int flags;		/* FAULT_FLAG_xxx flags */
+	gfp_t gfp_mask;			/* gfp mask to be used for allocations */
 	pgoff_t pgoff;			/* Logical page offset based on vma */
 	void __user *virtual_address;	/* Faulting virtual address */

mm/filemap.c

Lines changed: 4 additions & 5 deletions
@@ -1829,19 +1829,18 @@ EXPORT_SYMBOL(generic_file_read_iter);
  * This adds the requested page to the page cache if it isn't already there,
  * and schedules an I/O to read in its contents from disk.
  */
-static int page_cache_read(struct file *file, pgoff_t offset)
+static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
 {
 	struct address_space *mapping = file->f_mapping;
 	struct page *page;
 	int ret;
 
 	do {
-		page = page_cache_alloc_cold(mapping);
+		page = __page_cache_alloc(gfp_mask|__GFP_COLD);
 		if (!page)
 			return -ENOMEM;
 
-		ret = add_to_page_cache_lru(page, mapping, offset,
-				mapping_gfp_constraint(mapping, GFP_KERNEL));
+		ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL);
 		if (ret == 0)
 			ret = mapping->a_ops->readpage(file, page);
 		else if (ret == -EEXIST)
@@ -2022,7 +2021,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * We're only likely to ever get here if MADV_RANDOM is in
 	 * effect.
 	 */
-	error = page_cache_read(file, offset);
+	error = page_cache_read(file, offset, vmf->gfp_mask);
 
 	/*
 	 * The page we want has now been added to the page cache.
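Note that gfp_mask & GFP_KERNEL plays the role mapping_gfp_constraint(mapping, GFP_KERNEL) played before: only the reclaim bits reach the radix-tree node allocation. A sketch of the two allocation contexts inside page_cache_read, continuing the invented GFP_KERNEL | GFP_DMA32 example from above:

        gfp_t gfp_mask = GFP_KERNEL | GFP_DMA32;  /* invented fault-path mask */

        /* the page itself honours the zone restriction, plus the cold hint */
        page = __page_cache_alloc(gfp_mask | __GFP_COLD);

        /*
         * the radix-tree node needs no zone placement, so GFP_DMA32 is
         * masked off and only the GFP_KERNEL reclaim bits remain
         */
        ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL);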

mm/memory.c

Lines changed: 17 additions & 0 deletions
@@ -1990,6 +1990,20 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 	copy_user_highpage(dst, src, va, vma);
 }
 
+static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
+{
+	struct file *vm_file = vma->vm_file;
+
+	if (vm_file)
+		return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
+
+	/*
+	 * Special mappings (e.g. VDSO) do not have any file so fake
+	 * a default GFP_KERNEL for them.
+	 */
+	return GFP_KERNEL;
+}
+
 /*
  * Notify the address space that the page is about to become writable so that
  * it can prohibit this or wait for the page to get into an appropriate state.
@@ -2005,6 +2019,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
 	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
 	vmf.pgoff = page->index;
 	vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+	vmf.gfp_mask = __get_fault_gfp_mask(vma);
 	vmf.page = page;
 	vmf.cow_page = NULL;
@@ -2771,6 +2786,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
 	vmf.page = NULL;
+	vmf.gfp_mask = __get_fault_gfp_mask(vma);
 	vmf.cow_page = cow_page;
 
 	ret = vma->vm_ops->fault(vma, &vmf);
@@ -2937,6 +2953,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
 	vmf.pgoff = pgoff;
 	vmf.max_pgoff = max_pgoff;
 	vmf.flags = flags;
+	vmf.gfp_mask = __get_fault_gfp_mask(vma);
 	vma->vm_ops->map_pages(vma, &vmf);
 }
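For completeness, a hedged summary of the defaults the three fault sites above now start from (the VDSO case is the one the new comment names; the GFP_NOFS and GFP_HIGHUSER_MOVABLE mappings are illustrative):

/*
 * vmf.gfp_mask after __get_fault_gfp_mask(vma):
 *
 *   vma->vm_file set, mapping stores GFP_NOFS
 *       -> GFP_NOFS | __GFP_FS | __GFP_IO == GFP_KERNEL
 *   vma->vm_file set, mapping stores GFP_HIGHUSER_MOVABLE
 *       -> unchanged (it already contains __GFP_FS | __GFP_IO)
 *   vma->vm_file == NULL (e.g. the VDSO special mapping)
 *       -> GFP_KERNEL
 */
vmf.gfp_mask = __get_fault_gfp_mask(vma);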
