From 1c59827d1da9bcd6970800d4f8a031b5859e8b4c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 19 Oct 2005 21:23:43 -0700 Subject: [PATCH] mm: hugetlb truncation fixes hugetlbfs allows truncation of its files (should it?), but hugetlb.c often forgets that: crashes and misaccounting ensue. copy_hugetlb_page_range better grab the src page_table_lock since we don't want to guess what happens if concurrently truncated. unmap_hugepage_range rss accounting must not assume the full range was mapped. follow_hugetlb_page must guard with page_table_lock and be prepared to exit early. Restyle copy_hugetlb_page_range with a for loop like the others there. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 901ac523a1c3..a1b30d45459e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -274,21 +274,22 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, { pte_t *src_pte, *dst_pte, entry; struct page *ptepage; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; + unsigned long addr; - while (addr < end) { + for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { dst_pte = huge_pte_alloc(dst, addr); if (!dst_pte) goto nomem; + spin_lock(&src->page_table_lock); src_pte = huge_pte_offset(src, addr); - BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */ - entry = *src_pte; - ptepage = pte_page(entry); - get_page(ptepage); - add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); - set_huge_pte_at(dst, addr, dst_pte, entry); - addr += HPAGE_SIZE; + if (src_pte && !pte_none(*src_pte)) { + entry = *src_pte; + ptepage = pte_page(entry); + get_page(ptepage); + add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); + set_huge_pte_at(dst, addr, dst_pte, entry); + } + spin_unlock(&src->page_table_lock); } return 0; @@ -323,8 +324,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, page = pte_page(pte); put_page(page); + add_mm_counter(mm, rss, - (HPAGE_SIZE / PAGE_SIZE)); } - add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT)); flush_tlb_range(vma, start, end); } @@ -403,6 +404,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, BUG_ON(!is_vm_hugetlb_page(vma)); vpfn = vaddr/PAGE_SIZE; + spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { if (pages) { @@ -415,8 +417,13 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * indexing below to work. */ pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); - /* hugetlb should be locked, and hence, prefaulted */ - WARN_ON(!pte || pte_none(*pte)); + /* the hugetlb file might have been truncated */ + if (!pte || pte_none(*pte)) { + remainder = 0; + if (!i) + i = -EFAULT; + break; + } page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; @@ -434,7 +441,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, --remainder; ++i; } - + spin_unlock(&mm->page_table_lock); *length = remainder; *position = vaddr; -- cgit v1.2.3 From ac9b9c667c2e1194e22ebe0a441ae1c37aaa9b90 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 20 Oct 2005 16:24:28 +0100 Subject: [PATCH] Fix handling spurious page fault for hugetlb region This reverts commit 3359b54c8c07338f3a863d1109b42eebccdcf379 and replaces it with a cleaner version that is purely based on page table operations, so that the synchronization between inode size and hugetlb mappings becomes moot. Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 16 +++------------- mm/hugetlb.c | 22 ++++++++++++++++++++++ mm/memory.c | 14 ++------------ 3 files changed, 27 insertions(+), 25 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 42cb7d70f9ac..d664330d900e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -25,6 +25,8 @@ int is_hugepage_mem_enough(size_t); unsigned long hugetlb_total_pages(void); struct page *alloc_huge_page(void); void free_huge_page(struct page *); +int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, int write_access); extern unsigned long max_huge_pages; extern const unsigned long hugetlb_zero, hugetlb_infinity; @@ -99,6 +101,7 @@ static inline unsigned long hugetlb_total_pages(void) do { } while (0) #define alloc_huge_page() ({ NULL; }) #define free_huge_page(p) ({ (void)(p); BUG(); }) +#define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) #ifndef HPAGE_MASK #define HPAGE_MASK 0 /* Keep the compiler happy */ @@ -155,24 +158,11 @@ static inline void set_file_hugepages(struct file *file) { file->f_op = &hugetlbfs_file_operations; } - -static inline int valid_hugetlb_file_off(struct vm_area_struct *vma, - unsigned long address) -{ - struct inode *inode = vma->vm_file->f_dentry->d_inode; - loff_t file_off = address - vma->vm_start; - - file_off += (vma->vm_pgoff << PAGE_SHIFT); - - return (file_off < inode->i_size); -} - #else /* !CONFIG_HUGETLBFS */ #define is_file_hugepages(file) 0 #define set_file_hugepages(file) BUG() #define hugetlb_zero_setup(size) ERR_PTR(-ENOSYS) -#define valid_hugetlb_file_off(vma, address) 0 #endif /* !CONFIG_HUGETLBFS */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a1b30d45459e..61d380678030 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -394,6 +394,28 @@ out: return ret; } +/* + * On ia64 at least, it is possible to receive a hugetlb fault from a + * stale zero entry left in the TLB from earlier hardware prefetching. + * Low-level arch code should already have flushed the stale entry as + * part of its fault handling, but we do need to accept this minor fault + * and return successfully. Whereas the "normal" case is that this is + * an access to a hugetlb page which has been truncated off since mmap. + */ +int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, int write_access) +{ + int ret = VM_FAULT_SIGBUS; + pte_t *pte; + + spin_lock(&mm->page_table_lock); + pte = huge_pte_offset(mm, address); + if (pte && !pte_none(*pte)) + ret = VM_FAULT_MINOR; + spin_unlock(&mm->page_table_lock); + return ret; +} + int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i) diff --git a/mm/memory.c b/mm/memory.c index 8c88b973abc5..1db40e935e55 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2045,18 +2045,8 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, inc_page_state(pgfault); - if (unlikely(is_vm_hugetlb_page(vma))) { - if (valid_hugetlb_file_off(vma, address)) - /* We get here only if there was a stale(zero) TLB entry - * (because of HW prefetching). - * Low-level arch code (if needed) should have already - * purged the stale entry as part of this fault handling. - * Here we just return. - */ - return VM_FAULT_MINOR; - else - return VM_FAULT_SIGBUS; /* mapping truncation does this. */ - } + if (unlikely(is_vm_hugetlb_page(vma))) + return hugetlb_fault(mm, vma, address, write_access); /* * We need the page table lock to synchronize with kswapd -- cgit v1.2.3