From 723012cab779eee8228376754e22c6594229bf8f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox (Oracle) Date: Wed, 24 Jan 2024 17:52:44 +0000 Subject: ubifs: Set page uptodate in the correct place Page cache reads are lockless, so setting the freshly allocated page uptodate before we've overwritten it with the data it's supposed to have in it will allow a simultaneous reader to see old data. Move the call to SetPageUptodate into ubifs_write_end(), which is after we copied the new data into the page. Fixes: 1e51764a3c2a ("UBIFS: add new flash file system") Cc: stable@vger.kernel.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5029eb3390a5..d0694b83dd02 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -261,9 +261,6 @@ static int write_begin_slow(struct address_space *mapping, return err; } } - - SetPageUptodate(page); - ClearPageError(page); } if (PagePrivate(page)) @@ -463,9 +460,6 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, return err; } } - - SetPageUptodate(page); - ClearPageError(page); } err = allocate_budget(c, page, ui, appending); @@ -475,10 +469,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * If we skipped reading the page because we were going to * write all of it, then it is not up to date. */ - if (skipped_read) { + if (skipped_read) ClearPageChecked(page); - ClearPageUptodate(page); - } /* * Budgeting failed which means it would have to force * write-back but didn't, because we set the @fast flag in the @@ -569,6 +561,9 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, goto out; } + if (len == PAGE_SIZE) + SetPageUptodate(page); + if (!PagePrivate(page)) { attach_page_private(page, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); -- cgit v1.2.3 From 0df030d082d5b226984733b2e7386fa9760a7ca1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox (Oracle) Date: Wed, 24 Jan 2024 17:52:45 +0000 Subject: ubifs: Convert from writepage to writepages This is a simplistic conversion to separate out any effects of no longer having a writepage method. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index d0694b83dd02..2022a31006df 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1001,8 +1001,10 @@ static int do_writepage(struct page *page, int len) * on the page lock and it would not write the truncated inode node to the * journal before we have finished. */ -static int ubifs_writepage(struct page *page, struct writeback_control *wbc) +static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, + void *data) { + struct page *page = &folio->page; struct inode *inode = page->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); @@ -1074,6 +1076,12 @@ out_unlock: return err; } +static int ubifs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return write_cache_pages(mapping, wbc, ubifs_writepage, NULL); +} + /** * do_attr_changes - change inode attributes. * @inode: inode to change attributes for @@ -1643,7 +1651,7 @@ static int ubifs_symlink_getattr(struct mnt_idmap *idmap, const struct address_space_operations ubifs_file_address_operations = { .read_folio = ubifs_read_folio, - .writepage = ubifs_writepage, + .writepages = ubifs_writepages, .write_begin = ubifs_write_begin, .write_end = ubifs_write_end, .invalidate_folio = ubifs_invalidate_folio, -- cgit v1.2.3 From c35acef383f4a2f2cfc30a5d8d3b0d49a86a1f7f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox (Oracle) Date: Wed, 24 Jan 2024 17:52:46 +0000 Subject: ubifs: Convert ubifs_writepage to use a folio We still pass the page down to do_writepage(), but ubifs_writepage() itself is now large folio safe. It also contains far fewer hidden calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 2022a31006df..a4e8bec6c03c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1004,21 +1004,18 @@ static int do_writepage(struct page *page, int len) static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); loff_t i_size = i_size_read(inode), synced_i_size; - pgoff_t end_index = i_size >> PAGE_SHIFT; - int err, len = i_size & (PAGE_SIZE - 1); - void *kaddr; + int err, len = folio_size(folio); dbg_gen("ino %lu, pg %lu, pg flags %#lx", - inode->i_ino, page->index, page->flags); - ubifs_assert(c, PagePrivate(page)); + inode->i_ino, folio->index, folio->flags); + ubifs_assert(c, folio->private != NULL); - /* Is the page fully outside @i_size? (truncate in progress) */ - if (page->index > end_index || (page->index == end_index && !len)) { + /* Is the folio fully outside @i_size? (truncate in progress) */ + if (folio_pos(folio) >= i_size) { err = 0; goto out_unlock; } @@ -1027,9 +1024,9 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, synced_i_size = ui->synced_i_size; spin_unlock(&ui->ui_lock); - /* Is the page fully inside @i_size? */ - if (page->index < end_index) { - if (page->index >= synced_i_size >> PAGE_SHIFT) { + /* Is the folio fully inside i_size? */ + if (folio_pos(folio) + len <= i_size) { + if (folio_pos(folio) >= synced_i_size) { err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_redirty; @@ -1042,20 +1039,18 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, * with this. */ } - return do_writepage(page, PAGE_SIZE); + return do_writepage(&folio->page, len); } /* - * The page straddles @i_size. It must be zeroed out on each and every + * The folio straddles @i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - kaddr = kmap_atomic(page); - memset(kaddr + len, 0, PAGE_SIZE - len); - flush_dcache_page(page); - kunmap_atomic(kaddr); + len = i_size - folio_pos(folio); + folio_zero_segment(folio, len, folio_size(folio)); if (i_size > synced_i_size) { err = inode->i_sb->s_op->write_inode(inode, NULL); @@ -1063,16 +1058,16 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, goto out_redirty; } - return do_writepage(page, len); + return do_writepage(&folio->page, len); out_redirty: /* - * redirty_page_for_writepage() won't call ubifs_dirty_inode() because + * folio_redirty_for_writepage() won't call ubifs_dirty_inode() because * it passes I_DIRTY_PAGES flag while calling __mark_inode_dirty(), so * there is no need to do space budget for dirty inode. */ - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); out_unlock: - unlock_page(page); + folio_unlock(folio); return err; } -- cgit v1.2.3 From 783d074167711d8109131b66dd0b70743bfc2918 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox (Oracle) Date: Wed, 24 Jan 2024 17:52:47 +0000 Subject: ubifs: Use a folio in do_truncation() Convert from the old page APIs to the new folio APIs which saves a few hidden calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index a4e8bec6c03c..7039c9006f24 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1153,11 +1153,11 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, if (offset) { pgoff_t index = new_size >> PAGE_SHIFT; - struct page *page; + struct folio *folio; - page = find_lock_page(inode->i_mapping, index); - if (page) { - if (PageDirty(page)) { + folio = filemap_lock_folio(inode->i_mapping, index); + if (!IS_ERR(folio)) { + if (folio_test_dirty(folio)) { /* * 'ubifs_jnl_truncate()' will try to truncate * the last data node, but it contains @@ -1166,14 +1166,14 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, * 'ubifs_jnl_truncate()' will see an already * truncated (and up to date) data node. */ - ubifs_assert(c, PagePrivate(page)); + ubifs_assert(c, folio->private != NULL); - clear_page_dirty_for_io(page); + folio_clear_dirty_for_io(folio); if (UBIFS_BLOCKS_PER_PAGE_SHIFT) - offset = new_size & - (PAGE_SIZE - 1); - err = do_writepage(page, offset); - put_page(page); + offset = offset_in_folio(folio, + new_size); + err = do_writepage(&folio->page, offset); + folio_put(folio); if (err) goto out_budg; /* @@ -1186,8 +1186,8 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, * to 'ubifs_jnl_truncate()' to save it from * having to read it. */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } } } -- cgit v1.2.3 From 0c2d140c1f73f610c7e3f71888e1a50d770117e0 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox (Oracle) Date: Wed, 24 Jan 2024 17:52:48 +0000 Subject: ubifs: Convert do_writepage() to take a folio Replace the call to SetPageError() with a call to mapping_set_error(). Support large folios by using kmap_local_folio() and remapping each time we cross a page boundary. Saves a lot of hidden calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 56 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 26 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 7039c9006f24..70d391b72897 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -898,60 +898,64 @@ static int ubifs_read_folio(struct file *file, struct folio *folio) return 0; } -static int do_writepage(struct page *page, int len) +static int do_writepage(struct folio *folio, size_t len) { - int err = 0, i, blen; + int err = 0, blen; unsigned int block; void *addr; + size_t offset = 0; union ubifs_key key; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; #ifdef UBIFS_DEBUG struct ubifs_inode *ui = ubifs_inode(inode); spin_lock(&ui->ui_lock); - ubifs_assert(c, page->index <= ui->synced_i_size >> PAGE_SHIFT); + ubifs_assert(c, folio->index <= ui->synced_i_size >> PAGE_SHIFT); spin_unlock(&ui->ui_lock); #endif - /* Update radix tree tags */ - set_page_writeback(page); + folio_start_writeback(folio); - addr = kmap(page); - block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; - i = 0; - while (len) { - blen = min_t(int, len, UBIFS_BLOCK_SIZE); + addr = kmap_local_folio(folio, offset); + block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + for (;;) { + blen = min_t(size_t, len, UBIFS_BLOCK_SIZE); data_key_init(c, &key, inode->i_ino, block); err = ubifs_jnl_write_data(c, inode, &key, addr, blen); if (err) break; - if (++i >= UBIFS_BLOCKS_PER_PAGE) + len -= blen; + if (!len) break; block += 1; addr += blen; - len -= blen; + if (folio_test_highmem(folio) && !offset_in_page(addr)) { + kunmap_local(addr - blen); + offset += PAGE_SIZE; + addr = kmap_local_folio(folio, offset); + } } + kunmap_local(addr); if (err) { - SetPageError(page); - ubifs_err(c, "cannot write page %lu of inode %lu, error %d", - page->index, inode->i_ino, err); + mapping_set_error(folio->mapping, err); + ubifs_err(c, "cannot write folio %lu of inode %lu, error %d", + folio->index, inode->i_ino, err); ubifs_ro_mode(c, err); } - ubifs_assert(c, PagePrivate(page)); - if (PageChecked(page)) + ubifs_assert(c, folio->private != NULL); + if (folio_test_checked(folio)) release_new_page_budget(c); else release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - detach_page_private(page); - ClearPageChecked(page); + folio_detach_private(folio); + folio_clear_checked(folio); - kunmap(page); - unlock_page(page); - end_page_writeback(page); + folio_unlock(folio); + folio_end_writeback(folio); return err; } @@ -1039,7 +1043,7 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, * with this. */ } - return do_writepage(&folio->page, len); + return do_writepage(folio, len); } /* @@ -1058,7 +1062,7 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, goto out_redirty; } - return do_writepage(&folio->page, len); + return do_writepage(folio, len); out_redirty: /* * folio_redirty_for_writepage() won't call ubifs_dirty_inode() because @@ -1172,7 +1176,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, if (UBIFS_BLOCKS_PER_PAGE_SHIFT) offset = offset_in_folio(folio, new_size); - err = do_writepage(&folio->page, offset); + err = do_writepage(folio, offset); folio_put(folio); if (err) goto out_budg; -- cgit v1.2.3 From 85ffbf555794c2484b9ab440d3bc23638a0cb315 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox (Oracle) Date: Wed, 24 Jan 2024 17:52:49 +0000 Subject: ubifs: Convert ubifs_vm_page_mkwrite() to use a folio Replace six implicit calls to compound_head() with one. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- Documentation/mm/page_cache.rst | 10 ++++++++++ fs/ubifs/file.c | 36 ++++++++++++++++++------------------ 2 files changed, 28 insertions(+), 18 deletions(-) (limited to 'fs/ubifs') diff --git a/Documentation/mm/page_cache.rst b/Documentation/mm/page_cache.rst index 75eba7c431b2..138d61f869df 100644 --- a/Documentation/mm/page_cache.rst +++ b/Documentation/mm/page_cache.rst @@ -3,3 +3,13 @@ ========== Page Cache ========== + +The page cache is the primary way that the user and the rest of the kernel +interact with filesystems. It can be bypassed (e.g. with O_DIRECT), +but normal reads, writes and mmaps go through the page cache. + +Folios +====== + +The folio is the unit of memory management within the page cache. +Operations diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 70d391b72897..3c4a98476c23 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1514,14 +1514,14 @@ static bool ubifs_release_folio(struct folio *folio, gfp_t unused_gfp_flags) */ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vmf->vma->vm_file); struct ubifs_info *c = inode->i_sb->s_fs_info; struct timespec64 now = current_time(inode); struct ubifs_budget_req req = { .new_page = 1 }; int err, update_time; - dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index, + dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, folio->index, i_size_read(inode)); ubifs_assert(c, !c->ro_media && !c->ro_mount); @@ -1529,17 +1529,17 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) return VM_FAULT_SIGBUS; /* -EROFS */ /* - * We have not locked @page so far so we may budget for changing the - * page. Note, we cannot do this after we locked the page, because + * We have not locked @folio so far so we may budget for changing the + * folio. Note, we cannot do this after we locked the folio, because * budgeting may cause write-back which would cause deadlock. * - * At the moment we do not know whether the page is dirty or not, so we - * assume that it is not and budget for a new page. We could look at + * At the moment we do not know whether the folio is dirty or not, so we + * assume that it is not and budget for a new folio. We could look at * the @PG_private flag and figure this out, but we may race with write - * back and the page state may change by the time we lock it, so this + * back and the folio state may change by the time we lock it, so this * would need additional care. We do not bother with this at the * moment, although it might be good idea to do. Instead, we allocate - * budget for a new page and amend it later on if the page was in fact + * budget for a new folio and amend it later on if the folio was in fact * dirty. * * The budgeting-related logic of this function is similar to what we @@ -1562,21 +1562,21 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) return VM_FAULT_SIGBUS; } - lock_page(page); - if (unlikely(page->mapping != inode->i_mapping || - page_offset(page) > i_size_read(inode))) { - /* Page got truncated out from underneath us */ + folio_lock(folio); + if (unlikely(folio->mapping != inode->i_mapping || + folio_pos(folio) >= i_size_read(inode))) { + /* Folio got truncated out from underneath us */ goto sigbus; } - if (PagePrivate(page)) + if (folio->private) release_new_page_budget(c); else { - if (!PageChecked(page)) + if (!folio_test_checked(folio)) ubifs_convert_page_budget(c); - attach_page_private(page, (void *)1); + folio_attach_private(folio, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); - __set_page_dirty_nobuffers(page); + filemap_dirty_folio(folio->mapping, folio); } if (update_time) { @@ -1592,11 +1592,11 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) ubifs_release_dirty_inode_budget(c, ui); } - wait_for_stable_page(page); + folio_wait_stable(folio); return VM_FAULT_LOCKED; sigbus: - unlock_page(page); + folio_unlock(folio); ubifs_release_budget(c, &req); return VM_FAULT_SIGBUS; } -- cgit v1.2.3 From 2ec718435abb42901c83fbf62fc4d638d4078c27 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox (Oracle) Date: Wed, 24 Jan 2024 17:52:50 +0000 Subject: ubifs: Convert write_begin_slow() to use a folio Update to new APIs, removing several calls to compound_head() and including support for large folios. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 3c4a98476c23..cc4d2ec95b70 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -222,16 +222,16 @@ static int write_begin_slow(struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; struct ubifs_budget_req req = { .new_page = 1 }; int err, appending = !!(pos + len > inode->i_size); - struct page *page; + struct folio *folio; dbg_gen("ino %lu, pos %llu, len %u, i_size %lld", inode->i_ino, pos, len, inode->i_size); /* - * At the slow path we have to budget before locking the page, because - * budgeting may force write-back, which would wait on locked pages and - * deadlock if we had the page locked. At this point we do not know - * anything about the page, so assume that this is a new page which is + * At the slow path we have to budget before locking the folio, because + * budgeting may force write-back, which would wait on locked folios and + * deadlock if we had the folio locked. At this point we do not know + * anything about the folio, so assume that this is a new folio which is * written to a hole. This corresponds to largest budget. Later the * budget will be amended if this is not true. */ @@ -243,42 +243,43 @@ static int write_begin_slow(struct address_space *mapping, if (unlikely(err)) return err; - page = grab_cache_page_write_begin(mapping, index); - if (unlikely(!page)) { + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { ubifs_release_budget(c, &req); - return -ENOMEM; + return PTR_ERR(folio); } - if (!PageUptodate(page)) { - if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE) - SetPageChecked(page); + if (!folio_test_uptodate(folio)) { + if (pos == folio_pos(folio) && len >= folio_size(folio)) + folio_set_checked(folio); else { - err = do_readpage(page); + err = do_readpage(&folio->page); if (err) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); ubifs_release_budget(c, &req); return err; } } } - if (PagePrivate(page)) + if (folio->private) /* - * The page is dirty, which means it was budgeted twice: + * The folio is dirty, which means it was budgeted twice: * o first time the budget was allocated by the task which - * made the page dirty and set the PG_private flag; + * made the folio dirty and set the private field; * o and then we budgeted for it for the second time at the * very beginning of this function. * - * So what we have to do is to release the page budget we + * So what we have to do is to release the folio budget we * allocated. */ release_new_page_budget(c); - else if (!PageChecked(page)) + else if (!folio_test_checked(folio)) /* - * We are changing a page which already exists on the media. - * This means that changing the page does not make the amount + * We are changing a folio which already exists on the media. + * This means that changing the folio does not make the amount * of indexing information larger, and this part of the budget * which we have already acquired may be released. */ @@ -301,7 +302,7 @@ static int write_begin_slow(struct address_space *mapping, ubifs_release_dirty_inode_budget(c, ui); } - *pagep = page; + *pagep = &folio->page; return 0; } -- cgit v1.2.3 From f60d356e6c5f966af5ab86701c93eb2028a66b31 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox (Oracle) Date: Wed, 24 Jan 2024 17:52:51 +0000 Subject: ubifs: Convert ubifs_write_begin() to use a folio Save eight calls to compound_head() by using the new folio API. Remove a few assumptions that would break with large folios. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index cc4d2ec95b70..13c078bdf156 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -426,7 +426,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; int err, appending = !!(pos + len > inode->i_size); int skipped_read = 0; - struct page *page; + struct folio *folio; ubifs_assert(c, ubifs_inode(inode)->ui_size == inode->i_size); ubifs_assert(c, !c->ro_media && !c->ro_mount); @@ -435,13 +435,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, return -EROFS; /* Try out the fast-path part first */ - page = grab_cache_page_write_begin(mapping, index); - if (unlikely(!page)) - return -ENOMEM; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { /* The page is not loaded from the flash */ - if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE) { + if (pos == folio_pos(folio) && len >= folio_size(folio)) { /* * We change whole page so no need to load it. But we * do not know whether this page exists on the media or @@ -451,19 +452,19 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * media. Thus, we are setting the @PG_checked flag * here. */ - SetPageChecked(page); + folio_set_checked(folio); skipped_read = 1; } else { - err = do_readpage(page); + err = do_readpage(&folio->page); if (err) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return err; } } } - err = allocate_budget(c, page, ui, appending); + err = allocate_budget(c, &folio->page, ui, appending); if (unlikely(err)) { ubifs_assert(c, err == -ENOSPC); /* @@ -471,7 +472,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * write all of it, then it is not up to date. */ if (skipped_read) - ClearPageChecked(page); + folio_clear_checked(folio); /* * Budgeting failed which means it would have to force * write-back but didn't, because we set the @fast flag in the @@ -483,8 +484,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); mutex_unlock(&ui->ui_mutex); } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return write_begin_slow(mapping, pos, len, pagep); } @@ -495,9 +496,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * with @ui->ui_mutex locked if we are appending pages, and unlocked * otherwise. This is an optimization (slightly hacky though). */ - *pagep = page; + *pagep = &folio->page; return 0; - } /** -- cgit v1.2.3 From ffdff813d5b1fa738b5433c5698108c69cd4b71f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox (Oracle) Date: Wed, 24 Jan 2024 17:52:52 +0000 Subject: ubifs: Convert ubifs_write_end() to use a folio Convert the incoming page pointer to a folio and use it throughout, saving several calls to compound_head(). Also remove some PAGE_SIZE assumptions. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 13c078bdf156..feab95b59b05 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -530,6 +530,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -537,47 +538,47 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, int appending = !!(end_pos > inode->i_size); dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld", - inode->i_ino, pos, page->index, len, copied, inode->i_size); + inode->i_ino, pos, folio->index, len, copied, inode->i_size); - if (unlikely(copied < len && len == PAGE_SIZE)) { + if (unlikely(copied < len && !folio_test_uptodate(folio))) { /* - * VFS copied less data to the page that it intended and + * VFS copied less data to the folio than it intended and * declared in its '->write_begin()' call via the @len - * argument. If the page was not up-to-date, and @len was - * @PAGE_SIZE, the 'ubifs_write_begin()' function did + * argument. If the folio was not up-to-date, + * the 'ubifs_write_begin()' function did * not load it from the media (for optimization reasons). This - * means that part of the page contains garbage. So read the - * page now. + * means that part of the folio contains garbage. So read the + * folio now. */ dbg_gen("copied %d instead of %d, read page and repeat", copied, len); - cancel_budget(c, page, ui, appending); - ClearPageChecked(page); + cancel_budget(c, &folio->page, ui, appending); + folio_clear_checked(folio); /* * Return 0 to force VFS to repeat the whole operation, or the * error code if 'do_readpage()' fails. */ - copied = do_readpage(page); + copied = do_readpage(&folio->page); goto out; } - if (len == PAGE_SIZE) - SetPageUptodate(page); + if (len == folio_size(folio)) + folio_mark_uptodate(folio); - if (!PagePrivate(page)) { - attach_page_private(page, (void *)1); + if (!folio->private) { + folio_attach_private(folio, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); - __set_page_dirty_nobuffers(page); + filemap_dirty_folio(mapping, folio); } if (appending) { i_size_write(inode, end_pos); ui->ui_size = end_pos; /* - * Note, we do not set @I_DIRTY_PAGES (which means that the - * inode has dirty pages), this has been done in - * '__set_page_dirty_nobuffers()'. + * We do not set @I_DIRTY_PAGES (which means that + * the inode has dirty pages), this was done in + * filemap_dirty_folio(). */ __mark_inode_dirty(inode, I_DIRTY_DATASYNC); ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); @@ -585,8 +586,8 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, } out: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return copied; } -- cgit v1.2.3 From b96af1fdb47cf85eaddc3c82413b580b75aae2d2 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox (Oracle) Date: Wed, 24 Jan 2024 17:52:53 +0000 Subject: ubifs: Convert do_readpage() to take a folio All the callers now have a folio, so pass it in, and convert do_readpage() to us folios directly. Includes unifying the exit paths from this function and using kmap_local instead of plain kmap. This function should now work with large folios, but this is not tested. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 64 +++++++++++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 34 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index feab95b59b05..654af636b11d 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -96,36 +96,36 @@ dump: return -EINVAL; } -static int do_readpage(struct page *page) +static int do_readpage(struct folio *folio) { void *addr; int err = 0, i; unsigned int block, beyond; - struct ubifs_data_node *dn; - struct inode *inode = page->mapping->host; + struct ubifs_data_node *dn = NULL; + struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; loff_t i_size = i_size_read(inode); dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", - inode->i_ino, page->index, i_size, page->flags); - ubifs_assert(c, !PageChecked(page)); - ubifs_assert(c, !PagePrivate(page)); + inode->i_ino, folio->index, i_size, folio->flags); + ubifs_assert(c, !folio_test_checked(folio)); + ubifs_assert(c, !folio->private); - addr = kmap(page); + addr = kmap_local_folio(folio, 0); - block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; if (block >= beyond) { /* Reading beyond inode */ - SetPageChecked(page); - memset(addr, 0, PAGE_SIZE); + folio_set_checked(folio); + addr = folio_zero_tail(folio, 0, addr); goto out; } dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS); if (!dn) { err = -ENOMEM; - goto error; + goto out; } i = 0; @@ -150,39 +150,35 @@ static int do_readpage(struct page *page) memset(addr + ilen, 0, dlen - ilen); } } - if (++i >= UBIFS_BLOCKS_PER_PAGE) + if (++i >= (UBIFS_BLOCKS_PER_PAGE << folio_order(folio))) break; block += 1; addr += UBIFS_BLOCK_SIZE; + if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) { + kunmap_local(addr - UBIFS_BLOCK_SIZE); + addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE); + } } + if (err) { struct ubifs_info *c = inode->i_sb->s_fs_info; if (err == -ENOENT) { /* Not found, so it must be a hole */ - SetPageChecked(page); + folio_set_checked(folio); dbg_gen("hole"); - goto out_free; + err = 0; + } else { + ubifs_err(c, "cannot read page %lu of inode %lu, error %d", + folio->index, inode->i_ino, err); } - ubifs_err(c, "cannot read page %lu of inode %lu, error %d", - page->index, inode->i_ino, err); - goto error; } -out_free: - kfree(dn); out: - SetPageUptodate(page); - ClearPageError(page); - flush_dcache_page(page); - kunmap(page); - return 0; - -error: kfree(dn); - ClearPageUptodate(page); - SetPageError(page); - flush_dcache_page(page); - kunmap(page); + if (!err) + folio_mark_uptodate(folio); + flush_dcache_folio(folio); + kunmap_local(addr); return err; } @@ -254,7 +250,7 @@ static int write_begin_slow(struct address_space *mapping, if (pos == folio_pos(folio) && len >= folio_size(folio)) folio_set_checked(folio); else { - err = do_readpage(&folio->page); + err = do_readpage(folio); if (err) { folio_unlock(folio); folio_put(folio); @@ -455,7 +451,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, folio_set_checked(folio); skipped_read = 1; } else { - err = do_readpage(&folio->page); + err = do_readpage(folio); if (err) { folio_unlock(folio); folio_put(folio); @@ -559,7 +555,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, * Return 0 to force VFS to repeat the whole operation, or the * error code if 'do_readpage()' fails. */ - copied = do_readpage(&folio->page); + copied = do_readpage(folio); goto out; } @@ -895,7 +891,7 @@ static int ubifs_read_folio(struct file *file, struct folio *folio) if (ubifs_bulk_read(page)) return 0; - do_readpage(page); + do_readpage(folio); folio_unlock(folio); return 0; } -- cgit v1.2.3 From a3c2f196cdfc21404e9678204f55ad7be8e7226e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox (Oracle) Date: Wed, 24 Jan 2024 17:52:54 +0000 Subject: ubifs: Convert allocate_budget() to work on a folio The one caller has a folio, so pass it in instead of the page. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 654af636b11d..5542d73db717 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -305,7 +305,7 @@ static int write_begin_slow(struct address_space *mapping, /** * allocate_budget - allocate budget for 'ubifs_write_begin()'. * @c: UBIFS file-system description object - * @page: page to allocate budget for + * @folio: folio to allocate budget for * @ui: UBIFS inode object the page belongs to * @appending: non-zero if the page is appended * @@ -316,15 +316,15 @@ static int write_begin_slow(struct address_space *mapping, * * Returns: %0 in case of success and %-ENOSPC in case of failure. */ -static int allocate_budget(struct ubifs_info *c, struct page *page, +static int allocate_budget(struct ubifs_info *c, struct folio *folio, struct ubifs_inode *ui, int appending) { struct ubifs_budget_req req = { .fast = 1 }; - if (PagePrivate(page)) { + if (folio->private) { if (!appending) /* - * The page is dirty and we are not appending, which + * The folio is dirty and we are not appending, which * means no budget is needed at all. */ return 0; @@ -348,11 +348,11 @@ static int allocate_budget(struct ubifs_info *c, struct page *page, */ req.dirtied_ino = 1; } else { - if (PageChecked(page)) + if (folio_test_checked(folio)) /* * The page corresponds to a hole and does not * exist on the media. So changing it makes - * make the amount of indexing information + * the amount of indexing information * larger, and we have to budget for a new * page. */ @@ -460,7 +460,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, } } - err = allocate_budget(c, &folio->page, ui, appending); + err = allocate_budget(c, folio, ui, appending); if (unlikely(err)) { ubifs_assert(c, err == -ENOSPC); /* -- cgit v1.2.3 From 45d76698d119fffcd4d39d1cfcbfe30a87a7df77 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox (Oracle) Date: Wed, 24 Jan 2024 17:52:55 +0000 Subject: ubifs: Convert cancel_budget() to take a folio The one caller already has a folio, so pass it in instead of the page. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5542d73db717..9cd2e6d37c5d 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -499,14 +499,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, /** * cancel_budget - cancel budget. * @c: UBIFS file-system description object - * @page: page to cancel budget for + * @folio: folio to cancel budget for * @ui: UBIFS inode object the page belongs to * @appending: non-zero if the page is appended * * This is a helper function for a page write operation. It unlocks the * @ui->ui_mutex in case of appending. */ -static void cancel_budget(struct ubifs_info *c, struct page *page, +static void cancel_budget(struct ubifs_info *c, struct folio *folio, struct ubifs_inode *ui, int appending) { if (appending) { @@ -514,8 +514,8 @@ static void cancel_budget(struct ubifs_info *c, struct page *page, ubifs_release_dirty_inode_budget(c, ui); mutex_unlock(&ui->ui_mutex); } - if (!PagePrivate(page)) { - if (PageChecked(page)) + if (!folio->private) { + if (folio_test_checked(folio)) release_new_page_budget(c); else release_existing_page_budget(c); @@ -548,7 +548,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, */ dbg_gen("copied %d instead of %d, read page and repeat", copied, len); - cancel_budget(c, &folio->page, ui, appending); + cancel_budget(c, folio, ui, appending); folio_clear_checked(folio); /* -- cgit v1.2.3 From 7f348f8ce51c963d99a66d00ab73717f35dd27e1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox (Oracle) Date: Wed, 24 Jan 2024 17:52:56 +0000 Subject: ubifs: Pass a folio into ubifs_bulk_read() and ubifs_do_bulk_read() This saves a single call to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 9cd2e6d37c5d..f481fca7f53d 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -703,15 +703,15 @@ out_err: * ubifs_do_bulk_read - do bulk-read. * @c: UBIFS file-system description object * @bu: bulk-read information - * @page1: first page to read + * @folio1: first folio to read * * Returns: %1 if the bulk-read is done, otherwise %0 is returned. */ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, - struct page *page1) + struct folio *folio1) { - pgoff_t offset = page1->index, end_index; - struct address_space *mapping = page1->mapping; + pgoff_t offset = folio1->index, end_index; + struct address_space *mapping = folio1->mapping; struct inode *inode = mapping->host; struct ubifs_inode *ui = ubifs_inode(inode); int err, page_idx, page_cnt, ret = 0, n = 0; @@ -761,11 +761,11 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, goto out_warn; } - err = populate_page(c, page1, bu, &n); + err = populate_page(c, &folio1->page, bu, &n); if (err) goto out_warn; - unlock_page(page1); + folio_unlock(folio1); ret = 1; isize = i_size_read(inode); @@ -810,7 +810,7 @@ out_bu_off: /** * ubifs_bulk_read - determine whether to bulk-read and, if so, do it. - * @page: page from which to start bulk-read. + * @folio: folio from which to start bulk-read. * * Some flash media are capable of reading sequentially at faster rates. UBIFS * bulk-read facility is designed to take advantage of that, by reading in one @@ -819,12 +819,12 @@ out_bu_off: * * Returns: %1 if a bulk-read is done and %0 otherwise. */ -static int ubifs_bulk_read(struct page *page) +static int ubifs_bulk_read(struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); - pgoff_t index = page->index, last_page_read = ui->last_page_read; + pgoff_t index = folio->index, last_page_read = ui->last_page_read; struct bu_info *bu; int err = 0, allocated = 0; @@ -872,8 +872,8 @@ static int ubifs_bulk_read(struct page *page) bu->buf_len = c->max_bu_buf_len; data_key_init(c, &bu->key, inode->i_ino, - page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT); - err = ubifs_do_bulk_read(c, bu, page); + folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT); + err = ubifs_do_bulk_read(c, bu, folio); if (!allocated) mutex_unlock(&c->bu_mutex); @@ -887,9 +887,7 @@ out_unlock: static int ubifs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - - if (ubifs_bulk_read(page)) + if (ubifs_bulk_read(folio)) return 0; do_readpage(folio); folio_unlock(folio); -- cgit v1.2.3 From d06192731c335cb7b62ad0dd211de26e03d56f3f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox (Oracle) Date: Wed, 24 Jan 2024 17:52:57 +0000 Subject: ubifs: Use a folio in ubifs_do_bulk_read() When looking in the page cache, retrieve a folio instead of a page. This would need some work to make it safe for large folios. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f481fca7f53d..11bf9f8cca6c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -775,19 +775,19 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, for (page_idx = 1; page_idx < page_cnt; page_idx++) { pgoff_t page_offset = offset + page_idx; - struct page *page; + struct folio *folio; if (page_offset > end_index) break; - page = pagecache_get_page(mapping, page_offset, + folio = __filemap_get_folio(mapping, page_offset, FGP_LOCK|FGP_ACCESSED|FGP_CREAT|FGP_NOWAIT, ra_gfp_mask); - if (!page) + if (IS_ERR(folio)) break; - if (!PageUptodate(page)) - err = populate_page(c, page, bu, &n); - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) + err = populate_page(c, &folio->page, bu, &n); + folio_unlock(folio); + folio_put(folio); if (err) break; } -- cgit v1.2.3 From a16bfab367c60f1a66be03676a374451dc022b37 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox (Oracle) Date: Wed, 24 Jan 2024 17:52:58 +0000 Subject: ubifs: Convert populate_page() to take a folio Both callers now have a folio, so pass it in. This function contains several assumptions that folios are not large. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 11bf9f8cca6c..a1f46919934c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -590,35 +590,35 @@ out: /** * populate_page - copy data nodes into a page for bulk-read. * @c: UBIFS file-system description object - * @page: page + * @folio: folio * @bu: bulk-read information * @n: next zbranch slot * * Returns: %0 on success and a negative error code on failure. */ -static int populate_page(struct ubifs_info *c, struct page *page, +static int populate_page(struct ubifs_info *c, struct folio *folio, struct bu_info *bu, int *n) { int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; loff_t i_size = i_size_read(inode); unsigned int page_block; void *addr, *zaddr; pgoff_t end_index; dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", - inode->i_ino, page->index, i_size, page->flags); + inode->i_ino, folio->index, i_size, folio->flags); - addr = zaddr = kmap(page); + addr = zaddr = kmap_local_folio(folio, 0); end_index = (i_size - 1) >> PAGE_SHIFT; - if (!i_size || page->index > end_index) { + if (!i_size || folio->index > end_index) { hole = 1; - memset(addr, 0, PAGE_SIZE); + addr = folio_zero_tail(folio, 0, addr); goto out_hole; } - page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + page_block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; while (1) { int err, len, out_len, dlen; @@ -667,9 +667,13 @@ static int populate_page(struct ubifs_info *c, struct page *page, break; addr += UBIFS_BLOCK_SIZE; page_block += 1; + if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) { + kunmap_local(addr - UBIFS_BLOCK_SIZE); + addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE); + } } - if (end_index == page->index) { + if (end_index == folio->index) { int len = i_size & (PAGE_SIZE - 1); if (len && len < read) @@ -678,22 +682,19 @@ static int populate_page(struct ubifs_info *c, struct page *page, out_hole: if (hole) { - SetPageChecked(page); + folio_set_checked(folio); dbg_gen("hole"); } - SetPageUptodate(page); - ClearPageError(page); - flush_dcache_page(page); - kunmap(page); + folio_mark_uptodate(folio); + flush_dcache_folio(folio); + kunmap_local(addr); *n = nn; return 0; out_err: - ClearPageUptodate(page); - SetPageError(page); - flush_dcache_page(page); - kunmap(page); + flush_dcache_folio(folio); + kunmap_local(addr); ubifs_err(c, "bad data node (block %u, inode %lu)", page_block, inode->i_ino); return -EINVAL; @@ -761,7 +762,7 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, goto out_warn; } - err = populate_page(c, &folio1->page, bu, &n); + err = populate_page(c, folio1, bu, &n); if (err) goto out_warn; @@ -785,7 +786,7 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, if (IS_ERR(folio)) break; if (!folio_test_uptodate(folio)) - err = populate_page(c, &folio->page, bu, &n); + err = populate_page(c, folio, bu, &n); folio_unlock(folio); folio_put(folio); if (err) -- cgit v1.2.3 From 60f16e912a53ad78306ca7cf0c95913293042411 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 13 Feb 2024 10:54:07 +0100 Subject: ubifs: fix sort function prototype The global sort() function expects a callback pointer to a function with two void* arguments, but ubifs has a function with specific object types, which causes a warning in clang-16 and higher: fs/ubifs/lprops.c:1272:9: error: cast from 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, struct ubifs_lp_stats *)' to 'ubifs_lpt_scan_callback' (aka 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, void *)') converts to incompatible function type [-Werror,-Wcast-function-type-strict] 1272 | (ubifs_lpt_scan_callback)scan_check_cb, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Change the prototype to the regular one and cast the object pointers locally instead. Fixes: 1e51764a3c2a ("UBIFS: add new flash file system") Signed-off-by: Arnd Bergmann Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/find.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 873e6e1c92b5..1cb79b167a4f 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -726,11 +726,10 @@ out: return err; } -static int cmp_dirty_idx(const struct ubifs_lprops **a, - const struct ubifs_lprops **b) +static int cmp_dirty_idx(const void *a, const void *b) { - const struct ubifs_lprops *lpa = *a; - const struct ubifs_lprops *lpb = *b; + const struct ubifs_lprops *lpa = *(const struct ubifs_lprops **)a; + const struct ubifs_lprops *lpb = *(const struct ubifs_lprops **)b; return lpa->dirty + lpa->free - lpb->dirty - lpb->free; } @@ -754,7 +753,7 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c) sizeof(void *) * c->dirty_idx.cnt); /* Sort it so that the dirtiest is now at the end */ sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *), - (int (*)(const void *, const void *))cmp_dirty_idx, NULL); + cmp_dirty_idx, NULL); dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt); if (c->dirty_idx.cnt) dbg_find("dirtiest index LEB is %d with dirty %d and free %d", -- cgit v1.2.3 From ec724e534dfdd592abc5ac066be77ef15c455ccc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 13 Feb 2024 10:54:08 +0100 Subject: ubifs: fix function pointer cast warnings ubifs has a number of callback functions for ubifs_lpt_scan_nolock() using two different prototypes, either passing a struct scan_data or a struct ubifs_lp_stats, but the caller expects a void pointer instead. clang-16 now warns about this: fs/ubifs/find.c:170:9: error: cast from 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, struct scan_data *)' to 'ubifs_lpt_scan_callback' (aka 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, void *)') converts to incompatible function type [-Werror,-Wcast-function-type-strict] 170 | (ubifs_lpt_scan_callback)scan_for_dirty_cb, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ fs/ubifs/find.c:449:9: error: cast from 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, struct scan_data *)' to 'ubifs_lpt_scan_callback' (aka 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, void *)') converts to incompatible function type [-Werror,-Wcast-function-type-strict] 449 | (ubifs_lpt_scan_callback)scan_for_free_cb, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Change all of these callback functions to actually take the void * argument that is passed by their caller. Signed-off-by: Arnd Bergmann Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/find.c | 23 ++++++++++++----------- fs/ubifs/lprops.c | 6 +++--- 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 1cb79b167a4f..6ebf3c04ac5f 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -82,8 +82,9 @@ static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops) */ static int scan_for_dirty_cb(struct ubifs_info *c, const struct ubifs_lprops *lprops, int in_tree, - struct scan_data *data) + void *arg) { + struct scan_data *data = arg; int ret = LPT_SCAN_CONTINUE; /* Exclude LEBs that are currently in use */ @@ -166,8 +167,7 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, data.pick_free = pick_free; data.lnum = -1; data.exclude_index = exclude_index; - err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, - (ubifs_lpt_scan_callback)scan_for_dirty_cb, + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_for_dirty_cb, &data); if (err) return ERR_PTR(err); @@ -349,8 +349,9 @@ out: */ static int scan_for_free_cb(struct ubifs_info *c, const struct ubifs_lprops *lprops, int in_tree, - struct scan_data *data) + void *arg) { + struct scan_data *data = arg; int ret = LPT_SCAN_CONTINUE; /* Exclude LEBs that are currently in use */ @@ -446,7 +447,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, data.pick_free = pick_free; data.lnum = -1; err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, - (ubifs_lpt_scan_callback)scan_for_free_cb, + scan_for_free_cb, &data); if (err) return ERR_PTR(err); @@ -589,8 +590,9 @@ out: */ static int scan_for_idx_cb(struct ubifs_info *c, const struct ubifs_lprops *lprops, int in_tree, - struct scan_data *data) + void *arg) { + struct scan_data *data = arg; int ret = LPT_SCAN_CONTINUE; /* Exclude LEBs that are currently in use */ @@ -625,8 +627,7 @@ static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c) int err; data.lnum = -1; - err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, - (ubifs_lpt_scan_callback)scan_for_idx_cb, + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_for_idx_cb, &data); if (err) return ERR_PTR(err); @@ -781,8 +782,9 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c) */ static int scan_dirty_idx_cb(struct ubifs_info *c, const struct ubifs_lprops *lprops, int in_tree, - struct scan_data *data) + void *arg) { + struct scan_data *data = arg; int ret = LPT_SCAN_CONTINUE; /* Exclude LEBs that are currently in use */ @@ -841,8 +843,7 @@ static int find_dirty_idx_leb(struct ubifs_info *c) if (c->pnodes_have >= c->pnode_cnt) /* All pnodes are in memory, so skip scan */ return -ENOSPC; - err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, - (ubifs_lpt_scan_callback)scan_dirty_idx_cb, + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_dirty_idx_cb, &data); if (err) return err; diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 6d6cd85c2b4c..a11c3dab7e16 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -1014,8 +1014,9 @@ out: */ static int scan_check_cb(struct ubifs_info *c, const struct ubifs_lprops *lp, int in_tree, - struct ubifs_lp_stats *lst) + void *arg) { + struct ubifs_lp_stats *lst = arg; struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret; @@ -1269,8 +1270,7 @@ int dbg_check_lprops(struct ubifs_info *c) memset(&lst, 0, sizeof(struct ubifs_lp_stats)); err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1, - (ubifs_lpt_scan_callback)scan_check_cb, - &lst); + scan_check_cb, &lst); if (err && err != -ENOSPC) goto out; -- cgit v1.2.3 From 788cd161f99638db81b4c9ce836dbdf9a65c7701 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Thu, 11 Jan 2024 14:36:56 +0800 Subject: ubifs: Remove unreachable code in dbg_check_ltab_lnum Because there is no break statement in the dead loop above,it is impossible to execute the 'err=0' statement.Delete the code that will never execute. Fixes: 6fb324a4b0c3 ("UBIFS: allocate ltab checking buffer on demand") Signed-off-by: Kunwu Chan Cc: Kunwu Chan Suggested-by: Richard Weinberger Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/lpt_commit.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index c4d079328b92..07351fdce722 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1646,7 +1646,6 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) len -= node_len; } - err = 0; out: vfree(buf); return err; -- cgit v1.2.3 From 31a9d5f3290c5483d41c6b5c8ebc0add130da770 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 8 Jan 2024 10:41:04 +0800 Subject: ubifs: dbg_check_idx_size: Fix kmemleak if loading znode failed If function dbg_check_idx_size() failed by loading znode in mounting process, there are two problems: 1. Allocated znodes won't be freed, which causes kmemleak in kernel: ubifs_mount dbg_check_idx_size dbg_walk_index c->zroot.znode = ubifs_load_znode child = ubifs_load_znode // failed // Loaded znodes won't be freed in error handling path. 2. Global variable ubifs_clean_zn_cnt is not decreased, because ubifs_tnc_close() is not invoked in error handling path, which triggers a warning in ubifs_exit(): WARNING: CPU: 1 PID: 1576 at fs/ubifs/super.c:2486 ubifs_exit Modules linked in: zstd ubifs(-) ubi nandsim CPU: 1 PID: 1576 Comm: rmmod Not tainted 6.7.0-rc6 Call Trace: ubifs_exit+0xca/0xc70 [ubifs] __do_sys_delete_module+0x29a/0x4a0 do_syscall_64+0x6f/0x140 Fix it by adding error handling path in dbg_check_idx_size() to release tnc tree. Fixes: 1e51764a3c2a ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Suggested-by: Richard Weinberger Signed-off-by: Richard Weinberger --- fs/ubifs/debug.c | 9 +++++++-- fs/ubifs/tnc.c | 9 +-------- fs/ubifs/tnc_misc.c | 22 ++++++++++++++++++++++ fs/ubifs/ubifs.h | 1 + 4 files changed, 31 insertions(+), 10 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index d013c5b3f1ed..ac77ac1fd73e 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -1742,17 +1742,22 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) err = dbg_walk_index(c, NULL, add_size, &calc); if (err) { ubifs_err(c, "error %d while walking the index", err); - return err; + goto out_err; } if (calc != idx_size) { ubifs_err(c, "index size check failed: calculated size is %lld, should be %lld", calc, idx_size); dump_stack(); - return -EINVAL; + err = -EINVAL; + goto out_err; } return 0; + +out_err: + ubifs_destroy_tnc_tree(c); + return err; } /** diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index f4728e65d1bd..45cacdcd4746 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -3116,14 +3116,7 @@ static void tnc_destroy_cnext(struct ubifs_info *c) void ubifs_tnc_close(struct ubifs_info *c) { tnc_destroy_cnext(c); - if (c->zroot.znode) { - long n, freed; - - n = atomic_long_read(&c->clean_zn_cnt); - freed = ubifs_destroy_tnc_subtree(c, c->zroot.znode); - ubifs_assert(c, freed == n); - atomic_long_sub(n, &ubifs_clean_zn_cnt); - } + ubifs_destroy_tnc_tree(c); kfree(c->gap_lebs); kfree(c->ilebs); destroy_old_idx(c); diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index 4d686e34e64d..d3f8a6aa1f49 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -250,6 +250,28 @@ long ubifs_destroy_tnc_subtree(const struct ubifs_info *c, } } +/** + * ubifs_destroy_tnc_tree - destroy all znodes connected to the TNC tree. + * @c: UBIFS file-system description object + * + * This function destroys the whole TNC tree and updates clean global znode + * count. + */ +void ubifs_destroy_tnc_tree(struct ubifs_info *c) +{ + long n, freed; + + if (!c->zroot.znode) + return; + + n = atomic_long_read(&c->clean_zn_cnt); + freed = ubifs_destroy_tnc_subtree(c, c->zroot.znode); + ubifs_assert(c, freed == n); + atomic_long_sub(n, &ubifs_clean_zn_cnt); + + c->zroot.znode = NULL; +} + /** * read_znode - read an indexing node from flash and fill znode. * @c: UBIFS file-system description object diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 3916dc4f30ca..6eba287ae66c 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1903,6 +1903,7 @@ struct ubifs_znode *ubifs_tnc_postorder_next(const struct ubifs_info *c, struct ubifs_znode *znode); long ubifs_destroy_tnc_subtree(const struct ubifs_info *c, struct ubifs_znode *zr); +void ubifs_destroy_tnc_tree(struct ubifs_info *c); struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr, struct ubifs_znode *parent, int iip); -- cgit v1.2.3 From 6379b44cdcd67f5f5d986b73953e99700591edfa Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 8 Jan 2024 10:41:05 +0800 Subject: ubifs: ubifs_symlink: Fix memleak of inode->i_link in error path For error handling path in ubifs_symlink(), inode will be marked as bad first, then iput() is invoked. If inode->i_link is initialized by fscrypt_encrypt_symlink() in encryption scenario, inode->i_link won't be freed by callchain ubifs_free_inode -> fscrypt_free_inode in error handling path, because make_bad_inode() has changed 'inode->i_mode' as 'S_IFREG'. Following kmemleak is easy to be reproduced by injecting error in ubifs_jnl_update() when doing symlink in encryption scenario: unreferenced object 0xffff888103da3d98 (size 8): comm "ln", pid 1692, jiffies 4294914701 (age 12.045s) backtrace: kmemdup+0x32/0x70 __fscrypt_encrypt_symlink+0xed/0x1c0 ubifs_symlink+0x210/0x300 [ubifs] vfs_symlink+0x216/0x360 do_symlinkat+0x11a/0x190 do_syscall_64+0x3b/0xe0 There are two ways fixing it: 1. Remove make_bad_inode() in error handling path. We can do that because ubifs_evict_inode() will do same processes for good symlink inode and bad symlink inode, for inode->i_nlink checking is before is_bad_inode(). 2. Free inode->i_link before marking inode bad. Method 2 is picked, it has less influence, personally, I think. Cc: stable@vger.kernel.org Fixes: 2c58d548f570 ("fscrypt: cache decrypted symlink target in ->i_link") Signed-off-by: Zhihao Cheng Suggested-by: Eric Biggers Reviewed-by: Eric Biggers Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ubifs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e413a9cf8ee3..6b3db00d9b12 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1134,6 +1134,8 @@ out_cancel: dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); out_inode: + /* Free inode->i_link before inode is marked as bad. */ + fscrypt_free_inode(inode); make_bad_inode(inode); iput(inode); out_fname: -- cgit v1.2.3 From 556c19f563b64dd5463b0030d19c8681f4f7446e Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 22 Jan 2024 14:31:03 +0800 Subject: ubifs: Queue up space reservation tasks if retrying many times Recently we catched ENOSPC returned by make_reservation() while doing fsstress on UBIFS, we got following information when it occurred (See details in Link): UBIFS error (ubi0:0 pid 3640152): make_reservation [ubifs]: cannot reserve 112 bytes in jhead 2, error -28 CPU: 2 PID: 3640152 Comm: kworker/u16:2 Tainted: G B W Hardware name: Hisilicon PhosphorHi1230 EMU (DT) Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call trace: dump_stack+0x114/0x198 make_reservation+0x564/0x610 [ubifs] ubifs_jnl_write_data+0x328/0x48c [ubifs] do_writepage+0x2a8/0x3e4 [ubifs] ubifs_writepage+0x16c/0x374 [ubifs] generic_writepages+0xb4/0x114 do_writepages+0xcc/0x11c writeback_sb_inodes+0x2d0/0x564 wb_writeback+0x20c/0x2b4 wb_workfn+0x404/0x510 process_one_work+0x304/0x4ac worker_thread+0x31c/0x4e4 kthread+0x23c/0x290 Budgeting info: data budget sum 17576, total budget sum 17768 budg_data_growth 4144, budg_dd_growth 13432, budg_idx_growth 192 min_idx_lebs 13, old_idx_sz 988640, uncommitted_idx 0 page_budget 4144, inode_budget 160, dent_budget 312 nospace 0, nospace_rp 0 dark_wm 8192, dead_wm 4096, max_idx_node_sz 192 freeable_cnt 0, calc_idx_sz 988640, idx_gc_cnt 0 dirty_pg_cnt 4, dirty_zn_cnt 0, clean_zn_cnt 4811 gc_lnum 21, ihead_lnum 14 jhead 0 (GC) LEB 16 jhead 1 (base) LEB 34 jhead 2 (data) LEB 23 bud LEB 16 bud LEB 23 bud LEB 34 old bud LEB 33 old bud LEB 31 old bud LEB 15 commit state 4 Budgeting predictions: available: 33832, outstanding 17576, free 15356 (pid 3640152) start dumping LEB properties (pid 3640152) Lprops statistics: empty_lebs 3, idx_lebs 11 taken_empty_lebs 1, total_free 1253376, total_dirty 2445736 total_used 3438712, total_dark 65536, total_dead 17248 LEB 15 free 0 dirty 248000 used 5952 (taken) LEB 16 free 110592 dirty 896 used 142464 (taken, jhead 0 (GC)) LEB 21 free 253952 dirty 0 used 0 (taken, GC LEB) LEB 23 free 0 dirty 248104 used 5848 (taken, jhead 2 (data)) LEB 29 free 253952 dirty 0 used 0 (empty) LEB 33 free 0 dirty 253952 used 0 (taken) LEB 34 free 217088 dirty 36544 used 320 (taken, jhead 1 (base)) LEB 37 free 253952 dirty 0 used 0 (empty) OTHERS: index lebs, zero-available non-index lebs According to the budget algorithm, there are 5 LEBs reserved for budget: three journal heads(16,23,34), 1 GC LEB(21) and 1 deletion LEB(can be used in make_reservation()). There are 2 empty LEBs used for index nodes, which is calculated as min_idx_lebs - idx_lebs = 2. In theory, LEB 15 and 33 should be reclaimed as free state after committing, but it is now in taken state. After looking the realization of reserve_space(), there's a possible situation: LEB 15: free 2000 dirty 248000 used 3952 (jhead 2) LEB 23: free 2000 dirty 248104 used 3848 (bud, taken) LEB 33: free 2000 dirty 251952 used 0 (bud, taken) wb_workfn wb_workfn_2 do_writepage // write 3000 bytes ubifs_jnl_write_data make_reservation reserve_space ubifs_garbage_collect ubifs_find_dirty_leb // ret ENOSPC, dirty LEBs are taken nospc_retries++ // 1 ubifs_run_commit do_commit LEB 15: free 2000 dirty 248000 used 3952 (jhead 2) LEB 23: free 2000 dirty 248104 used 3848 (dirty) LEB 33: free 2000 dirty 251952 used 0 (dirty) do_writepage // write 2000 bytes for 3 times ubifs_jnl_write_data // grabs 15\23\33 LEB 15: free 0 dirty 248000 used 5952 (bud, taken) LEB 23: free 0 dirty 248104 used 5848 (jhead 2) LEB 33: free 0 dirty 253952 used 0 (bud, taken) reserve_space ubifs_garbage_collect ubifs_find_dirty_leb // ret ENOSPC, dirty LEBs are taken if (nospc_retries++ < 2) // false ubifs_ro_mode ! Fetch a reproducer in Link. The dirty LEBs could be grabbed by other threads, which fails finding dirty LEBs of GC in current thread, so make_reservation() could try many times to invoke GC&&committing, but current realization limits the times of retrying as 'nospc_retries'(twice). Fix it by adding a wait queue, start queuing up space reservation tasks when someone task has retried gc + commit for many times. Then there is only one task making space reservation at any time, and it can always make success under the premise of correct budgeting. Link: https://bugzilla.kernel.org/show_bug.cgi?id=218164 Fixes: 1e51764a3c2a ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Signed-off-by: Zhang Yi Signed-off-by: Richard Weinberger --- fs/ubifs/journal.c | 171 ++++++++++++++++++++++++++++++++++++++++++++++------- fs/ubifs/super.c | 2 + fs/ubifs/ubifs.h | 4 ++ 3 files changed, 155 insertions(+), 22 deletions(-) (limited to 'fs/ubifs') diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index f0a5538c84b0..74aee92433d7 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -292,6 +292,96 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len, return err; } +/** + * __queue_and_wait - queue a task and wait until the task is waked up. + * @c: UBIFS file-system description object + * + * This function adds current task in queue and waits until the task is waked + * up. This function should be called with @c->reserve_space_wq locked. + */ +static void __queue_and_wait(struct ubifs_info *c) +{ + DEFINE_WAIT(wait); + + __add_wait_queue_entry_tail_exclusive(&c->reserve_space_wq, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&c->reserve_space_wq.lock); + + schedule(); + finish_wait(&c->reserve_space_wq, &wait); +} + +/** + * wait_for_reservation - try queuing current task to wait until waked up. + * @c: UBIFS file-system description object + * + * This function queues current task to wait until waked up, if queuing is + * started(@c->need_wait_space is not %0). Returns %true if current task is + * added in queue, otherwise %false is returned. + */ +static bool wait_for_reservation(struct ubifs_info *c) +{ + if (likely(atomic_read(&c->need_wait_space) == 0)) + /* Quick path to check whether queuing is started. */ + return false; + + spin_lock(&c->reserve_space_wq.lock); + if (atomic_read(&c->need_wait_space) == 0) { + /* Queuing is not started, don't queue current task. */ + spin_unlock(&c->reserve_space_wq.lock); + return false; + } + + __queue_and_wait(c); + return true; +} + +/** + * wake_up_reservation - wake up first task in queue or stop queuing. + * @c: UBIFS file-system description object + * + * This function wakes up the first task in queue if it exists, or stops + * queuing if no tasks in queue. + */ +static void wake_up_reservation(struct ubifs_info *c) +{ + spin_lock(&c->reserve_space_wq.lock); + if (waitqueue_active(&c->reserve_space_wq)) + wake_up_locked(&c->reserve_space_wq); + else + /* + * Compared with wait_for_reservation(), set @c->need_wait_space + * under the protection of wait queue lock, which can avoid that + * @c->need_wait_space is set to 0 after new task queued. + */ + atomic_set(&c->need_wait_space, 0); + spin_unlock(&c->reserve_space_wq.lock); +} + +/** + * wake_up_reservation - add current task in queue or start queuing. + * @c: UBIFS file-system description object + * + * This function starts queuing if queuing is not started, otherwise adds + * current task in queue. + */ +static void add_or_start_queue(struct ubifs_info *c) +{ + spin_lock(&c->reserve_space_wq.lock); + if (atomic_cmpxchg(&c->need_wait_space, 0, 1) == 0) { + /* Starts queuing, task can go on directly. */ + spin_unlock(&c->reserve_space_wq.lock); + return; + } + + /* + * There are at least two tasks have retried more than 32 times + * at certain point, first task has started queuing, just queue + * the left tasks. + */ + __queue_and_wait(c); +} + /** * make_reservation - reserve journal space. * @c: UBIFS file-system description object @@ -311,33 +401,27 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len, static int make_reservation(struct ubifs_info *c, int jhead, int len) { int err, cmt_retries = 0, nospc_retries = 0; + bool blocked = wait_for_reservation(c); again: down_read(&c->commit_sem); err = reserve_space(c, jhead, len); - if (!err) + if (!err) { /* c->commit_sem will get released via finish_reservation(). */ - return 0; + goto out_wake_up; + } up_read(&c->commit_sem); if (err == -ENOSPC) { /* * GC could not make any progress. We should try to commit - * once because it could make some dirty space and GC would - * make progress, so make the error -EAGAIN so that the below + * because it could make some dirty space and GC would make + * progress, so make the error -EAGAIN so that the below * will commit and re-try. */ - if (nospc_retries++ < 2) { - dbg_jnl("no space, retry"); - err = -EAGAIN; - } - - /* - * This means that the budgeting is incorrect. We always have - * to be able to write to the media, because all operations are - * budgeted. Deletions are not budgeted, though, but we reserve - * an extra LEB for them. - */ + nospc_retries++; + dbg_jnl("no space, retry"); + err = -EAGAIN; } if (err != -EAGAIN) @@ -349,15 +433,37 @@ again: */ if (cmt_retries > 128) { /* - * This should not happen unless the journal size limitations - * are too tough. + * This should not happen unless: + * 1. The journal size limitations are too tough. + * 2. The budgeting is incorrect. We always have to be able to + * write to the media, because all operations are budgeted. + * Deletions are not budgeted, though, but we reserve an + * extra LEB for them. */ - ubifs_err(c, "stuck in space allocation"); + ubifs_err(c, "stuck in space allocation, nospc_retries %d", + nospc_retries); err = -ENOSPC; goto out; - } else if (cmt_retries > 32) - ubifs_warn(c, "too many space allocation re-tries (%d)", - cmt_retries); + } else if (cmt_retries > 32) { + /* + * It's almost impossible to happen, unless there are many tasks + * making reservation concurrently and someone task has retried + * gc + commit for many times, generated available space during + * this period are grabbed by other tasks. + * But if it happens, start queuing up all tasks that will make + * space reservation, then there is only one task making space + * reservation at any time, and it can always make success under + * the premise of correct budgeting. + */ + ubifs_warn(c, "too many space allocation cmt_retries (%d) " + "nospc_retries (%d), start queuing tasks", + cmt_retries, nospc_retries); + + if (!blocked) { + blocked = true; + add_or_start_queue(c); + } + } dbg_jnl("-EAGAIN, commit and retry (retried %d times)", cmt_retries); @@ -365,7 +471,7 @@ again: err = ubifs_run_commit(c); if (err) - return err; + goto out_wake_up; goto again; out: @@ -380,6 +486,27 @@ out: cmt_retries = dbg_check_lprops(c); up_write(&c->commit_sem); } +out_wake_up: + if (blocked) { + /* + * Only tasks that have ever started queuing or ever been queued + * can wake up other queued tasks, which can make sure that + * there is only one task waked up to make space reservation. + * For example: + * task A task B task C + * make_reservation make_reservation + * reserve_space // 0 + * wake_up_reservation + * atomic_cmpxchg // 0, start queuing + * reserve_space + * wait_for_reservation + * __queue_and_wait + * add_wait_queue + * if (blocked) // false + * // So that task C won't be waked up to race with task B + */ + wake_up_reservation(c); + } return err; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 09e270d6ed02..571c9dc92b94 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2151,6 +2151,8 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) mutex_init(&c->bu_mutex); mutex_init(&c->write_reserve_mutex); init_waitqueue_head(&c->cmt_wq); + init_waitqueue_head(&c->reserve_space_wq); + atomic_set(&c->need_wait_space, 0); c->buds = RB_ROOT; c->old_idx = RB_ROOT; c->size_tree = RB_ROOT; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 6eba287ae66c..1f3ea879d93a 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1047,6 +1047,8 @@ struct ubifs_debug_info; * @bg_bud_bytes: number of bud bytes when background commit is initiated * @old_buds: buds to be released after commit ends * @max_bud_cnt: maximum number of buds + * @need_wait_space: Non %0 means space reservation tasks need to wait in queue + * @reserve_space_wq: wait queue to sleep on if @need_wait_space is not %0 * * @commit_sem: synchronizes committer with other processes * @cmt_state: commit state @@ -1305,6 +1307,8 @@ struct ubifs_info { long long bg_bud_bytes; struct list_head old_buds; int max_bud_cnt; + atomic_t need_wait_space; + wait_queue_head_t reserve_space_wq; struct rw_semaphore commit_sem; int cmt_state; -- cgit v1.2.3