From 4d5a0565cebf12c2ef854e4ac1961f13a710a950 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 30 Apr 2012 11:17:25 +0200 Subject: Btrfs: remove call to btrfs_header_nritems with no effect This is a leftover from cleanup patch 559af821. Before the cleanup, btrfs_header_nritems was called inside an if condition. As it has no side effects we need to preserve here, it should simply be dropped. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d7a96cfdc50a..836e4e03edca 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1650,8 +1650,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - btrfs_header_nritems(mid); - left = read_node_slot(root, parent, pslot - 1); if (left) { btrfs_tree_lock(left); @@ -1681,7 +1679,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, wret = push_node_left(trans, root, left, mid, 1); if (wret < 0) ret = wret; - btrfs_header_nritems(mid); } /* -- cgit v1.2.3 From f617e2fd52484fb74236a597d0f9068ec7d9d2dd Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 14 Jun 2012 16:10:13 +0200 Subject: Btrfs: remove obsolete btrfs_next_leaf call from __resolve_indirect_ref When resolving indirect refs, we used to call btrfs_next_leaf in case we didn't find an exact match. While we should find exact matches most of the time, in case we don't, we must continue searching. Treating those matches differently depending on the level we're searching doesn't make sense. Even worse, we might end up searching for a key larger than the largest, in which case there is no next_leaf and subsequent jobs would fail. This commit drops the bogous lines. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 3f75895c919b..579131de1b3b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -294,16 +294,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - if (level == 0) { - if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret) - goto out; - eb = path->nodes[0]; - } - + if (level == 0) btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - } ret = add_all_parents(root, path, parents, level, &key, ref->wanted_disk_byte, extent_item_pos); -- cgit v1.2.3 From 8ba97a15e7d4f70b9af71fa1db86a28dd17ad1b2 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 4 Jun 2012 16:54:57 +0200 Subject: Btrfs: use btrfs_read_lock_root_node in get_old_root get_old_root could race with root node updates because we weren't locking the node early enough. Use btrfs_read_lock_root_node to grab the root locked in the very beginning and release the lock as soon as possible (just like btrfs_search_slot does). Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 836e4e03edca..2cde7b0a0106 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1143,6 +1143,13 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, return eb_rewin; } +/* + * get_old_root() rewinds the state of @root's root node to the given @time_seq + * value. If there are no changes, the current root->root_node is returned. If + * anything changed in between, there's a fresh buffer allocated on which the + * rewind operations are done. In any case, the returned buffer is read locked. + * Returns NULL on error (with no locks held). + */ static inline struct extent_buffer * get_old_root(struct btrfs_root *root, u64 time_seq) { @@ -1151,6 +1158,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) struct tree_mod_root *old_root; u64 old_generation; + eb = btrfs_read_lock_root_node(root); tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); if (!tm) return root->node; @@ -1173,15 +1181,21 @@ get_old_root(struct btrfs_root *root, u64 time_seq) /* there's a root replace operation for the current root */ eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT, root->nodesize); + } + btrfs_tree_read_unlock(root->node); + free_extent_buffer(root->node); + if (!eb) + return NULL; + btrfs_tree_read_lock(eb); + if (old_root->logical != root->node->start) { btrfs_set_header_bytenr(eb, eb->start); btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(eb, root->root_key.objectid); } - if (!eb) - return NULL; btrfs_set_header_level(eb, old_root->level); btrfs_set_header_generation(eb, old_generation); __tree_mod_log_rewind(eb, time_seq, tm); + extent_buffer_get(eb); return eb; } @@ -2612,9 +2626,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, again: b = get_old_root(root, time_seq); - extent_buffer_get(b); level = btrfs_header_level(b); - btrfs_tree_read_lock(b); p->locks[level] = BTRFS_READ_LOCK; while (b) { -- cgit v1.2.3 From a95236d99fa56766f11056903439f55fe5038bcf Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Tue, 5 Jun 2012 16:41:24 +0200 Subject: Btrfs: fix return value for __tree_mod_log_oldest_root In __tree_mod_log_oldest_root() we must return the found operation even if it's not a ROOT_REPLACE operation. Otherwise, the caller assumes that there are no operations to be rewinded and returns immediately. The code in the caller is modified to improve readability. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2cde7b0a0106..50d7c99ddce7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1023,6 +1023,10 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, looped = 1; } + /* if there's no old root to return, return what we found instead */ + if (!found) + found = tm; + return found; } @@ -1155,18 +1159,24 @@ get_old_root(struct btrfs_root *root, u64 time_seq) { struct tree_mod_elem *tm; struct extent_buffer *eb; - struct tree_mod_root *old_root; + struct tree_mod_root *old_root = NULL; u64 old_generation; + u64 logical; eb = btrfs_read_lock_root_node(root); tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); if (!tm) return root->node; - old_root = &tm->old_root; - old_generation = tm->generation; + if (tm->op == MOD_LOG_ROOT_REPLACE) { + old_root = &tm->old_root; + old_generation = tm->generation; + logical = old_root->logical; + } else { + logical = root->node->start; + } - tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq); + tm = tree_mod_log_search(root->fs_info, logical, time_seq); /* * there was an item in the log when __tree_mod_log_oldest_root * returned. this one must not go away, because the time_seq passed to @@ -1174,26 +1184,23 @@ get_old_root(struct btrfs_root *root, u64 time_seq) */ BUG_ON(!tm); - if (old_root->logical == root->node->start) { - /* there are logged operations for the current root */ - eb = btrfs_clone_extent_buffer(root->node); - } else { - /* there's a root replace operation for the current root */ + if (old_root) eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT, root->nodesize); - } + else + eb = btrfs_clone_extent_buffer(root->node); btrfs_tree_read_unlock(root->node); free_extent_buffer(root->node); if (!eb) return NULL; btrfs_tree_read_lock(eb); - if (old_root->logical != root->node->start) { + if (old_root) { btrfs_set_header_bytenr(eb, eb->start); btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(eb, root->root_key.objectid); + btrfs_set_header_level(eb, old_root->level); + btrfs_set_header_generation(eb, old_generation); } - btrfs_set_header_level(eb, old_root->level); - btrfs_set_header_generation(eb, old_generation); __tree_mod_log_rewind(eb, time_seq, tm); extent_buffer_get(eb); -- cgit v1.2.3 From 3d7806eca43e73a9721d2e09369200ed93036bd0 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 11 Jun 2012 08:29:29 +0200 Subject: Btrfs: add btrfs_next_old_leaf To make sense of the tree mod log, the backref walker not only needs btrfs_search_old_slot, but it also called btrfs_next_leaf, which in turn was calling btrfs_search_slot. This obviously didn't give the correct result. This commit adds btrfs_next_old_leaf, a drop-in replacement for btrfs_next_leaf with a time_seq parameter. If it is zero, it behaves exactly like btrfs_next_leaf. If it is non-zero, it will use btrfs_search_old_slot with this time_seq parameter. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 7 ++++--- fs/btrfs/ctree.c | 11 ++++++++++- fs/btrfs/ctree.h | 2 ++ 3 files changed, 16 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 579131de1b3b..8f7d1237b7a0 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -179,7 +179,8 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, int level, - struct btrfs_key *key, u64 wanted_disk_byte, + struct btrfs_key *key, u64 time_seq, + u64 wanted_disk_byte, const u64 *extent_item_pos) { int ret; @@ -212,7 +213,7 @@ add_parent: */ while (1) { eie = NULL; - ret = btrfs_next_leaf(root, path); + ret = btrfs_next_old_leaf(root, path, time_seq); if (ret < 0) return ret; if (ret) @@ -297,7 +298,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (level == 0) btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - ret = add_all_parents(root, path, parents, level, &key, + ret = add_all_parents(root, path, parents, level, &key, time_seq, ref->wanted_disk_byte, extent_item_pos); out: btrfs_free_path(path); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 50d7c99ddce7..cb76b2a1b908 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5016,6 +5016,12 @@ next: * returns < 0 on io errors. */ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + return btrfs_next_old_leaf(root, path, 0); +} + +int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, + u64 time_seq) { int slot; int level; @@ -5041,7 +5047,10 @@ again: path->keep_locks = 1; path->leave_spinning = 1; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (time_seq) + ret = btrfs_search_old_slot(root, &key, path, time_seq); + else + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); path->keep_locks = 0; if (ret < 0) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0151ca1ac657..db15e9e7b91c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2753,6 +2753,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, + u64 time_seq); static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { ++p->slots[0]; -- cgit v1.2.3 From 3310c36eef330766fd23d50796287ad764929e24 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 11 Jun 2012 10:52:38 +0200 Subject: Btrfs: fix race in tree mod log addition When adding to the tree modification log, we grab two locks at different stages. We must not drop the outer lock until we're done with section protected by the inner lock. This moves the unlock call for the outer lock to the appropriate position. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cb76b2a1b908..04b06bcf2ca9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -467,6 +467,15 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, return 0; } +/* + * This allocates memory and gets a tree modification sequence number when + * needed. + * + * Returns 0 when no sequence number is needed, < 0 on error. + * Returns 1 when a sequence number was added. In this case, + * fs_info->tree_mod_seq_lock was acquired and must be released by the caller + * after inserting into the rb tree. + */ static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, struct tree_mod_elem **tm_ret) { @@ -491,11 +500,11 @@ static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, */ kfree(tm); seq = 0; + spin_unlock(&fs_info->tree_mod_seq_lock); } else { __get_tree_mod_seq(fs_info, &tm->elem); seq = tm->elem.seq; } - spin_unlock(&fs_info->tree_mod_seq_lock); return seq; } @@ -521,7 +530,9 @@ tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, tm->slot = slot; tm->generation = btrfs_node_ptr_generation(eb, slot); - return __tree_mod_log_insert(fs_info, tm); + ret = __tree_mod_log_insert(fs_info, tm); + spin_unlock(&fs_info->tree_mod_seq_lock); + return ret; } static noinline int @@ -559,7 +570,9 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, tm->move.nr_items = nr_items; tm->op = MOD_LOG_MOVE_KEYS; - return __tree_mod_log_insert(fs_info, tm); + ret = __tree_mod_log_insert(fs_info, tm); + spin_unlock(&fs_info->tree_mod_seq_lock); + return ret; } static noinline int @@ -580,7 +593,9 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, tm->generation = btrfs_header_generation(old_root); tm->op = MOD_LOG_ROOT_REPLACE; - return __tree_mod_log_insert(fs_info, tm); + ret = __tree_mod_log_insert(fs_info, tm); + spin_unlock(&fs_info->tree_mod_seq_lock); + return ret; } static struct tree_mod_elem * -- cgit v1.2.3 From beb42dd793193a3d4e72970bfa73cd8810f63cea Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 30 May 2012 15:35:17 -0400 Subject: Btrfs: pass locked_page into extent_clear_unlock_delalloc if theres an error While doing my enospc work I got a transaction abortion that resulted in a panic when we tried to unlock_page() an already unlocked page. This is because we aren't calling extent_clear_unlock_delalloc with the locked page so it was unlocking all the pages in the range. This is wrong since __extent_writepage expects to have the page locked still unless we return *page_started as 1. This should keep us from panicing. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 92df0a5d1d94..ccd6355af73f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -830,7 +830,7 @@ static noinline int cow_file_range(struct inode *inode, if (IS_ERR(trans)) { extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - start, end, NULL, + start, end, locked_page, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | @@ -963,7 +963,7 @@ out: out_unlock: extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - start, end, NULL, + start, end, locked_page, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | -- cgit v1.2.3 From b939d1ab76b4aa816343cdbd8b44ce9929a3b9ef Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 31 May 2012 11:06:33 -0400 Subject: Btrfs: fix locking in btrfs_destroy_delayed_refs The transaction abort stuff was throwing warnings from the list debugging code because we do a list_del_init outside of the delayed_refs spin lock. The delayed refs locking makes baby Jesus cry so it's not hard to get wrong, but we need to take the ref head mutex to make sure it's not being processed currently, and so if it is we need to drop the spin lock and then take and drop the mutex and do the search again. If we can take the mutex then we can safely remove the head from the list and carry on. Now when the transaction aborts I don't get the list debugging warnings. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b99d5127ba18..c79ddc756081 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3400,7 +3400,6 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, delayed_refs = &trans->delayed_refs; -again: spin_lock(&delayed_refs->lock); if (delayed_refs->num_entries == 0) { spin_unlock(&delayed_refs->lock); @@ -3408,31 +3407,36 @@ again: return ret; } - node = rb_first(&delayed_refs->root); - while (node) { + while ((node = rb_first(&delayed_refs->root)) != NULL) { ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_next(node); - - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - delayed_refs->num_entries--; atomic_set(&ref->refs, 1); if (btrfs_delayed_ref_is_head(ref)) { struct btrfs_delayed_ref_head *head; head = btrfs_delayed_node_to_head(ref); - spin_unlock(&delayed_refs->lock); - mutex_lock(&head->mutex); + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&ref->refs); + spin_unlock(&delayed_refs->lock); + + /* Need to wait for the delayed ref to run */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(ref); + + continue; + } + kfree(head->extent_op); delayed_refs->num_heads--; if (list_empty(&head->cluster)) delayed_refs->num_heads_ready--; list_del_init(&head->cluster); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - goto again; } + ref->in_tree = 0; + rb_erase(&ref->rb_node, &delayed_refs->root); + delayed_refs->num_entries--; + spin_unlock(&delayed_refs->lock); btrfs_put_delayed_ref(ref); -- cgit v1.2.3 From d7096fc3ef8360f30f228344bc564d4f97d8a158 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 31 May 2012 15:49:57 -0400 Subject: Btrfs: wake up transaction waiters when aborting a transaction I was getting lots of hung tasks and a NULL pointer dereference because we are not cleaning up the transaction properly when it aborts. First we need to reset the running_transaction to NULL so we don't get a bad dereference for any start_transaction callers after this. Also we cannot rely on waitqueue_active() since it's just a list_empty(), so just call wake_up() directly since that will do the barrier for us and such. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 9 +++------ fs/btrfs/transaction.c | 4 ++++ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c79ddc756081..19b4db70dcb1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3589,16 +3589,13 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, /* FIXME: cleanup wait for commit */ cur_trans->in_commit = 1; cur_trans->blocked = 1; - if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) - wake_up(&root->fs_info->transaction_blocked_wait); + wake_up(&root->fs_info->transaction_blocked_wait); cur_trans->blocked = 0; - if (waitqueue_active(&root->fs_info->transaction_wait)) - wake_up(&root->fs_info->transaction_wait); + wake_up(&root->fs_info->transaction_wait); cur_trans->commit_done = 1; - if (waitqueue_active(&cur_trans->commit_wait)) - wake_up(&cur_trans->commit_wait); + wake_up(&cur_trans->commit_wait); btrfs_destroy_pending_snapshots(cur_trans); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1791c6e3d834..59e0203bfb95 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1221,6 +1221,10 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->trans_lock); list_del_init(&cur_trans->list); + if (cur_trans == root->fs_info->running_transaction) { + root->fs_info->running_transaction = NULL; + root->fs_info->trans_no_join = 0; + } spin_unlock(&root->fs_info->trans_lock); btrfs_cleanup_one_transaction(trans->transaction, root); -- cgit v1.2.3 From 7b8b92af58db347de64a237861fcf13374b34a9c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 31 May 2012 15:52:43 -0400 Subject: Btrfs: abort the transaction if the commit fails If a transaction commit fails we don't abort it so we don't set an error on the file system. This patch fixes that by actually calling the abort stuff and then adding a check for a fs error in the transaction start stuff to make sure it is caught properly. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 59e0203bfb95..b72b068183ec 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -100,6 +100,10 @@ loop: kmem_cache_free(btrfs_transaction_cachep, cur_trans); cur_trans = fs_info->running_transaction; goto loop; + } else if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + spin_unlock(&root->fs_info->trans_lock); + kmem_cache_free(btrfs_transaction_cachep, cur_trans); + return -EROFS; } atomic_set(&cur_trans->num_writers, 1); @@ -1213,12 +1217,14 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, static void cleanup_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, int err) { struct btrfs_transaction *cur_trans = trans->transaction; WARN_ON(trans->use_count > 1); + btrfs_abort_transaction(trans, root, err); + spin_lock(&root->fs_info->trans_lock); list_del_init(&cur_trans->list); if (cur_trans == root->fs_info->running_transaction) { @@ -1530,7 +1536,7 @@ cleanup_transaction: // WARN_ON(1); if (current->journal_info == trans) current->journal_info = NULL; - cleanup_transaction(trans, root); + cleanup_transaction(trans, root, ret); return ret; } -- cgit v1.2.3 From ee670f0af35871edb492db5ba406cef36d1b7c21 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 31 May 2012 15:54:30 -0400 Subject: Btrfs: fix btrfs_destroy_marked_extents So we're forcing the eb's to have their ref count set to 1 so invalidatepage works but this breaks lots of things, for example root nodes, and is just plain wrong, we don't need to just evict all of this stuff. Also drop the invalidatepage altogether and add a page_cache_release(). With this patch we no longer hang when trying to access the root nodes after an aborted transaction and we no longer leak memory. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 19b4db70dcb1..5a3bf323e2bd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3524,11 +3524,9 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, &(&BTRFS_I(page->mapping->host)->io_tree)->buffer, offset >> PAGE_CACHE_SHIFT); spin_unlock(&dirty_pages->buffer_lock); - if (eb) { + if (eb) ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - atomic_set(&eb->refs, 1); - } if (PageWriteback(page)) end_page_writeback(page); @@ -3542,8 +3540,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, spin_unlock_irq(&page->mapping->tree_lock); } - page->mapping->a_ops->invalidatepage(page, 0); unlock_page(page); + page_cache_release(page); } } -- cgit v1.2.3 From 17ca04aff7e6171df684b7b65804df8830eb8c15 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 31 May 2012 15:58:55 -0400 Subject: Btrfs: unlock everything properly in the error case for nocow I was getting hung on umount when a transaction was aborted because a range of one of the free space inodes was still locked. This is because the nocow stuff doesn't unlock anything on error. This fixed the problem and I verified that is what was happening. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ccd6355af73f..b7f398c36cb7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1136,8 +1136,18 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); - if (!path) + if (!path) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, locked_page, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); return -ENOMEM; + } nolock = btrfs_is_free_space_inode(root, inode); @@ -1147,6 +1157,15 @@ static noinline int run_delalloc_nocow(struct inode *inode, trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, locked_page, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); btrfs_free_path(path); return PTR_ERR(trans); } @@ -1327,8 +1346,11 @@ out_check: } btrfs_release_path(path); - if (cur_offset <= end && cow_start == (u64)-1) + if (cur_offset <= end && cow_start == (u64)-1) { cow_start = cur_offset; + cur_offset = end; + } + if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, end, page_started, nr_written, 1); @@ -1347,6 +1369,17 @@ error: if (!ret) ret = err; + if (ret && cur_offset < end) + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + cur_offset, end, locked_page, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + btrfs_free_path(path); return ret; } -- cgit v1.2.3 From 606686eeac4550d2212bf3d621a810407ef5e9bf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 4 Jun 2012 14:03:51 -0400 Subject: Btrfs: use rcu to protect device->name Al pointed out that we can just toss out the old name on a device and add a new one arbitrarily, so anybody who uses device->name in printk could possibly use free'd memory. Instead of adding locking around all of this he suggested doing it with RCU, so I've introduced a struct rcu_string that does just that and have gone through and protected all accesses to device->name that aren't under the uuid_mutex with rcu_read_lock(). This protects us and I will use it for dealing with removing the device that we used to mount the file system in a later patch. Thanks, Reviewed-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/check-integrity.c | 16 ++++---- fs/btrfs/disk-io.c | 10 +++-- fs/btrfs/extent_io.c | 7 ++-- fs/btrfs/ioctl.c | 13 +++++-- fs/btrfs/rcu-string.h | 56 ++++++++++++++++++++++++++++ fs/btrfs/scrub.c | 30 +++++++++------ fs/btrfs/volumes.c | 92 +++++++++++++++++++++++++++++----------------- fs/btrfs/volumes.h | 2 +- 8 files changed, 162 insertions(+), 64 deletions(-) create mode 100644 fs/btrfs/rcu-string.h (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 9cebb1fd6a3c..da6e9364a5e3 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -93,6 +93,7 @@ #include "print-tree.h" #include "locking.h" #include "check-integrity.h" +#include "rcu-string.h" #define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 #define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 @@ -843,13 +844,14 @@ static int btrfsic_process_superblock_dev_mirror( superblock_tmp->never_written = 0; superblock_tmp->mirror_num = 1 + superblock_mirror_num; if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - printk(KERN_INFO "New initial S-block (bdev %p, %s)" - " @%llu (%s/%llu/%d)\n", - superblock_bdev, device->name, - (unsigned long long)dev_bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - superblock_mirror_num); + printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)" + " @%llu (%s/%llu/%d)\n", + superblock_bdev, + rcu_str_deref(device->name), + (unsigned long long)dev_bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + superblock_mirror_num); list_add(&superblock_tmp->all_blocks_node, &state->all_blocks_list); btrfsic_block_hashtable_add(superblock_tmp, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5a3bf323e2bd..1c9664b16cd1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -44,6 +44,7 @@ #include "free-space-cache.h" #include "inode-map.h" #include "check-integrity.h" +#include "rcu-string.h" static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -2575,8 +2576,9 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) struct btrfs_device *device = (struct btrfs_device *) bh->b_private; - printk_ratelimited(KERN_WARNING "lost page write due to " - "I/O error on %s\n", device->name); + printk_ratelimited_in_rcu(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + rcu_str_deref(device->name)); /* note, we dont' set_buffer_write_io_error because we have * our own ways of dealing with the IO errors */ @@ -2749,8 +2751,8 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - printk("btrfs: disabling barriers on dev %s\n", - device->name); + printk_in_rcu("btrfs: disabling barriers on dev %s\n", + rcu_str_deref(device->name)); device->nobarriers = 1; } if (!bio_flagged(bio, BIO_UPTODATE)) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2c8f7b204617..aaa12c1eb348 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -20,6 +20,7 @@ #include "volumes.h" #include "check-integrity.h" #include "locking.h" +#include "rcu-string.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -1917,9 +1918,9 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, return -EIO; } - printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s " - "sector %llu)\n", page->mapping->host->i_ino, start, - dev->name, sector); + printk_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " + "(dev %s sector %llu)\n", page->mapping->host->i_ino, + start, rcu_str_deref(dev->name), sector); bio_put(bio); return 0; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 24b776c08d99..c5254dde1f0f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -52,6 +52,7 @@ #include "locking.h" #include "inode-map.h" #include "backref.h" +#include "rcu-string.h" /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -1345,8 +1346,9 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, do_div(new_size, root->sectorsize); new_size *= root->sectorsize; - printk(KERN_INFO "btrfs: new size for %s is %llu\n", - device->name, (unsigned long long)new_size); + printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n", + rcu_str_deref(device->name), + (unsigned long long)new_size); if (new_size > old_size) { trans = btrfs_start_transaction(root, 0); @@ -2264,7 +2266,12 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) di_args->total_bytes = dev->total_bytes; memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); if (dev->name) { - strncpy(di_args->path, dev->name, sizeof(di_args->path)); + struct rcu_string *name; + + rcu_read_lock(); + name = rcu_dereference(dev->name); + strncpy(di_args->path, name->str, sizeof(di_args->path)); + rcu_read_unlock(); di_args->path[sizeof(di_args->path) - 1] = 0; } else { di_args->path[0] = '\0'; diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h new file mode 100644 index 000000000000..9e111e4576d4 --- /dev/null +++ b/fs/btrfs/rcu-string.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2012 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +struct rcu_string { + struct rcu_head rcu; + char str[0]; +}; + +static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) +{ + size_t len = strlen(src) + 1; + struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) + + (len * sizeof(char)), mask); + if (!ret) + return ret; + strncpy(ret->str, src, len); + return ret; +} + +static inline void rcu_string_free(struct rcu_string *str) +{ + if (str) + kfree_rcu(str, rcu); +} + +#define printk_in_rcu(fmt, ...) do { \ + rcu_read_lock(); \ + printk(fmt, __VA_ARGS__); \ + rcu_read_unlock(); \ +} while (0) + +#define printk_ratelimited_in_rcu(fmt, ...) do { \ + rcu_read_lock(); \ + printk_ratelimited(fmt, __VA_ARGS__); \ + rcu_read_unlock(); \ +} while (0) + +#define rcu_str_deref(rcu_str) ({ \ + struct rcu_string *__str = rcu_dereference(rcu_str); \ + __str->str; \ +}) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a38cfa4f251e..b223620cd5a6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -26,6 +26,7 @@ #include "backref.h" #include "extent_io.h" #include "check-integrity.h" +#include "rcu-string.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -320,10 +321,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) - printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu, " "length %llu, links %u (path: %s)\n", swarn->errstr, - swarn->logical, swarn->dev->name, + swarn->logical, rcu_str_deref(swarn->dev->name), (unsigned long long)swarn->sector, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, (char *)(unsigned long)ipath->fspath->val[i]); @@ -332,10 +333,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) return 0; err: - printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu: path " "resolving failed with ret=%d\n", swarn->errstr, - swarn->logical, swarn->dev->name, + swarn->logical, rcu_str_deref(swarn->dev->name), (unsigned long long)swarn->sector, root, inum, offset, ret); free_ipath(ipath); @@ -390,10 +391,11 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) do { ret = tree_backref_for_extent(&ptr, eb, ei, item_size, &ref_root, &ref_level); - printk(KERN_WARNING + printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " - "%llu\n", errstr, swarn.logical, dev->name, + "%llu\n", errstr, swarn.logical, + rcu_str_deref(dev->name), (unsigned long long)swarn.sector, ref_level ? "node" : "leaf", ret < 0 ? -1 : ref_level, @@ -580,9 +582,11 @@ out: spin_lock(&sdev->stat_lock); ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR + + printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", - (unsigned long long)fixup->logical, sdev->dev->name); + (unsigned long long)fixup->logical, + rcu_str_deref(sdev->dev->name)); } btrfs_free_path(path); @@ -936,18 +940,20 @@ corrected_error: spin_lock(&sdev->stat_lock); sdev->stat.corrected_errors++; spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR + printk_ratelimited_in_rcu(KERN_ERR "btrfs: fixed up error at logical %llu on dev %s\n", - (unsigned long long)logical, sdev->dev->name); + (unsigned long long)logical, + rcu_str_deref(sdev->dev->name)); } } else { did_not_correct_error: spin_lock(&sdev->stat_lock); sdev->stat.uncorrectable_errors++; spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR + printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", - (unsigned long long)logical, sdev->dev->name); + (unsigned long long)logical, + rcu_str_deref(sdev->dev->name)); } out: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7782020996fe..8a3d2594b807 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -35,6 +35,7 @@ #include "volumes.h" #include "async-thread.h" #include "check-integrity.h" +#include "rcu-string.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -64,7 +65,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) device = list_entry(fs_devices->devices.next, struct btrfs_device, dev_list); list_del(&device->dev_list); - kfree(device->name); + rcu_string_free(device->name); kfree(device); } kfree(fs_devices); @@ -334,8 +335,8 @@ static noinline int device_list_add(const char *path, { struct btrfs_device *device; struct btrfs_fs_devices *fs_devices; + struct rcu_string *name; u64 found_transid = btrfs_super_generation(disk_super); - char *name; fs_devices = find_fsid(disk_super->fsid); if (!fs_devices) { @@ -369,11 +370,13 @@ static noinline int device_list_add(const char *path, memcpy(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); spin_lock_init(&device->io_lock); - device->name = kstrdup(path, GFP_NOFS); - if (!device->name) { + + name = rcu_string_strdup(path, GFP_NOFS); + if (!name) { kfree(device); return -ENOMEM; } + rcu_assign_pointer(device->name, name); INIT_LIST_HEAD(&device->dev_alloc_list); /* init readahead state */ @@ -390,12 +393,12 @@ static noinline int device_list_add(const char *path, device->fs_devices = fs_devices; fs_devices->num_devices++; - } else if (!device->name || strcmp(device->name, path)) { - name = kstrdup(path, GFP_NOFS); + } else if (!device->name || strcmp(device->name->str, path)) { + name = rcu_string_strdup(path, GFP_NOFS); if (!name) return -ENOMEM; - kfree(device->name); - device->name = name; + rcu_string_free(device->name); + rcu_assign_pointer(device->name, name); if (device->missing) { fs_devices->missing_devices--; device->missing = 0; @@ -430,15 +433,22 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) /* We have held the volume lock, it is safe to get the devices. */ list_for_each_entry(orig_dev, &orig->devices, dev_list) { + struct rcu_string *name; + device = kzalloc(sizeof(*device), GFP_NOFS); if (!device) goto error; - device->name = kstrdup(orig_dev->name, GFP_NOFS); - if (!device->name) { + /* + * This is ok to do without rcu read locked because we hold the + * uuid mutex so nothing we touch in here is going to disappear. + */ + name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); + if (!name) { kfree(device); goto error; } + rcu_assign_pointer(device->name, name); device->devid = orig_dev->devid; device->work.func = pending_bios_fn; @@ -491,7 +501,7 @@ again: } list_del_init(&device->dev_list); fs_devices->num_devices--; - kfree(device->name); + rcu_string_free(device->name); kfree(device); } @@ -516,7 +526,7 @@ static void __free_device(struct work_struct *work) if (device->bdev) blkdev_put(device->bdev, device->mode); - kfree(device->name); + rcu_string_free(device->name); kfree(device); } @@ -540,6 +550,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { struct btrfs_device *new_device; + struct rcu_string *name; if (device->bdev) fs_devices->open_devices--; @@ -555,8 +566,11 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) new_device = kmalloc(sizeof(*new_device), GFP_NOFS); BUG_ON(!new_device); /* -ENOMEM */ memcpy(new_device, device, sizeof(*new_device)); - new_device->name = kstrdup(device->name, GFP_NOFS); - BUG_ON(device->name && !new_device->name); /* -ENOMEM */ + + /* Safe because we are under uuid_mutex */ + name = rcu_string_strdup(device->name->str, GFP_NOFS); + BUG_ON(device->name && !name); /* -ENOMEM */ + rcu_assign_pointer(new_device->name, name); new_device->bdev = NULL; new_device->writeable = 0; new_device->in_fs_metadata = 0; @@ -621,9 +635,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, if (!device->name) continue; - bdev = blkdev_get_by_path(device->name, flags, holder); + bdev = blkdev_get_by_path(device->name->str, flags, holder); if (IS_ERR(bdev)) { - printk(KERN_INFO "open %s failed\n", device->name); + printk(KERN_INFO "open %s failed\n", device->name->str); goto error; } filemap_write_and_wait(bdev->bd_inode->i_mapping); @@ -1632,6 +1646,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) struct block_device *bdev; struct list_head *devices; struct super_block *sb = root->fs_info->sb; + struct rcu_string *name; u64 total_bytes; int seeding_dev = 0; int ret = 0; @@ -1671,23 +1686,24 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) goto error; } - device->name = kstrdup(device_path, GFP_NOFS); - if (!device->name) { + name = rcu_string_strdup(device_path, GFP_NOFS); + if (!name) { kfree(device); ret = -ENOMEM; goto error; } + rcu_assign_pointer(device->name, name); ret = find_next_devid(root, &device->devid); if (ret) { - kfree(device->name); + rcu_string_free(device->name); kfree(device); goto error; } trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { - kfree(device->name); + rcu_string_free(device->name); kfree(device); ret = PTR_ERR(trans); goto error; @@ -1796,7 +1812,7 @@ error_trans: unlock_chunks(root); btrfs_abort_transaction(trans, root, ret); btrfs_end_transaction(trans, root); - kfree(device->name); + rcu_string_free(device->name); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -4204,10 +4220,17 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; dev = bbio->stripes[dev_nr].dev; if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { +#ifdef DEBUG + struct rcu_string *name; + + rcu_read_lock(); + name = rcu_dereference(dev->name); pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " "(%s id %llu), size=%u\n", rw, (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, - dev->name, dev->devid, bio->bi_size); + name->str, dev->devid, bio->bi_size); + rcu_read_unlock(); +#endif bio->bi_bdev = dev->bdev; if (async_submit) schedule_bio(root, dev, rw, bio); @@ -4694,8 +4717,9 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) key.offset = device->devid; ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); if (ret) { - printk(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n", - device->name, (unsigned long long)device->devid); + printk_in_rcu(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n", + rcu_str_deref(device->name), + (unsigned long long)device->devid); __btrfs_reset_dev_stats(device); device->dev_stats_valid = 1; btrfs_release_path(path); @@ -4747,8 +4771,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, BUG_ON(!path); ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - printk(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n", - ret, device->name); + printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n", + ret, rcu_str_deref(device->name)); goto out; } @@ -4757,8 +4781,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - printk(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n", - device->name, ret); + printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n", + rcu_str_deref(device->name), ret); goto out; } ret = 1; @@ -4770,8 +4794,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - printk(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n", - device->name, ret); + printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n", + rcu_str_deref(device->name), ret); goto out; } } @@ -4823,9 +4847,9 @@ void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) { if (!dev->dev_stats_valid) return; - printk_ratelimited(KERN_ERR + printk_ratelimited_in_rcu(KERN_ERR "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", - dev->name, + rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), @@ -4837,8 +4861,8 @@ void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) { - printk(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", - dev->name, + printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 3406a88ca83e..74366f27a76b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -58,7 +58,7 @@ struct btrfs_device { /* the mode sent to blkdev_get */ fmode_t mode; - char *name; + struct rcu_string *name; /* the internal btrfs device id */ u64 devid; -- cgit v1.2.3 From 9c5085c147989d48dfe74194b48affc23f376650 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 5 Jun 2012 14:13:12 -0400 Subject: Btrfs: implement ->show_devname Because btrfs can remove the device that was mounted we need to have a ->show_devname so that in this case we can print out some other device in the file system to /proc/mount. So if there are multiple devices in a btrfs file system we will just print the device with the lowest devid that we can find. This will make everything consistent and deal with device removal properly. The drawback is if you mount with a device that is higher than the lowest devicd it won't show up as the mounted device in /proc/mounts, but this is a small price to pay. This was inspired by Miao Xie's patch. Thanks, Reviewed-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 96eb9fef7bd2..0eb9a4da069e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -54,6 +54,7 @@ #include "version.h" #include "export.h" #include "compression.h" +#include "rcu-string.h" #define CREATE_TRACE_POINTS #include @@ -1482,12 +1483,44 @@ static void btrfs_fs_dirty_inode(struct inode *inode, int flags) "error %d\n", btrfs_ino(inode), ret); } +static int btrfs_show_devname(struct seq_file *m, struct dentry *root) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); + struct btrfs_fs_devices *cur_devices; + struct btrfs_device *dev, *first_dev = NULL; + struct list_head *head; + struct rcu_string *name; + + mutex_lock(&fs_info->fs_devices->device_list_mutex); + cur_devices = fs_info->fs_devices; + while (cur_devices) { + head = &cur_devices->devices; + list_for_each_entry(dev, head, dev_list) { + if (!first_dev || dev->devid < first_dev->devid) + first_dev = dev; + } + cur_devices = cur_devices->seed; + } + + if (first_dev) { + rcu_read_lock(); + name = rcu_dereference(first_dev->name); + seq_escape(m, name->str, " \t\n\\"); + rcu_read_unlock(); + } else { + WARN_ON(1); + } + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + return 0; +} + static const struct super_operations btrfs_super_ops = { .drop_inode = btrfs_drop_inode, .evict_inode = btrfs_evict_inode, .put_super = btrfs_put_super, .sync_fs = btrfs_sync_fs, .show_options = btrfs_show_options, + .show_devname = btrfs_show_devname, .write_inode = btrfs_write_inode, .dirty_inode = btrfs_fs_dirty_inode, .alloc_inode = btrfs_alloc_inode, -- cgit v1.2.3 From 8180ef8894fa402443205cff1e23417e8d3434df Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 8 Jun 2012 15:16:12 -0400 Subject: Btrfs: keep inode pinned when compressing writes A user reported lots of problems using compression on the new code and it turns out part of the problem was that igrab() was failing when we added a new ordered extent. This is because when writing out an inode under compression we immediately return without actually doing anything to the pages, and then in another thread at some point down the line actually do the ordered dance. The problem is between the point that we start writeback and we actually add the ordered extent we could be trying to reclaim the inode, which makes igrab() return NULL. So we need to do an igrab() when we create the async extent and then drop it when we are done with it. This makes sure we stay pinned in memory until the ordered extent can get a reference on it and we are good to go. With this patch we no longer panic in btrfs_finish_ordered_io(). Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b7f398c36cb7..06075043da5d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -986,8 +986,10 @@ static noinline void async_cow_start(struct btrfs_work *work) compress_file_range(async_cow->inode, async_cow->locked_page, async_cow->start, async_cow->end, async_cow, &num_added); - if (num_added == 0) + if (num_added == 0) { + iput(async_cow->inode); async_cow->inode = NULL; + } } /* @@ -1020,6 +1022,8 @@ static noinline void async_cow_free(struct btrfs_work *work) { struct async_cow *async_cow; async_cow = container_of(work, struct async_cow, work); + if (async_cow->inode) + iput(async_cow->inode); kfree(async_cow); } @@ -1038,7 +1042,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); BUG_ON(!async_cow); /* -ENOMEM */ - async_cow->inode = inode; + async_cow->inode = igrab(inode); async_cow->root = root; async_cow->locked_page = locked_page; async_cow->start = start; -- cgit v1.2.3 From 7ddf5a42d311d74fd9f7373cb56def0843c219f8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 8 Jun 2012 15:26:47 -0400 Subject: Btrfs: call filemap_fdatawrite twice for compression I removed this in an earlier commit and I was wrong. Because compression can return from filemap_fdatawrite() without having actually set any of it's pages as writeback() it can make filemap_fdatawait() do essentially nothing, and then we won't find any ordered extents because they may not have been created yet. So not only does this make fsync() completely useless, but it will also screw up if you truncate on a non-page aligned offset since we zero out the end and then wait on ordered extents and then call drop caches. We can drop the cache before the io completes and then we try to unpin the extent we just wrote we won't find it and everything goes sideways. So fix this by putting it back and put a giant comment there to keep me from trying to remove it in the future. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 1 + fs/btrfs/inode.c | 15 +++++++++------ fs/btrfs/ordered-data.c | 22 +++++++++++++++++++++- 3 files changed, 31 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index e616f8872e69..12394a90d60f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -37,6 +37,7 @@ #define BTRFS_INODE_IN_DEFRAG 3 #define BTRFS_INODE_DELALLOC_META_RESERVED 4 #define BTRFS_INODE_HAS_ORPHAN_ITEM 5 +#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 /* in memory btrfs inode */ struct btrfs_inode { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06075043da5d..7a090fb4eb98 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1398,20 +1398,23 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, int ret; struct btrfs_root *root = BTRFS_I(inode)->root; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); - else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) + } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); - else if (!btrfs_test_opt(root, COMPRESS) && - !(BTRFS_I(inode)->force_compress) && - !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) + } else if (!btrfs_test_opt(root, COMPRESS) && + !(BTRFS_I(inode)->force_compress) && + !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) { ret = cow_file_range(inode, locked_page, start, end, page_started, nr_written, 1); - else + } else { + set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags); ret = cow_file_range_async(inode, locked_page, start, end, page_started, nr_written); + } return ret; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 9e138cdc36c5..643335a4fe3c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -627,7 +627,27 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) /* start IO across the range first to instantiate any delalloc * extents */ - filemap_write_and_wait_range(inode->i_mapping, start, orig_end); + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + + /* + * So with compression we will find and lock a dirty page and clear the + * first one as dirty, setup an async extent, and immediately return + * with the entire range locked but with nobody actually marked with + * writeback. So we can't just filemap_write_and_wait_range() and + * expect it to work since it will just kick off a thread to do the + * actual work. So we need to call filemap_fdatawrite_range _again_ + * since it will wait on the page lock, which won't be unlocked until + * after the pages have been marked as writeback and so we're good to go + * from there. We have to do this otherwise we'll miss the ordered + * extents and that results in badness. Please Josef, do not think you + * know better and pull this out at some point in the future, it is + * right and you are wrong. + */ + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + + filemap_fdatawait_range(inode->i_mapping, start, orig_end); end = orig_end; found = 0; -- cgit v1.2.3 From 6c282eb40ed6f64a51ff447707714df551d85b8e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 11 Jun 2012 16:03:35 +0800 Subject: Btrfs: fix defrag regression If a file has 3 small extents: | ext1 | ext2 | ext3 | Running "btrfs fi defrag" will only defrag the last two extents, if those extent mappings hasn't been read into memory from disk. This bug was introduced by commit 17ce6ef8d731af5edac8c39e806db4c7e1f6956f ("Btrfs: add a check to decide if we should defrag the range") The cause is, that commit looked into previous and next extents using lookup_extent_mapping() only. While at it, remove the code that checks the previous extent, since it's sufficient to check the next extent. Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 97 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 48 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c5254dde1f0f..a98f7d252829 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -786,39 +786,57 @@ none: return -ENOENT; } -/* - * Validaty check of prev em and next em: - * 1) no prev/next em - * 2) prev/next em is an hole/inline extent - */ -static int check_adjacent_extents(struct inode *inode, struct extent_map *em) +static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_map *prev = NULL, *next = NULL; - int ret = 0; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em; + u64 len = PAGE_CACHE_SIZE; + /* + * hopefully we have this extent in the tree already, try without + * the full extent lock + */ read_lock(&em_tree->lock); - prev = lookup_extent_mapping(em_tree, em->start - 1, (u64)-1); - next = lookup_extent_mapping(em_tree, em->start + em->len, (u64)-1); + em = lookup_extent_mapping(em_tree, start, len); read_unlock(&em_tree->lock); - if ((!prev || prev->block_start >= EXTENT_MAP_LAST_BYTE) && - (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)) - ret = 1; - free_extent_map(prev); - free_extent_map(next); + if (!em) { + /* get the big lock and read metadata off disk */ + lock_extent(io_tree, start, start + len - 1); + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + unlock_extent(io_tree, start, start + len - 1); + if (IS_ERR(em)) + return NULL; + } + + return em; +} + +static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) +{ + struct extent_map *next; + bool ret = true; + + /* this is the last extent */ + if (em->start + em->len >= i_size_read(inode)) + return false; + + next = defrag_lookup_extent(inode, em->start + em->len); + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + ret = false; + + free_extent_map(next); return ret; } -static int should_defrag_range(struct inode *inode, u64 start, u64 len, - int thresh, u64 *last_len, u64 *skip, - u64 *defrag_end) +static int should_defrag_range(struct inode *inode, u64 start, int thresh, + u64 *last_len, u64 *skip, u64 *defrag_end) { - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em = NULL; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; int ret = 1; + bool next_mergeable = true; /* * make sure that once we start defragging an extent, we keep on @@ -829,23 +847,9 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, *skip = 0; - /* - * hopefully we have this extent in the tree already, try without - * the full extent lock - */ - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - read_unlock(&em_tree->lock); - - if (!em) { - /* get the big lock and read metadata off disk */ - lock_extent(io_tree, start, start + len - 1); - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); - unlock_extent(io_tree, start, start + len - 1); - - if (IS_ERR(em)) - return 0; - } + em = defrag_lookup_extent(inode, start); + if (!em) + return 0; /* this will cover holes, and inline extents */ if (em->block_start >= EXTENT_MAP_LAST_BYTE) { @@ -853,18 +857,15 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, goto out; } - /* If we have nothing to merge with us, just skip. */ - if (check_adjacent_extents(inode, em)) { - ret = 0; - goto out; - } + next_mergeable = defrag_check_next_extent(inode, em); /* - * we hit a real extent, if it is big don't bother defragging it again + * we hit a real extent, if it is big or the next extent is not a + * real extent, don't bother defragging it */ - if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh) + if ((*last_len == 0 || *last_len >= thresh) && + (em->len >= thresh || !next_mergeable)) ret = 0; - out: /* * last_len ends up being a counter of how many bytes we've defragged. @@ -1143,8 +1144,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, extent_thresh, - &last_len, &skip, &defrag_end)) { + extent_thresh, &last_len, &skip, + &defrag_end)) { unsigned long next; /* * the should_defrag function tells us how much to skip -- cgit v1.2.3 From 69e380d176e73653e09ce2cf3a5dc09682a69229 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 11 Jun 2012 17:10:51 +0800 Subject: Btrfs: fix incompat flags setting It's a bug, but it happens to work, as BTRFS_COMPRESS_LZO == 2, which has only one bit set. Signed-off-by: Li Zefan --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1c9664b16cd1..9a569aef72ea 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2119,7 +2119,7 @@ int open_ctree(struct super_block *sb, features = btrfs_super_incompat_flags(disk_super); features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; - if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) + if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; /* -- cgit v1.2.3 From bc1782374b128103ae9689e0753e0610f35b6bfd Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 14 Jun 2012 02:23:18 -0600 Subject: Btrfs: fix missing inherited flag in rename When we move a file into a directory with compression flag, we need to inherite BTRFS_INODE_COMPRESS and clear BTRFS_INODE_NOCOMPRESS as well. But if we move a file into a directory without compression flag, we need to clear both of them. It is the way how our setflags deals with compression flag, so keep the same behaviour here. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7a090fb4eb98..3f2c8cbe5ba6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7122,10 +7122,13 @@ static void fixup_inode_flags(struct inode *dir, struct inode *inode) else b_inode->flags &= ~BTRFS_INODE_NODATACOW; - if (b_dir->flags & BTRFS_INODE_COMPRESS) + if (b_dir->flags & BTRFS_INODE_COMPRESS) { b_inode->flags |= BTRFS_INODE_COMPRESS; - else - b_inode->flags &= ~BTRFS_INODE_COMPRESS; + b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + } else { + b_inode->flags &= ~(BTRFS_INODE_COMPRESS | + BTRFS_INODE_NOCOMPRESS); + } } static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, -- cgit v1.2.3 From 4e42ae1bdcda77fc958a17d7ff4ba5a9c9c207da Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 14 Jun 2012 02:23:19 -0600 Subject: Btrfs: do not resize a seeding device Seeding devices are not supposed to change any more. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a98f7d252829..58adbd0356d6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1306,6 +1306,13 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, ret = -EINVAL; goto out_free; } + if (device->fs_devices && device->fs_devices->seeding) { + printk(KERN_INFO "btrfs: resizer unable to apply on " + "seeding device %llu\n", devid); + ret = -EINVAL; + goto out_free; + } + if (!strcmp(sizestr, "max")) new_size = device->bdev->bd_inode->i_size; else { -- cgit v1.2.3 From 6e841e32b159e298debbb2cb0e9158e894fcaf05 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 14 Jun 2012 02:23:20 -0600 Subject: Btrfs: avoid memory leak of extent state in error handling routine We've forgotten to clear extent states in pinned tree, which will results in space counter mismatch and memory leak: WARNING: at fs/btrfs/extent-tree.c:7537 btrfs_free_block_groups+0x1f3/0x2e0 [btrfs]() ... space_info 2 has 8380416 free, is not full space_info total=12582912, used=4096, pinned=4096, reserved=0, may_use=0, readonly=4194304 btrfs state leak: start 29364224 end 29376511 state 1 in tree ffff880075f20090 refs 1 ... Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9a569aef72ea..63a36232788b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3601,6 +3601,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages, EXTENT_DIRTY); + btrfs_destroy_pinned_extent(root, + root->fs_info->pinned_extents); /* memset(cur_trans, 0, sizeof(*cur_trans)); -- cgit v1.2.3 From ed0eaa14981e87a1e185b61e4ef621c440e3930c Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 14 Jun 2012 02:23:21 -0600 Subject: Btrfs: make sure that we've made everything in pinned tree clean Since we have two trees for recording pinned extents, we need to go through both of them to make sure that we've done everything clean. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 63a36232788b..ffdd76bf05d4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3557,8 +3557,10 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, u64 start; u64 end; int ret; + bool loop = true; unpin = pinned_extents; +again: while (1) { ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); @@ -3576,6 +3578,15 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, cond_resched(); } + if (loop) { + if (unpin == &root->fs_info->freed_extents[0]) + unpin = &root->fs_info->freed_extents[1]; + else + unpin = &root->fs_info->freed_extents[0]; + loop = false; + goto again; + } + return 0; } -- cgit v1.2.3 From 67cde3448d951b55088a6ea3bb1aee0160068fb9 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 14 Jun 2012 02:23:22 -0600 Subject: Btrfs: destroy the items of the delayed inodes in error handling routine the items of the delayed inodes were forgotten to be freed, this patch fixes it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 18 ++++++++++++++++++ fs/btrfs/delayed-inode.h | 3 +++ fs/btrfs/disk-io.c | 6 ++++++ 3 files changed, 27 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c18d0442ae6d..2399f4086915 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1879,3 +1879,21 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) } } } + +void btrfs_destroy_delayed_inodes(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + struct btrfs_delayed_node *curr_node, *prev_node; + + delayed_root = btrfs_get_delayed_root(root); + + curr_node = btrfs_first_delayed_node(delayed_root); + while (curr_node) { + __btrfs_kill_delayed_node(curr_node); + + prev_node = curr_node; + curr_node = btrfs_next_delayed_node(curr_node); + btrfs_release_delayed_node(prev_node); + } +} + diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 7083d08b2a21..f5aa4023d3e1 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -124,6 +124,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev); /* Used for drop dead root */ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); +/* Used for clean the transaction */ +void btrfs_destroy_delayed_inodes(struct btrfs_root *root); + /* Used for readdir() */ void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, struct list_head *del_list); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ffdd76bf05d4..e22c5bbf0223 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3608,6 +3608,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->commit_done = 1; wake_up(&cur_trans->commit_wait); + btrfs_destroy_delayed_inodes(root); + btrfs_assert_delayed_root_empty(root); + btrfs_destroy_pending_snapshots(cur_trans); btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages, @@ -3662,6 +3665,9 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) if (waitqueue_active(&t->commit_wait)) wake_up(&t->commit_wait); + btrfs_destroy_delayed_inodes(root); + btrfs_assert_delayed_root_empty(root); + btrfs_destroy_pending_snapshots(t); btrfs_destroy_delalloc_inodes(root); -- cgit v1.2.3 From 4325edd0786fdd3909f9550e52a963b2fe54f78b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 15 Jun 2012 20:02:02 -0400 Subject: Btrfs: init old_generation in get_old_root gcc was giving an uninit variable warning here. Strictly speaking we don't need to init it, but this will make things much less error prone. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 04b06bcf2ca9..15cbc2bf4ff0 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1175,7 +1175,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) struct tree_mod_elem *tm; struct extent_buffer *eb; struct tree_mod_root *old_root = NULL; - u64 old_generation; + u64 old_generation = 0; u64 logical; eb = btrfs_read_lock_root_node(root); -- cgit v1.2.3 From a8c4a33b98b097e69cd1a10672c43d17ad0ffb25 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 15 Jun 2012 20:07:17 -0400 Subject: Btrfs: cast devid to unsigned long long for printk %llu Avoid warning in 32 bit machines Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 58adbd0356d6..0e92e5763005 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1308,7 +1308,8 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, } if (device->fs_devices && device->fs_devices->seeding) { printk(KERN_INFO "btrfs: resizer unable to apply on " - "seeding device %llu\n", devid); + "seeding device %llu\n", + (unsigned long long)devid); ret = -EINVAL; goto out_free; } -- cgit v1.2.3 From 1c8f52a5e9539600543347bcdefafa1854e07986 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Tue, 19 Jun 2012 07:42:25 -0600 Subject: Btrfs: introduce btrfs_next_old_item We introduce btrfs_next_old_item that uses btrfs_next_old_leaf instead of btrfs_next_leaf. btrfs_next_item is also changed to simply call btrfs_next_old_item with time_seq being 0. Signed-off-by: Alexander Block Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index db15e9e7b91c..84ac723f58f8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2755,13 +2755,18 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); -static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) +static inline int btrfs_next_old_item(struct btrfs_root *root, + struct btrfs_path *p, u64 time_seq) { ++p->slots[0]; if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) - return btrfs_next_leaf(root, p); + return btrfs_next_old_leaf(root, p, time_seq); return 0; } +static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) +{ + return btrfs_next_old_item(root, p, 0); +} int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); int __must_check btrfs_drop_snapshot(struct btrfs_root *root, -- cgit v1.2.3 From 69bca40d41c613927b150c5392505f1894fe3010 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Tue, 19 Jun 2012 07:42:26 -0600 Subject: Btrfs: don't assume to be on the correct extent in add_all_parents add_all_parents did assume that path is already at a correct extent data item, which may not be true in case of data extents that were partly rewritten and splitted. We need to check if we're on a matching extent for every item and only for the ones after the first. The loop is changed to do this now. This patch also fixes a bug introduced with commit 3b127fd8 "Btrfs: remove obsolete btrfs_next_leaf call from __resolve_indirect_ref". The removal of next_leaf did sometimes result in slot==nritems when the above described case happens, and thus resulting in invalid values (e.g. wanted_obejctid) in add_all_parents (leading to missed backrefs or even crashes). Signed-off-by: Alexander Block Signed-off-by: Jan Schmidt Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 94 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 42 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8f7d1237b7a0..7301cdb4b2cb 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -179,61 +179,74 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, int level, - struct btrfs_key *key, u64 time_seq, + struct btrfs_key *key_for_search, u64 time_seq, u64 wanted_disk_byte, const u64 *extent_item_pos) { - int ret; - int slot = path->slots[level]; - struct extent_buffer *eb = path->nodes[level]; + int ret = 0; + int slot; + struct extent_buffer *eb; + struct btrfs_key key; struct btrfs_file_extent_item *fi; struct extent_inode_elem *eie = NULL; u64 disk_byte; - u64 wanted_objectid = key->objectid; -add_parent: - if (level == 0 && extent_item_pos) { - fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - ret = check_extent_in_eb(key, eb, fi, *extent_item_pos, &eie); + if (level != 0) { + eb = path->nodes[level]; + ret = ulist_add(parents, eb->start, 0, GFP_NOFS); if (ret < 0) return ret; - } - ret = ulist_add(parents, eb->start, (unsigned long)eie, GFP_NOFS); - if (ret < 0) - return ret; - - if (level != 0) return 0; + } /* - * if the current leaf is full with EXTENT_DATA items, we must - * check the next one if that holds a reference as well. - * ref->count cannot be used to skip this check. - * repeat this until we don't find any additional EXTENT_DATA items. + * We normally enter this function with the path already pointing to + * the first item to check. But sometimes, we may enter it with + * slot==nritems. In that case, go to the next leaf before we continue. */ - while (1) { - eie = NULL; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) ret = btrfs_next_old_leaf(root, path, time_seq); - if (ret < 0) - return ret; - if (ret) - return 0; + while (!ret) { eb = path->nodes[0]; - for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { - btrfs_item_key_to_cpu(eb, key, slot); - if (key->objectid != wanted_objectid || - key->type != BTRFS_EXTENT_DATA_KEY) - return 0; - fi = btrfs_item_ptr(eb, slot, - struct btrfs_file_extent_item); - disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte == wanted_disk_byte) - goto add_parent; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(eb, &key, slot); + + if (key.objectid != key_for_search->objectid || + key.type != BTRFS_EXTENT_DATA_KEY) + break; + + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + + if (disk_byte == wanted_disk_byte) { + eie = NULL; + if (extent_item_pos) { + ret = check_extent_in_eb(&key, eb, fi, + *extent_item_pos, + &eie); + if (ret < 0) + break; + } + if (!ret) { + ret = ulist_add(parents, eb->start, + (unsigned long)eie, GFP_NOFS); + if (ret < 0) + break; + if (!extent_item_pos) { + ret = btrfs_next_old_leaf(root, path, + time_seq); + continue; + } + } } + ret = btrfs_next_old_item(root, path, time_seq); } - return 0; + if (ret > 0) + ret = 0; + return ret; } /* @@ -250,7 +263,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path; struct btrfs_root *root; struct btrfs_key root_key; - struct btrfs_key key = {0}; struct extent_buffer *eb; int ret = 0; int root_level; @@ -295,11 +307,9 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - if (level == 0) - btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - - ret = add_all_parents(root, path, parents, level, &key, time_seq, - ref->wanted_disk_byte, extent_item_pos); + ret = add_all_parents(root, path, parents, level, &ref->key_for_search, + time_seq, ref->wanted_disk_byte, + extent_item_pos); out: btrfs_free_path(path); return ret; -- cgit v1.2.3 From e18fca734278784bd6591de63ca148cc27344ca9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 18 Jun 2012 07:23:18 -0600 Subject: Btrfs: add a missing spin_lock When fixing up the locking in the delayed ref destruction work I accidently broke the locking myself ;(. Add back a spin_lock that should be there and we are now all set. Thanks, Btrfs: add a missing spin_lock When fixing up the locking in the delayed ref destruction work I accidently broke the locking myself ;(. Add back a spin_lock that should be there and we are now all set. Thanks, Reported-by: Dan Carpenter Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e22c5bbf0223..7d7bc8eace86 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3426,6 +3426,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, mutex_unlock(&head->mutex); btrfs_put_delayed_ref(ref); + spin_lock(&delayed_refs->lock); continue; } -- cgit v1.2.3 From cb77fcd88569cd2b7b25ecd4086ea932a53be9b3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Jun 2012 12:19:48 -0600 Subject: Btrfs: delay iput with async extents There is some concern that these iput()'s could be the final iputs and could induce lockups on people waiting on writeback. This would happen in the rare case that we don't create ordered extents because of an error, but it is theoretically possible and we already have a mechanism to deal with this so just make them delayed iputs to negate any worry. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3f2c8cbe5ba6..4a4f2d59a64b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -987,7 +987,7 @@ static noinline void async_cow_start(struct btrfs_work *work) async_cow->start, async_cow->end, async_cow, &num_added); if (num_added == 0) { - iput(async_cow->inode); + btrfs_add_delayed_iput(async_cow->inode); async_cow->inode = NULL; } } @@ -1023,7 +1023,7 @@ static noinline void async_cow_free(struct btrfs_work *work) struct async_cow *async_cow; async_cow = container_of(work, struct async_cow, work); if (async_cow->inode) - iput(async_cow->inode); + btrfs_add_delayed_iput(async_cow->inode); kfree(async_cow); } -- cgit v1.2.3