diff options
Diffstat (limited to 'fs')
71 files changed, 882 insertions, 411 deletions
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ee834ef7beb4..43e1660f450f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -447,7 +447,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, if (blkcg_css) { bio->bi_opf |= REQ_CGROUP_PUNT; - bio_associate_blkg_from_css(bio, blkcg_css); + kthread_associate_blkcg(blkcg_css); } refcount_set(&cb->pending_bios, 1); @@ -491,6 +491,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; + if (blkcg_css) + bio->bi_opf |= REQ_CGROUP_PUNT; bio_add_page(bio, page, PAGE_SIZE, 0); } if (bytes_left < PAGE_SIZE) { @@ -517,6 +519,9 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_endio(bio); } + if (blkcg_css) + kthread_associate_blkcg(NULL); + return 0; } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5b6e86aaf2e1..24658b5a5787 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -379,7 +379,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, for (node = rb_first(tm_root); node; node = next) { next = rb_next(node); tm = rb_entry(node, struct tree_mod_elem, node); - if (tm->seq > min_seq) + if (tm->seq >= min_seq) continue; rb_erase(node, tm_root); kfree(tm); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b2e8fd8a8e59..54efb21c2727 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2787,7 +2787,7 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( /* file-item.c */ struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); + struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 153f71a5bba9..274318e9114e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1869,8 +1869,8 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, btrfs_pin_extent(fs_info, head->bytenr, head->num_bytes, 1); if (head->is_data) { - ret = btrfs_del_csums(trans, fs_info, head->bytenr, - head->num_bytes); + ret = btrfs_del_csums(trans, fs_info->csum_root, + head->bytenr, head->num_bytes); } } @@ -3175,7 +3175,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (is_data) { - ret = btrfs_del_csums(trans, info, bytenr, num_bytes); + ret = btrfs_del_csums(trans, info->csum_root, bytenr, + num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3799,6 +3800,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 flags, int delalloc) { int ret = 0; + int cache_block_group_error = 0; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group *block_group = NULL; struct find_free_extent_ctl ffe_ctl = {0}; @@ -3958,7 +3960,20 @@ have_block_group: if (unlikely(!ffe_ctl.cached)) { ffe_ctl.have_caching_bg = true; ret = btrfs_cache_block_group(block_group, 0); - BUG_ON(ret < 0); + + /* + * If we get ENOMEM here or something else we want to + * try other block groups, because it may not be fatal. + * However if we can't find anything else we need to + * save our return here so that we return the actual + * error that caused problems, not ENOSPC. + */ + if (ret < 0) { + if (!cache_block_group_error) + cache_block_group_error = ret; + ret = 0; + goto loop; + } ret = 0; } @@ -4045,7 +4060,7 @@ loop: if (ret > 0) goto search; - if (ret == -ENOSPC) { + if (ret == -ENOSPC && !cache_block_group_error) { /* * Use ffe_ctl->total_free_space as fallback if we can't find * any contiguous hole. @@ -4056,6 +4071,8 @@ loop: space_info->max_extent_size = ffe_ctl.max_extent_size; spin_unlock(&space_info->lock); ins->offset = ffe_ctl.max_extent_size; + } else if (ret == -ENOSPC) { + ret = cache_block_group_error; } return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eb8bd0258360..2f4802f405a2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5074,12 +5074,14 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, return eb; eb = alloc_dummy_extent_buffer(fs_info, start); if (!eb) - return NULL; + return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; again: ret = radix_tree_preload(GFP_NOFS); - if (ret) + if (ret) { + exists = ERR_PTR(ret); goto free_eb; + } spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, start >> PAGE_SHIFT, eb); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 3270a40b0777..b1bfdc5c1387 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -590,9 +590,9 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, * range of bytes. */ int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, u64 len) + struct btrfs_root *root, u64 bytenr, u64 len) { - struct btrfs_root *root = fs_info->csum_root; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *path; struct btrfs_key key; u64 end_byte = bytenr + len; @@ -602,6 +602,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int blocksize_bits = fs_info->sb->s_blocksize_bits; + ASSERT(root == fs_info->csum_root || + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0cb43b682789..8d47c76b7bd1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2599,8 +2599,8 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, } } - if (clone_info) { - u64 clone_len = drop_end - cur_offset; + if (clone_info && drop_end > clone_info->file_offset) { + u64 clone_len = drop_end - clone_info->file_offset; ret = btrfs_insert_clone_extent(trans, inode, path, clone_info, clone_len); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 56032c518b26..5509c41a4f43 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1479,10 +1479,10 @@ next_slot: disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); /* - * If extent we got ends before our range starts, skip - * to next extent + * If the extent we got ends before our current offset, + * skip to the next extent. */ - if (extent_end <= start) { + if (extent_end <= cur_offset) { path->slots[0]++; goto next_slot; } @@ -5728,7 +5728,6 @@ static void inode_tree_add(struct inode *inode) static void inode_tree_del(struct inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; int empty = 0; @@ -5741,7 +5740,6 @@ static void inode_tree_del(struct inode *inode) spin_unlock(&root->inode_lock); if (empty && btrfs_root_refs(&root->root_item) == 0) { - synchronize_srcu(&fs_info->subvol_srcu); spin_lock(&root->inode_lock); empty = RB_EMPTY_ROOT(&root->inode_tree); spin_unlock(&root->inode_lock); @@ -9556,9 +9554,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, btrfs_init_log_ctx(&ctx_dest, new_inode); /* close the race window with snapshot create/destroy ioctl */ - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) - down_read(&fs_info->subvol_sem); - if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + if (old_ino == BTRFS_FIRST_FREE_OBJECTID || + new_ino == BTRFS_FIRST_FREE_OBJECTID) down_read(&fs_info->subvol_sem); /* @@ -9792,9 +9789,8 @@ out_fail: ret = ret ? ret : ret2; } out_notrans: - if (new_ino == BTRFS_FIRST_FREE_OBJECTID) - up_read(&fs_info->subvol_sem); - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + if (new_ino == BTRFS_FIRST_FREE_OBJECTID || + old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); ASSERT(list_empty(&ctx_root.list)); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a1ee0b775e65..18e328ce4b54 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -704,11 +704,17 @@ static noinline int create_subvol(struct inode *dir, btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); ret = btrfs_update_inode(trans, root, dir); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, btrfs_ino(BTRFS_I(dir)), index, name, namelen); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); @@ -3720,24 +3726,18 @@ process_slot: ret = 0; if (last_dest_end < destoff + len) { - struct btrfs_clone_extent_info clone_info = { 0 }; /* - * We have an implicit hole (NO_HOLES feature is enabled) that - * fully or partially overlaps our cloning range at its end. + * We have an implicit hole that fully or partially overlaps our + * cloning range at its end. This means that we either have the + * NO_HOLES feature enabled or the implicit hole happened due to + * mixing buffered and direct IO writes against this file. */ btrfs_release_path(path); path->leave_spinning = 0; - /* - * We are dealing with a hole and our clone_info already has a - * disk_offset of 0, we only need to fill the data length and - * file offset. - */ - clone_info.data_len = destoff + len - last_dest_end; - clone_info.file_offset = last_dest_end; ret = btrfs_punch_hole_range(inode, path, last_dest_end, destoff + len - 1, - &clone_info, &trans); + NULL, &trans); if (ret) goto out; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 93aeb2e539a4..d4282e12f2a6 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3232,12 +3232,12 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { btrfs_warn(fs_info, - "qgroup rescan init failed, qgroup is not enabled"); + "qgroup rescan init failed, qgroup rescan is not queued"); ret = -EINVAL; } else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { btrfs_warn(fs_info, - "qgroup rescan init failed, qgroup rescan is not queued"); + "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d897a8e5e430..c58245797f30 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4552,6 +4552,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) fs_root = read_fs_root(fs_info, reloc_root->root_key.offset); if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); + list_add_tail(&reloc_root->root_list, &reloc_roots); goto out_free; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ae2db5eb1549..091e5bc8c7ea 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7084,12 +7084,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) spin_unlock(&send_root->root_item_lock); /* - * This is done when we lookup the root, it should already be complete - * by the time we get here. - */ - WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE); - - /* * Userspace tools do the checks and warn the user if it's * not RO. */ diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 1a846bf6e197..914eea5ba6a7 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -452,9 +452,9 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, root->fs_info->tree_root = root; root->node = alloc_test_extent_buffer(root->fs_info, nodesize); - if (!root->node) { + if (IS_ERR(root->node)) { test_std_err(TEST_ALLOC_EXTENT_BUFFER); - ret = -ENOMEM; + ret = PTR_ERR(root->node); goto out; } btrfs_set_header_level(root->node, 0); diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 09aaca1efd62..ac035a6fa003 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -484,9 +484,9 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) * *cough*backref walking code*cough* */ root->node = alloc_test_extent_buffer(root->fs_info, nodesize); - if (!root->node) { + if (IS_ERR(root->node)) { test_err("couldn't allocate dummy buffer"); - ret = -ENOMEM; + ret = PTR_ERR(root->node); goto out; } btrfs_set_header_level(root->node, 0); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 493d4d9e0f79..97f3520b8d98 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -227,7 +227,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, */ if (item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START) { file_extent_err(leaf, slot, - "invalid item size, have %u expect [%lu, %u)", + "invalid item size, have %u expect [%zu, %u)", item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START, SZ_4K); return -EUCLEAN; @@ -332,7 +332,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, } static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, - int slot) + int slot, struct btrfs_key *prev_key) { struct btrfs_fs_info *fs_info = leaf->fs_info; u32 sectorsize = fs_info->sectorsize; @@ -356,6 +356,20 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, btrfs_item_size_nr(leaf, slot), csumsize); return -EUCLEAN; } + if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) { + u64 prev_csum_end; + u32 prev_item_size; + + prev_item_size = btrfs_item_size_nr(leaf, slot - 1); + prev_csum_end = (prev_item_size / csumsize) * sectorsize; + prev_csum_end += prev_key->offset; + if (prev_csum_end > key->offset) { + generic_err(leaf, slot - 1, +"csum end range (%llu) goes beyond the start range (%llu) of the next csum item", + prev_csum_end, key->offset); + return -EUCLEAN; + } + } return 0; } @@ -1355,7 +1369,7 @@ static int check_leaf_item(struct extent_buffer *leaf, ret = check_extent_data_item(leaf, key, slot, prev_key); break; case BTRFS_EXTENT_CSUM_KEY: - ret = check_csum_item(leaf, key, slot); + ret = check_csum_item(leaf, key, slot, prev_key); break; case BTRFS_DIR_ITEM_KEY: case BTRFS_DIR_INDEX_KEY: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6f757361db53..d3f115909ff0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -808,7 +808,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_del_csums(trans, fs_info, + ret = btrfs_del_csums(trans, + fs_info->csum_root, sums->bytenr, sums->len); if (!ret) @@ -3909,6 +3910,28 @@ static int log_inode_item(struct btrfs_trans_handle *trans, return 0; } +static int log_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *log_root, + struct btrfs_ordered_sum *sums) +{ + int ret; + + /* + * Due to extent cloning, we might have logged a csum item that covers a + * subrange of a cloned extent, and later we can end up logging a csum + * item for a larger subrange of the same extent or the entire range. + * This would leave csum items in the log tree that cover the same range + * and break the searches for checksums in the log tree, resulting in + * some checksums missing in the fs/subvolume tree. So just delete (or + * trim and adjust) any existing csum items in the log for this range. + */ + ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); + if (ret) + return ret; + + return btrfs_csum_file_blocks(trans, log_root, sums); +} + static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *dst_path, @@ -4054,7 +4077,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_csum_file_blocks(trans, log, sums); + ret = log_csums(trans, log, sums); list_del(&sums->list); kfree(sums); } @@ -4274,7 +4297,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_csum_file_blocks(trans, log_root, sums); + ret = log_csums(trans, log_root, sums); list_del(&sums->list); kfree(sums); } @@ -6294,9 +6317,28 @@ again: wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); + + /* + * We didn't find the subvol, likely because it was + * deleted. This is ok, simply skip this log and go to + * the next one. + * + * We need to exclude the root because we can't have + * other log replays overwriting this log as we'll read + * it back in a few more times. This will keep our + * block from being modified, and we'll just bail for + * each subsequent pass. + */ + if (ret == -ENOENT) + ret = btrfs_pin_extent_for_log_replay(fs_info, + log->node->start, + log->node->len); free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); + + if (!ret) + goto next; btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root for tree log recovery."); goto error; @@ -6328,7 +6370,6 @@ again: &root->highest_objectid); } - key.offset = found_key.offset - 1; wc.replay_dest->log_root = NULL; free_extent_buffer(log->node); free_extent_buffer(log->commit_root); @@ -6336,9 +6377,10 @@ again: if (ret) goto error; - +next: if (found_key.offset == 0) break; + key.offset = found_key.offset - 1; } btrfs_release_path(path); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 91caab63bdf5..76b84f2397b1 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -324,6 +324,8 @@ again_search_slot: } if (ret < 0 && ret != -ENOENT) goto out; + key.offset++; + goto again_search_slot; } item_size -= sizeof(subid_le); offset += sizeof(subid_le); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d8e5560db285..a6d3f08bfff3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -61,7 +61,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID1C3] = { .sub_stripes = 1, .dev_stripes = 1, - .devs_max = 0, + .devs_max = 3, .devs_min = 3, .tolerated_failures = 2, .devs_increment = 3, @@ -73,7 +73,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID1C4] = { .sub_stripes = 1, .dev_stripes = 1, - .devs_max = 0, + .devs_max = 4, .devs_min = 4, .tolerated_failures = 3, .devs_increment = 4, diff --git a/fs/buffer.c b/fs/buffer.c index d8c7242426bb..e94a6619464c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3034,8 +3034,6 @@ static void end_bio_bh_io_sync(struct bio *bio) void guard_bio_eod(int op, struct bio *bio) { sector_t maxsector; - struct bio_vec *bvec = bio_last_bvec_all(bio); - unsigned truncated_bytes; struct hd_struct *part; rcu_read_lock(); @@ -3061,28 +3059,7 @@ void guard_bio_eod(int op, struct bio *bio) if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) return; - /* Uhhuh. We've got a bio that straddles the device size! */ - truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9); - - /* - * The bio contains more than one segment which spans EOD, just return - * and let IO layer turn it into an EIO - */ - if (truncated_bytes > bvec->bv_len) - return; - - /* Truncate the bio.. */ - bio->bi_iter.bi_size -= truncated_bytes; - bvec->bv_len -= truncated_bytes; - - /* ..and clear the end of the buffer for reads */ - if (op == REQ_OP_READ) { - struct bio_vec bv; - - mp_bvec_last_segment(bvec, &bv); - zero_user(bv.bv_page, bv.bv_offset + bv.bv_len, - truncated_bytes); - } + bio_truncate(bio, maxsector << 9); } static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index fd0262ce5ad5..40705e862451 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1061,7 +1061,7 @@ cap_unix(struct cifs_ses *ses) struct cached_fid { bool is_valid:1; /* Do we have a useable root fid */ bool file_all_info_is_valid:1; - + bool has_lease:1; struct kref refcount; struct cifs_fid *fid; struct mutex fid_mutex; @@ -1693,6 +1693,7 @@ struct cifs_fattr { struct timespec64 cf_atime; struct timespec64 cf_mtime; struct timespec64 cf_ctime; + u32 cf_cifstag; }; static inline void free_dfs_info_param(struct dfs_info3_param *param) diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4f554f019a98..cc86a67225d1 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -42,6 +42,7 @@ #include "cifsproto.h" #include "cifs_unicode.h" #include "cifs_debug.h" +#include "smb2proto.h" #include "fscache.h" #include "smbdirect.h" #ifdef CONFIG_CIFS_DFS_UPCALL @@ -112,6 +113,8 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) mutex_lock(&tcon->crfid.fid_mutex); tcon->crfid.is_valid = false; + /* cached handle is not valid, so SMB2_CLOSE won't be sent below */ + close_shroot_lease_locked(&tcon->crfid); memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid)); mutex_unlock(&tcon->crfid.fid_mutex); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 3925a7bfc74d..d17587c2c4ab 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -139,6 +139,28 @@ retry: dput(dentry); } +static bool reparse_file_needs_reval(const struct cifs_fattr *fattr) +{ + if (!(fattr->cf_cifsattrs & ATTR_REPARSE)) + return false; + /* + * The DFS tags should be only intepreted by server side as per + * MS-FSCC 2.1.2.1, but let's include them anyway. + * + * Besides, if cf_cifstag is unset (0), then we still need it to be + * revalidated to know exactly what reparse point it is. + */ + switch (fattr->cf_cifstag) { + case IO_REPARSE_TAG_DFS: + case IO_REPARSE_TAG_DFSR: + case IO_REPARSE_TAG_SYMLINK: + case IO_REPARSE_TAG_NFS: + case 0: + return true; + } + return false; +} + static void cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) { @@ -158,7 +180,7 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) * is a symbolic link, DFS referral or a reparse point with a direct * access like junctions, deduplicated files, NFS symlinks. */ - if (fattr->cf_cifsattrs & ATTR_REPARSE) + if (reparse_file_needs_reval(fattr)) fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; /* non-unix readdir doesn't provide nlink */ @@ -194,19 +216,37 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) } } +static void __dir_info_to_fattr(struct cifs_fattr *fattr, const void *info) +{ + const FILE_DIRECTORY_INFO *fi = info; + + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_cifsattrs = le32_to_cpu(fi->ExtFileAttributes); + fattr->cf_eof = le64_to_cpu(fi->EndOfFile); + fattr->cf_bytes = le64_to_cpu(fi->AllocationSize); + fattr->cf_createtime = le64_to_cpu(fi->CreationTime); + fattr->cf_atime = cifs_NTtimeToUnix(fi->LastAccessTime); + fattr->cf_ctime = cifs_NTtimeToUnix(fi->ChangeTime); + fattr->cf_mtime = cifs_NTtimeToUnix(fi->LastWriteTime); +} + void cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, struct cifs_sb_info *cifs_sb) { - memset(fattr, 0, sizeof(*fattr)); - fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes); - fattr->cf_eof = le64_to_cpu(info->EndOfFile); - fattr->cf_bytes = le64_to_cpu(info->AllocationSize); - fattr->cf_createtime = le64_to_cpu(info->CreationTime); - fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); - fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); - fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); + __dir_info_to_fattr(fattr, info); + cifs_fill_common_info(fattr, cifs_sb); +} +static void cifs_fulldir_info_to_fattr(struct cifs_fattr *fattr, + SEARCH_ID_FULL_DIR_INFO *info, + struct cifs_sb_info *cifs_sb) +{ + __dir_info_to_fattr(fattr, info); + + /* See MS-FSCC 2.4.18 FileIdFullDirectoryInformation */ + if (fattr->cf_cifsattrs & ATTR_REPARSE) + fattr->cf_cifstag = le32_to_cpu(info->EaSize); cifs_fill_common_info(fattr, cifs_sb); } @@ -755,6 +795,11 @@ static int cifs_filldir(char *find_entry, struct file *file, (FIND_FILE_STANDARD_INFO *)find_entry, cifs_sb); break; + case SMB_FIND_FILE_ID_FULL_DIR_INFO: + cifs_fulldir_info_to_fattr(&fattr, + (SEARCH_ID_FULL_DIR_INFO *)find_entry, + cifs_sb); + break; default: cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *)find_entry, diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 8b0b512c5792..afe1f03aabe3 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -67,7 +67,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, goto out; - if (oparms->tcon->use_resilient) { + if (oparms->tcon->use_resilient) { /* default timeout is 0, servers pick default (120 seconds) */ nr_ioctl_req.Timeout = cpu_to_le32(oparms->tcon->handle_timeout); diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 18c7a33adceb..5ef5e97a6d13 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -95,6 +95,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, goto finished; } + memset(&oparms, 0, sizeof(struct cifs_open_parms)); oparms.tcon = tcon; oparms.desired_access = desired_access; oparms.disposition = create_disposition; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index a5c96bc522cb..6250370c1170 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -616,6 +616,7 @@ smb2_close_cached_fid(struct kref *ref) cfid->fid->volatile_fid); cfid->is_valid = false; cfid->file_all_info_is_valid = false; + cfid->has_lease = false; } } @@ -626,13 +627,28 @@ void close_shroot(struct cached_fid *cfid) mutex_unlock(&cfid->fid_mutex); } +void close_shroot_lease_locked(struct cached_fid *cfid) +{ + if (cfid->has_lease) { + cfid->has_lease = false; + kref_put(&cfid->refcount, smb2_close_cached_fid); + } +} + +void close_shroot_lease(struct cached_fid *cfid) +{ + mutex_lock(&cfid->fid_mutex); + close_shroot_lease_locked(cfid); + mutex_unlock(&cfid->fid_mutex); +} + void smb2_cached_lease_break(struct work_struct *work) { struct cached_fid *cfid = container_of(work, struct cached_fid, lease_break); - close_shroot(cfid); + close_shroot_lease(cfid); } /* @@ -773,6 +789,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) /* BB TBD check to see if oplock level check can be removed below */ if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) { kref_get(&tcon->crfid.refcount); + tcon->crfid.has_lease = true; smb2_parse_contexts(server, o_rsp, &oparms.fid->epoch, oparms.fid->lease_key, &oplock, NULL); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 0ab6b1200288..9434f6dd8df3 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1847,7 +1847,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) return 0; - close_shroot(&tcon->crfid); + close_shroot_lease(&tcon->crfid); rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req, &total_len); diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index a18272c987fe..27d29f2eb6c8 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -70,6 +70,8 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server, extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid); extern void close_shroot(struct cached_fid *cfid); +extern void close_shroot_lease(struct cached_fid *cfid); +extern void close_shroot_lease_locked(struct cached_fid *cfid); extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src); extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/direct-io.c b/fs/direct-io.c index 0ec4f270139f..00b4d15bb811 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -39,6 +39,8 @@ #include <linux/atomic.h> #include <linux/prefetch.h> +#include "internal.h" + /* * How many user pages to map in one call to get_user_pages(). This determines * the size of a structure in the slab cache diff --git a/fs/drop_caches.c b/fs/drop_caches.c index d31b6c72b476..dc1a1d5d825b 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -35,11 +35,11 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); - cond_resched(); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; + cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index d4d4fdfac1a6..1ee04e76bbe0 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -133,10 +133,13 @@ static void debug_print_tree(struct ext4_sb_info *sbi) { struct rb_node *node; struct ext4_system_zone *entry; + struct ext4_system_blocks *system_blks; int first = 1; printk(KERN_INFO "System zones: "); - node = rb_first(&sbi->system_blks->root); + rcu_read_lock(); + system_blks = rcu_dereference(sbi->system_blks); + node = rb_first(&system_blks->root); while (node) { entry = rb_entry(node, struct ext4_system_zone, node); printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ", @@ -144,6 +147,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi) first = 0; node = rb_next(node); } + rcu_read_unlock(); printk(KERN_CONT "\n"); } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 9fdd2b269d61..9f00fc0bf21d 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -72,6 +72,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); + const int next_offset = ((char *) de - buf) + rlen; if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; @@ -79,8 +80,11 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (unlikely(((char *) de - buf) + rlen > size)) + else if (unlikely(next_offset > size)) error_msg = "directory entry overrun"; + else if (unlikely(next_offset > size - EXT4_DIR_REC_LEN(1) && + next_offset != size)) + error_msg = "directory entry too close to block end"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index dc333e8e51e8..8ca4a23129aa 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -921,8 +921,8 @@ repeat_in_this_group: if (!handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, - handle_type, nblocks, - 0, 0); + handle_type, nblocks, 0, + ext4_trans_default_revoke_credits(sb)); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_std_error(sb, err); diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c index 92a9da1774aa..bbce1c328d85 100644 --- a/fs/ext4/inode-test.c +++ b/fs/ext4/inode-test.c @@ -25,7 +25,7 @@ * For constructing the negative timestamp lower bound value. * binary: 10000000 00000000 00000000 00000000 */ -#define LOWER_MSB_1 (-0x80000000L) +#define LOWER_MSB_1 (-(UPPER_MSB_0) - 1L) /* avoid overflow */ /* * For constructing the negative timestamp upper bound value. * binary: 11111111 11111111 11111111 11111111 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 28f28de0c1b6..629a25d999f0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5692,7 +5692,7 @@ int ext4_expand_extra_isize(struct inode *inode, error = ext4_journal_get_write_access(handle, iloc->bh); if (error) { brelse(iloc->bh); - goto out_stop; + goto out_unlock; } error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, @@ -5702,8 +5702,8 @@ int ext4_expand_extra_isize(struct inode *inode, if (!error) error = rc; +out_unlock: ext4_write_unlock_xattr(inode, &no_expand); -out_stop: ext4_journal_stop(handle); return error; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a856997d87b5..1cb42d940784 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2164,7 +2164,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct buffer_head *bh = NULL; struct ext4_dir_entry_2 *de; struct super_block *sb; +#ifdef CONFIG_UNICODE struct ext4_sb_info *sbi; +#endif struct ext4_filename fname; int retval; int dx_fallback=0; @@ -2176,12 +2178,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; - sbi = EXT4_SB(sb); blocksize = sb->s_blocksize; if (!dentry->d_name.len) return -EINVAL; #ifdef CONFIG_UNICODE + sbi = EXT4_SB(sb); if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) && sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name)) return -EINVAL; @@ -2822,7 +2824,7 @@ bool ext4_empty_dir(struct inode *inode) { unsigned int offset; struct buffer_head *bh; - struct ext4_dir_entry_2 *de, *de1; + struct ext4_dir_entry_2 *de; struct super_block *sb; if (ext4_has_inline_data(inode)) { @@ -2847,19 +2849,25 @@ bool ext4_empty_dir(struct inode *inode) return true; de = (struct ext4_dir_entry_2 *) bh->b_data; - de1 = ext4_next_entry(de, sb->s_blocksize); - if (le32_to_cpu(de->inode) != inode->i_ino || - le32_to_cpu(de1->inode) == 0 || - strcmp(".", de->name) || strcmp("..", de1->name)) { - ext4_warning_inode(inode, "directory missing '.' and/or '..'"); + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, + 0) || + le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) { + ext4_warning_inode(inode, "directory missing '.'"); + brelse(bh); + return true; + } + offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); + de = ext4_next_entry(de, sb->s_blocksize); + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, + offset) || + le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { + ext4_warning_inode(inode, "directory missing '..'"); brelse(bh); return true; } - offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) + - ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); - de = ext4_next_entry(de1, sb->s_blocksize); + offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); while (offset < inode->i_size) { - if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + if (!(offset & (sb->s_blocksize - 1))) { unsigned int lblock; brelse(bh); lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); @@ -2870,12 +2878,11 @@ bool ext4_empty_dir(struct inode *inode) } if (IS_ERR(bh)) return true; - de = (struct ext4_dir_entry_2 *) bh->b_data; } + de = (struct ext4_dir_entry_2 *) (bh->b_data + + (offset & (sb->s_blocksize - 1))); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset)) { - de = (struct ext4_dir_entry_2 *)(bh->b_data + - sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; continue; } @@ -2884,7 +2891,6 @@ bool ext4_empty_dir(struct inode *inode) return false; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); - de = ext4_next_entry(de, sb->s_blocksize); } brelse(bh); return true; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1d82b56d9b11..2937a8873fe1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1900,6 +1900,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } sbi->s_commit_interval = HZ * arg; } else if (token == Opt_debug_want_extra_isize) { + if ((arg & 1) || + (arg < 4) || + (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) { + ext4_msg(sb, KERN_ERR, + "Invalid want_extra_isize %d", arg); + return -1; + } sbi->s_want_extra_isize = arg; } else if (token == Opt_max_batch_time) { sbi->s_max_batch_time = arg; @@ -3554,40 +3561,6 @@ int ext4_calculate_overhead(struct super_block *sb) return 0; } -static void ext4_clamp_want_extra_isize(struct super_block *sb) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - unsigned def_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; - - if (sbi->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE) { - sbi->s_want_extra_isize = 0; - return; - } - if (sbi->s_want_extra_isize < 4) { - sbi->s_want_extra_isize = def_extra_isize; - if (ext4_has_feature_extra_isize(sb)) { - if (sbi->s_want_extra_isize < - le16_to_cpu(es->s_want_extra_isize)) - sbi->s_want_extra_isize = - le16_to_cpu(es->s_want_extra_isize); - if (sbi->s_want_extra_isize < - le16_to_cpu(es->s_min_extra_isize)) - sbi->s_want_extra_isize = - le16_to_cpu(es->s_min_extra_isize); - } - } - /* Check if enough inode space is available */ - if ((sbi->s_want_extra_isize > sbi->s_inode_size) || - (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > - sbi->s_inode_size)) { - sbi->s_want_extra_isize = def_extra_isize; - ext4_msg(sb, KERN_INFO, - "required extra inode space not available"); - } -} - static void ext4_set_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; @@ -3795,6 +3768,68 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { + sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { + ext4_msg(sb, KERN_ERR, "invalid first ino: %u", + sbi->s_first_ino); + goto failed_mount; + } + if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || + (!is_power_of_2(sbi->s_inode_size)) || + (sbi->s_inode_size > blocksize)) { + ext4_msg(sb, KERN_ERR, + "unsupported inode size: %d", + sbi->s_inode_size); + goto failed_mount; + } + /* + * i_atime_extra is the last extra field available for + * [acm]times in struct ext4_inode. Checking for that + * field should suffice to ensure we have extra space + * for all three. + */ + if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + + sizeof(((struct ext4_inode *)0)->i_atime_extra)) { + sb->s_time_gran = 1; + sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; + } else { + sb->s_time_gran = NSEC_PER_SEC; + sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; + } + sb->s_time_min = EXT4_TIMESTAMP_MIN; + } + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + if (ext4_has_feature_extra_isize(sb)) { + unsigned v, max = (sbi->s_inode_size - + EXT4_GOOD_OLD_INODE_SIZE); + + v = le16_to_cpu(es->s_want_extra_isize); + if (v > max) { + ext4_msg(sb, KERN_ERR, + "bad s_want_extra_isize: %d", v); + goto failed_mount; + } + if (sbi->s_want_extra_isize < v) + sbi->s_want_extra_isize = v; + + v = le16_to_cpu(es->s_min_extra_isize); + if (v > max) { + ext4_msg(sb, KERN_ERR, + "bad s_min_extra_isize: %d", v); + goto failed_mount; + } + if (sbi->s_want_extra_isize < v) + sbi->s_want_extra_isize = v; + } + } + if (sbi->s_es->s_mount_opts[0]) { char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, sizeof(sbi->s_es->s_mount_opts), @@ -4033,42 +4068,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) has_huge_files); sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); - if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { - sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; - sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; - } else { - sbi->s_inode_size = le16_to_cpu(es->s_inode_size); - sbi->s_first_ino = le32_to_cpu(es->s_first_ino); - if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { - ext4_msg(sb, KERN_ERR, "invalid first ino: %u", - sbi->s_first_ino); - goto failed_mount; - } - if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || - (!is_power_of_2(sbi->s_inode_size)) || - (sbi->s_inode_size > blocksize)) { - ext4_msg(sb, KERN_ERR, - "unsupported inode size: %d", - sbi->s_inode_size); - goto failed_mount; - } - /* - * i_atime_extra is the last extra field available for [acm]times in - * struct ext4_inode. Checking for that field should suffice to ensure - * we have extra space for all three. - */ - if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + - sizeof(((struct ext4_inode *)0)->i_atime_extra)) { - sb->s_time_gran = 1; - sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; - } else { - sb->s_time_gran = NSEC_PER_SEC; - sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; - } - - sb->s_time_min = EXT4_TIMESTAMP_MIN; - } - sbi->s_desc_size = le16_to_cpu(es->s_desc_size); if (ext4_has_feature_64bit(sb)) { if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || @@ -4517,8 +4516,6 @@ no_journal: } else if (ret) goto failed_mount4a; - ext4_clamp_want_extra_isize(sb); - ext4_set_resv_clusters(sb); err = ext4_setup_system_zone(sb); @@ -5306,8 +5303,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - ext4_clamp_want_extra_isize(sb); - if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "changing journal_checksum " diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d5c2a3158610..a66e425884d1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1498,8 +1498,10 @@ static int __init init_hugetlbfs_fs(void) /* other hstates are optional */ i = 0; for_each_hstate(h) { - if (i == default_hstate_idx) + if (i == default_hstate_idx) { + i++; continue; + } mnt = mount_one_hugetlbfs(h); if (IS_ERR(mnt)) diff --git a/fs/inode.c b/fs/inode.c index fef457a42882..96d62d97694e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -676,6 +676,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) struct inode *inode, *next; LIST_HEAD(dispose); +again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); @@ -698,6 +699,12 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); + if (need_resched()) { + spin_unlock(&sb->s_inode_list_lock); + cond_resched(); + dispose_list(&dispose); + goto again; + } } spin_unlock(&sb->s_inode_list_lock); diff --git a/fs/locks.c b/fs/locks.c index 6970f55daf54..44b6da032842 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2853,7 +2853,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, } if (inode) { /* userspace relies on this representation of dev_t */ - seq_printf(f, "%d %02x:%02x:%ld ", fl_pid, + seq_printf(f, "%d %02x:%02x:%lu ", fl_pid, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); } else { diff --git a/fs/namespace.c b/fs/namespace.c index 2fd0c8bcb8c1..5e1bf611a9eb 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1728,7 +1728,7 @@ static bool is_mnt_ns_file(struct dentry *dentry) dentry->d_fsdata == &mntns_operations; } -struct mnt_namespace *to_mnt_ns(struct ns_common *ns) +static struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { return container_of(ns, struct mnt_namespace, ns); } @@ -3325,8 +3325,8 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name) } EXPORT_SYMBOL(mount_subtree); -int ksys_mount(const char __user *dev_name, const char __user *dir_name, - const char __user *type, unsigned long flags, void __user *data) +SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, + char __user *, type, unsigned long, flags, void __user *, data) { int ret; char *kernel_type; @@ -3359,12 +3359,6 @@ out_type: return ret; } -SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, - char __user *, type, unsigned long, flags, void __user *, data) -{ - return ksys_mount(dev_name, dir_name, type, flags, data); -} - /* * Create a kernel mount representation for a new, prepared superblock * (specified by fs_fd) and attach to an open_tree-like file descriptor. diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 3e77b728a22b..46f225580009 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -57,6 +57,9 @@ static void fsnotify_unmount_inodes(struct super_block *sb) * doing an __iget/iput with SB_ACTIVE clear would actually * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. + * However, we should have been called /after/ evict_inodes + * removed all zero refcount inodes, in any case. Test to + * be sure. */ if (!atomic_read(&inode->i_count)) { spin_unlock(&inode->i_lock); @@ -77,6 +80,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) iput_inode = inode; + cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); diff --git a/fs/nsfs.c b/fs/nsfs.c index a0431642c6b5..f75767bd623a 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -3,6 +3,7 @@ #include <linux/pseudo_fs.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/ktime.h> @@ -11,6 +12,8 @@ #include <linux/nsfs.h> #include <linux/uaccess.h> +#include "internal.h" + static struct vfsmount *nsfs_mnt; static long ns_ioctl(struct file *filp, unsigned int ioctl, diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1c4c51f3df60..cda1027d0819 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3282,6 +3282,7 @@ static void ocfs2_dlm_init_debug(struct ocfs2_super *osb) debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root, &dlm_debug->d_filter_secs); + ocfs2_get_dlm_debug(dlm_debug); } static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 1afe57f425a0..68ba354cf361 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1066,6 +1066,14 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); + if (replayed) { + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); + if (status < 0) + mlog_errno(status); + } + status = ocfs2_journal_toggle_dirty(osb, 1, replayed); if (status < 0) { mlog_errno(status); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index b801c6353100..6220642fe113 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -227,13 +227,17 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) { struct ovl_fh *fh; - int fh_type, fh_len, dwords; - void *buf; + int fh_type, dwords; int buflen = MAX_HANDLE_SZ; uuid_t *uuid = &real->d_sb->s_uuid; + int err; - buf = kmalloc(buflen, GFP_KERNEL); - if (!buf) + /* Make sure the real fid stays 32bit aligned */ + BUILD_BUG_ON(OVL_FH_FID_OFFSET % 4); + BUILD_BUG_ON(MAX_HANDLE_SZ + OVL_FH_FID_OFFSET > 255); + + fh = kzalloc(buflen + OVL_FH_FID_OFFSET, GFP_KERNEL); + if (!fh) return ERR_PTR(-ENOMEM); /* @@ -242,27 +246,19 @@ struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) * the price or reconnecting the dentry. */ dwords = buflen >> 2; - fh_type = exportfs_encode_fh(real, buf, &dwords, 0); + fh_type = exportfs_encode_fh(real, (void *)fh->fb.fid, &dwords, 0); buflen = (dwords << 2); - fh = ERR_PTR(-EIO); + err = -EIO; if (WARN_ON(fh_type < 0) || WARN_ON(buflen > MAX_HANDLE_SZ) || WARN_ON(fh_type == FILEID_INVALID)) - goto out; + goto out_err; - BUILD_BUG_ON(MAX_HANDLE_SZ + offsetof(struct ovl_fh, fid) > 255); - fh_len = offsetof(struct ovl_fh, fid) + buflen; - fh = kmalloc(fh_len, GFP_KERNEL); - if (!fh) { - fh = ERR_PTR(-ENOMEM); - goto out; - } - - fh->version = OVL_FH_VERSION; - fh->magic = OVL_FH_MAGIC; - fh->type = fh_type; - fh->flags = OVL_FH_FLAG_CPU_ENDIAN; + fh->fb.version = OVL_FH_VERSION; + fh->fb.magic = OVL_FH_MAGIC; + fh->fb.type = fh_type; + fh->fb.flags = OVL_FH_FLAG_CPU_ENDIAN; /* * When we will want to decode an overlay dentry from this handle * and all layers are on the same fs, if we get a disconncted real @@ -270,14 +266,15 @@ struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) * it to upperdentry or to lowerstack is by checking this flag. */ if (is_upper) - fh->flags |= OVL_FH_FLAG_PATH_UPPER; - fh->len = fh_len; - fh->uuid = *uuid; - memcpy(fh->fid, buf, buflen); + fh->fb.flags |= OVL_FH_FLAG_PATH_UPPER; + fh->fb.len = sizeof(fh->fb) + buflen; + fh->fb.uuid = *uuid; -out: - kfree(buf); return fh; + +out_err: + kfree(fh); + return ERR_PTR(err); } int ovl_set_origin(struct dentry *dentry, struct dentry *lower, @@ -300,8 +297,8 @@ int ovl_set_origin(struct dentry *dentry, struct dentry *lower, /* * Do not fail when upper doesn't support xattrs. */ - err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh, - fh ? fh->len : 0, 0); + err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh->buf, + fh ? fh->fb.len : 0, 0); kfree(fh); return err; @@ -317,7 +314,7 @@ static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index) if (IS_ERR(fh)) return PTR_ERR(fh); - err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh, fh->len, 0); + err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh->buf, fh->fb.len, 0); kfree(fh); return err; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 702aa63f6774..29abdb1d3b5c 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1170,7 +1170,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, if (newdentry == trap) goto out_dput; - if (WARN_ON(olddentry->d_inode == newdentry->d_inode)) + if (olddentry->d_inode == newdentry->d_inode) goto out_dput; err = 0; diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 73c9775215b3..70e55588aedc 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -211,10 +211,11 @@ static int ovl_check_encode_origin(struct dentry *dentry) return 1; } -static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) +static int ovl_dentry_to_fid(struct dentry *dentry, u32 *fid, int buflen) { struct ovl_fh *fh = NULL; int err, enc_lower; + int len; /* * Check if we should encode a lower or upper file handle and maybe @@ -231,11 +232,12 @@ static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) return PTR_ERR(fh); err = -EOVERFLOW; - if (fh->len > buflen) + len = OVL_FH_LEN(fh); + if (len > buflen) goto fail; - memcpy(buf, (char *)fh, fh->len); - err = fh->len; + memcpy(fid, fh, len); + err = len; out: kfree(fh); @@ -243,31 +245,16 @@ out: fail: pr_warn_ratelimited("overlayfs: failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n", - dentry, err, buflen, fh ? (int)fh->len : 0, - fh ? fh->type : 0); + dentry, err, buflen, fh ? (int)fh->fb.len : 0, + fh ? fh->fb.type : 0); goto out; } -static int ovl_dentry_to_fh(struct dentry *dentry, u32 *fid, int *max_len) -{ - int res, len = *max_len << 2; - - res = ovl_d_to_fh(dentry, (char *)fid, len); - if (res <= 0) - return FILEID_INVALID; - - len = res; - - /* Round up to dwords */ - *max_len = (len + 3) >> 2; - return OVL_FILEID; -} - static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, struct inode *parent) { struct dentry *dentry; - int type; + int bytes = *max_len << 2; /* TODO: encode connectable file handles */ if (parent) @@ -277,10 +264,14 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, if (WARN_ON(!dentry)) return FILEID_INVALID; - type = ovl_dentry_to_fh(dentry, fid, max_len); - + bytes = ovl_dentry_to_fid(dentry, fid, bytes); dput(dentry); - return type; + if (bytes <= 0) + return FILEID_INVALID; + + *max_len = bytes >> 2; + + return OVL_FILEID_V1; } /* @@ -777,24 +768,45 @@ out_err: goto out; } +static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type) +{ + struct ovl_fh *fh; + + /* If on-wire inner fid is aligned - nothing to do */ + if (fh_type == OVL_FILEID_V1) + return (struct ovl_fh *)fid; + + if (fh_type != OVL_FILEID_V0) + return ERR_PTR(-EINVAL); + + fh = kzalloc(buflen, GFP_KERNEL); + if (!fh) + return ERR_PTR(-ENOMEM); + + /* Copy unaligned inner fh into aligned buffer */ + memcpy(&fh->fb, fid, buflen - OVL_FH_WIRE_OFFSET); + return fh; +} + static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct dentry *dentry = NULL; - struct ovl_fh *fh = (struct ovl_fh *) fid; + struct ovl_fh *fh = NULL; int len = fh_len << 2; unsigned int flags = 0; int err; - err = -EINVAL; - if (fh_type != OVL_FILEID) + fh = ovl_fid_to_fh(fid, len, fh_type); + err = PTR_ERR(fh); + if (IS_ERR(fh)) goto out_err; err = ovl_check_fh_len(fh, len); if (err) goto out_err; - flags = fh->flags; + flags = fh->fb.flags; dentry = (flags & OVL_FH_FLAG_PATH_UPPER) ? ovl_upper_fh_to_d(sb, fh) : ovl_lower_fh_to_d(sb, fh); @@ -802,12 +814,18 @@ static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid, if (IS_ERR(dentry) && err != -ESTALE) goto out_err; +out: + /* We may have needed to re-align OVL_FILEID_V0 */ + if (!IS_ERR_OR_NULL(fh) && fh != (void *)fid) + kfree(fh); + return dentry; out_err: pr_warn_ratelimited("overlayfs: failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n", - len, fh_type, flags, err); - return ERR_PTR(err); + fh_len, fh_type, flags, err); + dentry = ERR_PTR(err); + goto out; } static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index bc14781886bf..b045cf1826fc 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -200,8 +200,14 @@ int ovl_getattr(const struct path *path, struct kstat *stat, if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) || (!ovl_verify_lower(dentry->d_sb) && (is_dir || lowerstat.nlink == 1))) { - stat->ino = lowerstat.ino; lower_layer = ovl_layer_lower(dentry); + /* + * Cannot use origin st_dev;st_ino because + * origin inode content may differ from overlay + * inode content. + */ + if (samefs || lower_layer->fsid) + stat->ino = lowerstat.ino; } /* diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index c269d6033525..76ff66339173 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -84,21 +84,21 @@ static int ovl_acceptable(void *ctx, struct dentry *dentry) * Return -ENODATA for "origin unknown". * Return <0 for an invalid file handle. */ -int ovl_check_fh_len(struct ovl_fh *fh, int fh_len) +int ovl_check_fb_len(struct ovl_fb *fb, int fb_len) { - if (fh_len < sizeof(struct ovl_fh) || fh_len < fh->len) + if (fb_len < sizeof(struct ovl_fb) || fb_len < fb->len) return -EINVAL; - if (fh->magic != OVL_FH_MAGIC) + if (fb->magic != OVL_FH_MAGIC) return -EINVAL; /* Treat larger version and unknown flags as "origin unknown" */ - if (fh->version > OVL_FH_VERSION || fh->flags & ~OVL_FH_FLAG_ALL) + if (fb->version > OVL_FH_VERSION || fb->flags & ~OVL_FH_FLAG_ALL) return -ENODATA; /* Treat endianness mismatch as "origin unknown" */ - if (!(fh->flags & OVL_FH_FLAG_ANY_ENDIAN) && - (fh->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) + if (!(fb->flags & OVL_FH_FLAG_ANY_ENDIAN) && + (fb->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) return -ENODATA; return 0; @@ -119,15 +119,15 @@ static struct ovl_fh *ovl_get_fh(struct dentry *dentry, const char *name) if (res == 0) return NULL; - fh = kzalloc(res, GFP_KERNEL); + fh = kzalloc(res + OVL_FH_WIRE_OFFSET, GFP_KERNEL); if (!fh) return ERR_PTR(-ENOMEM); - res = vfs_getxattr(dentry, name, fh, res); + res = vfs_getxattr(dentry, name, fh->buf, res); if (res < 0) goto fail; - err = ovl_check_fh_len(fh, res); + err = ovl_check_fb_len(&fh->fb, res); if (err < 0) { if (err == -ENODATA) goto out; @@ -158,12 +158,12 @@ struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, * Make sure that the stored uuid matches the uuid of the lower * layer where file handle will be decoded. */ - if (!uuid_equal(&fh->uuid, &mnt->mnt_sb->s_uuid)) + if (!uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid)) return NULL; - bytes = (fh->len - offsetof(struct ovl_fh, fid)); - real = exportfs_decode_fh(mnt, (struct fid *)fh->fid, - bytes >> 2, (int)fh->type, + bytes = (fh->fb.len - offsetof(struct ovl_fb, fid)); + real = exportfs_decode_fh(mnt, (struct fid *)fh->fb.fid, + bytes >> 2, (int)fh->fb.type, connected ? ovl_acceptable : NULL, mnt); if (IS_ERR(real)) { /* @@ -173,7 +173,7 @@ struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, * index entries correctly. */ if (real == ERR_PTR(-ESTALE) && - !(fh->flags & OVL_FH_FLAG_PATH_UPPER)) + !(fh->fb.flags & OVL_FH_FLAG_PATH_UPPER)) real = NULL; return real; } @@ -323,6 +323,14 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, int i; for (i = 0; i < ofs->numlower; i++) { + /* + * If lower fs uuid is not unique among lower fs we cannot match + * fh->uuid to layer. + */ + if (ofs->lower_layers[i].fsid && + ofs->lower_layers[i].fs->bad_uuid) + continue; + origin = ovl_decode_real_fh(fh, ofs->lower_layers[i].mnt, connected); if (origin) @@ -400,7 +408,7 @@ static int ovl_verify_fh(struct dentry *dentry, const char *name, if (IS_ERR(ofh)) return PTR_ERR(ofh); - if (fh->len != ofh->len || memcmp(fh, ofh, fh->len)) + if (fh->fb.len != ofh->fb.len || memcmp(&fh->fb, &ofh->fb, fh->fb.len)) err = -ESTALE; kfree(ofh); @@ -431,7 +439,7 @@ int ovl_verify_set_fh(struct dentry *dentry, const char *name, err = ovl_verify_fh(dentry, name, fh); if (set && err == -ENODATA) - err = ovl_do_setxattr(dentry, name, fh, fh->len, 0); + err = ovl_do_setxattr(dentry, name, fh->buf, fh->fb.len, 0); if (err) goto fail; @@ -505,20 +513,20 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) goto fail; err = -EINVAL; - if (index->d_name.len < sizeof(struct ovl_fh)*2) + if (index->d_name.len < sizeof(struct ovl_fb)*2) goto fail; err = -ENOMEM; len = index->d_name.len / 2; - fh = kzalloc(len, GFP_KERNEL); + fh = kzalloc(len + OVL_FH_WIRE_OFFSET, GFP_KERNEL); if (!fh) goto fail; err = -EINVAL; - if (hex2bin((u8 *)fh, index->d_name.name, len)) + if (hex2bin(fh->buf, index->d_name.name, len)) goto fail; - err = ovl_check_fh_len(fh, len); + err = ovl_check_fb_len(&fh->fb, len); if (err) goto fail; @@ -597,11 +605,11 @@ static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name) { char *n, *s; - n = kcalloc(fh->len, 2, GFP_KERNEL); + n = kcalloc(fh->fb.len, 2, GFP_KERNEL); if (!n) return -ENOMEM; - s = bin2hex(n, fh, fh->len); + s = bin2hex(n, fh->buf, fh->fb.len); *name = (struct qstr) QSTR_INIT(n, s - n); return 0; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6934bcf030f0..f283b1d69a9e 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -71,20 +71,36 @@ enum ovl_entry_flag { #error Endianness not defined #endif -/* The type returned by overlay exportfs ops when encoding an ovl_fh handle */ -#define OVL_FILEID 0xfb +/* The type used to be returned by overlay exportfs for misaligned fid */ +#define OVL_FILEID_V0 0xfb +/* The type returned by overlay exportfs for 32bit aligned fid */ +#define OVL_FILEID_V1 0xf8 -/* On-disk and in-memeory format for redirect by file handle */ -struct ovl_fh { +/* On-disk format for "origin" file handle */ +struct ovl_fb { u8 version; /* 0 */ u8 magic; /* 0xfb */ u8 len; /* size of this header + size of fid */ u8 flags; /* OVL_FH_FLAG_* */ u8 type; /* fid_type of fid */ uuid_t uuid; /* uuid of filesystem */ - u8 fid[0]; /* file identifier */ + u32 fid[0]; /* file identifier should be 32bit aligned in-memory */ } __packed; +/* In-memory and on-wire format for overlay file handle */ +struct ovl_fh { + u8 padding[3]; /* make sure fb.fid is 32bit aligned */ + union { + struct ovl_fb fb; + u8 buf[0]; + }; +} __packed; + +#define OVL_FH_WIRE_OFFSET offsetof(struct ovl_fh, fb) +#define OVL_FH_LEN(fh) (OVL_FH_WIRE_OFFSET + (fh)->fb.len) +#define OVL_FH_FID_OFFSET (OVL_FH_WIRE_OFFSET + \ + offsetof(struct ovl_fb, fid)) + static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) { int err = vfs_rmdir(dir, dentry); @@ -302,7 +318,13 @@ static inline void ovl_inode_unlock(struct inode *inode) /* namei.c */ -int ovl_check_fh_len(struct ovl_fh *fh, int fh_len); +int ovl_check_fb_len(struct ovl_fb *fb, int fb_len); + +static inline int ovl_check_fh_len(struct ovl_fh *fh, int fh_len) +{ + return ovl_check_fb_len(&fh->fb, fh_len - OVL_FH_WIRE_OFFSET); +} + struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, bool connected); int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index a8279280e88d..28348c44ea5b 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -22,6 +22,8 @@ struct ovl_config { struct ovl_sb { struct super_block *sb; dev_t pseudo_dev; + /* Unusable (conflicting) uuid */ + bool bad_uuid; }; struct ovl_layer { diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index afbcb116a7f1..7621ff176d15 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1255,7 +1255,7 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) { unsigned int i; - if (!ofs->config.nfs_export && !(ofs->config.index && ofs->upper_mnt)) + if (!ofs->config.nfs_export && !ofs->upper_mnt) return true; for (i = 0; i < ofs->numlowerfs; i++) { @@ -1263,9 +1263,13 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) * We use uuid to associate an overlay lower file handle with a * lower layer, so we can accept lower fs with null uuid as long * as all lower layers with null uuid are on the same fs. + * if we detect multiple lower fs with the same uuid, we + * disable lower file handle decoding on all of them. */ - if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid)) + if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid)) { + ofs->lower_fs[i].bad_uuid = true; return false; + } } return true; } @@ -1277,6 +1281,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) unsigned int i; dev_t dev; int err; + bool bad_uuid = false; /* fsid 0 is reserved for upper fs even with non upper overlay */ if (ofs->upper_mnt && ofs->upper_mnt->mnt_sb == sb) @@ -1288,11 +1293,15 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) } if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) { - ofs->config.index = false; - ofs->config.nfs_export = false; - pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", - uuid_is_null(&sb->s_uuid) ? "null" : "conflicting", - path->dentry); + bad_uuid = true; + if (ofs->config.index || ofs->config.nfs_export) { + ofs->config.index = false; + ofs->config.nfs_export = false; + pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", + uuid_is_null(&sb->s_uuid) ? "null" : + "conflicting", + path->dentry); + } } err = get_anon_bdev(&dev); @@ -1303,6 +1312,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) ofs->lower_fs[ofs->numlowerfs].sb = sb; ofs->lower_fs[ofs->numlowerfs].pseudo_dev = dev; + ofs->lower_fs[ofs->numlowerfs].bad_uuid = bad_uuid; ofs->numlowerfs++; return ofs->numlowerfs; diff --git a/fs/pipe.c b/fs/pipe.c index 04d004ee2e8c..57502c3c0fba 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -581,7 +581,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) } wait_event_interruptible(pipe->wait, pipe_writable(pipe)); __pipe_lock(pipe); - was_empty = pipe_empty(head, pipe->tail); + was_empty = pipe_empty(pipe->head, pipe->tail); } out: __pipe_unlock(pipe); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 84ad1c90d535..249672bf54fe 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -631,12 +631,15 @@ EXPORT_SYMBOL_GPL(posix_acl_create); /** * posix_acl_update_mode - update mode in set_acl + * @inode: target inode + * @mode_p: mode (pointer) for update + * @acl: acl pointer * * Update the file mode when setting an ACL: compute the new file permission * bits based on the ACL. In addition, if the ACL is equivalent to the new - * file mode, set *acl to NULL to indicate that no ACL should be set. + * file mode, set *@acl to NULL to indicate that no ACL should be set. * - * As with chmod, clear the setgit bit if the caller is not in the owning group + * As with chmod, clear the setgid bit if the caller is not in the owning group * or capable of CAP_FSETID (see inode_change_ok). * * Called from set_acl inode operations. diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 37bdbec5b402..fd931d3e77be 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -134,7 +134,7 @@ static int show_stat(struct seq_file *p, void *v) softirq += cpustat[CPUTIME_SOFTIRQ]; steal += cpustat[CPUTIME_STEAL]; guest += cpustat[CPUTIME_GUEST]; - guest_nice += cpustat[CPUTIME_USER]; + guest_nice += cpustat[CPUTIME_GUEST_NICE]; sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); @@ -175,7 +175,7 @@ static int show_stat(struct seq_file *p, void *v) softirq = cpustat[CPUTIME_SOFTIRQ]; steal = cpustat[CPUTIME_STEAL]; guest = cpustat[CPUTIME_GUEST]; - guest_nice = cpustat[CPUTIME_USER]; + guest_nice = cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 8caff834f002..013486b5125e 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -407,6 +407,17 @@ static int notrace ramoops_pstore_write(struct pstore_record *record) prz = cxt->dprzs[cxt->dump_write_cnt]; + /* + * Since this is a new crash dump, we need to reset the buffer in + * case it still has an old dump present. Without this, the new dump + * will get appended, which would seriously confuse anything trying + * to check dump file contents. Specifically, ramoops_read_kmsg_hdr() + * expects to find a dump header in the beginning of buffer data, so + * we must to reset the buffer values, in order to ensure that the + * header will be written to the beginning of the buffer. + */ + persistent_ram_zap(prz); + /* Build header and append record contents. */ hlen = ramoops_write_kmsg_hdr(prz, record); if (!hlen) @@ -572,6 +583,7 @@ static int ramoops_init_przs(const char *name, prz_ar[i] = persistent_ram_new(*paddr, zone_sz, sig, &cxt->ecc_info, cxt->memtype, flags, label); + kfree(label); if (IS_ERR(prz_ar[i])) { err = PTR_ERR(prz_ar[i]); dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n", @@ -617,6 +629,7 @@ static int ramoops_init_prz(const char *name, label = kasprintf(GFP_KERNEL, "ramoops:%s", name); *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype, PRZ_FLAG_ZAP_OLD, label); + kfree(label); if (IS_ERR(*prz)) { int err = PTR_ERR(*prz); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 8823f65888f0..1f4d8c06f9be 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -574,7 +574,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, /* Initialize general buffer state. */ raw_spin_lock_init(&prz->buffer_lock); prz->flags = flags; - prz->label = label; + prz->label = kstrdup(label, GFP_KERNEL); ret = persistent_ram_buffer_map(start, size, prz, memtype); if (ret) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index b0688c02dc90..b6a4f692d345 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -984,6 +984,7 @@ static int add_dquot_ref(struct super_block *sb, int type) * later. */ old_inode = inode; + cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); diff --git a/fs/super.c b/fs/super.c index cfadab2cbf35..cd352530eca9 100644 --- a/fs/super.c +++ b/fs/super.c @@ -448,10 +448,12 @@ void generic_shutdown_super(struct super_block *sb) sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; - fsnotify_sb_delete(sb); cgroup_writeback_umount(); + /* evict all inodes with zero refcount */ evict_inodes(sb); + /* only nonzero refcount inodes can have marks */ + fsnotify_sb_delete(sb); if (sb->s_dio_done_wq) { destroy_workqueue(sb->s_dio_done_wq); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index c284e10af491..fc93fd88ec89 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2248,24 +2248,32 @@ xfs_alloc_longest_free_extent( return pag->pagf_flcount > 0 || pag->pagf_longest > 0; } +/* + * Compute the minimum length of the AGFL in the given AG. If @pag is NULL, + * return the largest possible minimum length. + */ unsigned int xfs_alloc_min_freelist( struct xfs_mount *mp, struct xfs_perag *pag) { + /* AG btrees have at least 1 level. */ + static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1}; + const uint8_t *levels = pag ? pag->pagf_levels : fake_levels; unsigned int min_free; + ASSERT(mp->m_ag_maxlevels > 0); + /* space needed by-bno freespace btree */ - min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1, + min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, mp->m_ag_maxlevels); /* space needed by-size freespace btree */ - min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1, + min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, mp->m_ag_maxlevels); /* space needed reverse mapping used space btree */ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - min_free += min_t(unsigned int, - pag->pagf_levels[XFS_BTNUM_RMAPi] + 1, - mp->m_rmap_maxlevels); + min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, + mp->m_rmap_maxlevels); return min_free; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index a9ad1f991ba3..4c2e046fbfad 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4561,7 +4561,7 @@ xfs_bmapi_convert_delalloc( struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmalloca bma = { NULL }; - u16 flags = 0; + uint16_t flags = 0; struct xfs_trans *tp; int error; @@ -5972,8 +5972,7 @@ xfs_bmap_insert_extents( goto del_cursor; } - if (XFS_IS_CORRUPT(mp, - stop_fsb >= got.br_startoff + got.br_blockcount)) { + if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) { error = -EFSCORRUPTED; goto del_cursor; } diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 0aa87cbde49e..dd6fcaaea318 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -724,3 +724,24 @@ xfs_dir2_namecheck( /* There shouldn't be any slashes or nulls here */ return !memchr(name, '/', length) && !memchr(name, 0, length); } + +xfs_dahash_t +xfs_dir2_hashname( + struct xfs_mount *mp, + struct xfs_name *name) +{ + if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb))) + return xfs_ascii_ci_hashname(name); + return xfs_da_hashname(name->name, name->len); +} + +enum xfs_dacmp +xfs_dir2_compname( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb))) + return xfs_ascii_ci_compname(args, name, len); + return xfs_da_compname(args, name, len); +} diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index c031c53d0f0d..01ee0b926572 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -175,6 +175,12 @@ extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip); +int xfs_dir2_sf_entsize(struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, int len); +void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino); +void xfs_dir2_sf_put_ftype(struct xfs_mount *mp, + struct xfs_dir2_sf_entry *sfep, uint8_t ftype); /* xfs_dir2_readdir.c */ extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp, @@ -194,25 +200,8 @@ xfs_dir2_data_entsize( return round_up(len, XFS_DIR2_DATA_ALIGN); } -static inline xfs_dahash_t -xfs_dir2_hashname( - struct xfs_mount *mp, - struct xfs_name *name) -{ - if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb))) - return xfs_ascii_ci_hashname(name); - return xfs_da_hashname(name->name, name->len); -} - -static inline enum xfs_dacmp -xfs_dir2_compname( - struct xfs_da_args *args, - const unsigned char *name, - int len) -{ - if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb))) - return xfs_ascii_ci_compname(args, name, len); - return xfs_da_compname(args, name, len); -} +xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, struct xfs_name *name); +enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args, + const unsigned char *name, int len); #endif /* __XFS_DIR2_PRIV_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 8b94d33d232f..7b7f6fb2ea3b 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -37,7 +37,7 @@ static void xfs_dir2_sf_check(xfs_da_args_t *args); static void xfs_dir2_sf_toino4(xfs_da_args_t *args); static void xfs_dir2_sf_toino8(xfs_da_args_t *args); -static int +int xfs_dir2_sf_entsize( struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, @@ -84,7 +84,7 @@ xfs_dir2_sf_get_ino( return get_unaligned_be64(from) & XFS_MAXINUMBER; } -static void +void xfs_dir2_sf_put_ino( struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, @@ -145,7 +145,7 @@ xfs_dir2_sf_get_ftype( return XFS_DIR3_FT_UNKNOWN; } -static void +void xfs_dir2_sf_put_ftype( struct xfs_mount *mp, struct xfs_dir2_sf_entry *sfep, diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 988cde7744e6..5b759af4d165 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2909,3 +2909,67 @@ xfs_ialloc_setup_geometry( else igeo->ialloc_align = 0; } + +/* Compute the location of the root directory inode that is laid out by mkfs. */ +xfs_ino_t +xfs_ialloc_calc_rootino( + struct xfs_mount *mp, + int sunit) +{ + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agblock_t first_bno; + + /* + * Pre-calculate the geometry of AG 0. We know what it looks like + * because libxfs knows how to create allocation groups now. + * + * first_bno is the first block in which mkfs could possibly have + * allocated the root directory inode, once we factor in the metadata + * that mkfs formats before it. Namely, the four AG headers... + */ + first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize); + + /* ...the two free space btree roots... */ + first_bno += 2; + + /* ...the inode btree root... */ + first_bno += 1; + + /* ...the initial AGFL... */ + first_bno += xfs_alloc_min_freelist(mp, NULL); + + /* ...the free inode btree root... */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + first_bno++; + + /* ...the reverse mapping btree root... */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + first_bno++; + + /* ...the reference count btree... */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) + first_bno++; + + /* + * ...and the log, if it is allocated in the first allocation group. + * + * This can happen with filesystems that only have a single + * allocation group, or very odd geometries created by old mkfs + * versions on very small filesystems. + */ + if (mp->m_sb.sb_logstart && + XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0) + first_bno += mp->m_sb.sb_logblocks; + + /* + * Now round first_bno up to whatever allocation alignment is given + * by the filesystem or was passed in. + */ + if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0) + first_bno = roundup(first_bno, sunit); + else if (xfs_sb_version_hasalign(&mp->m_sb) && + mp->m_sb.sb_inoalignmt > 1) + first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt); + + return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno)); +} diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 323592d563d5..72b3468b97b1 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -152,5 +152,6 @@ int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask, int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); void xfs_ialloc_setup_geometry(struct xfs_mount *mp); +xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index c55cd9a3dec9..7a9c04920505 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -197,6 +197,24 @@ xfs_calc_inode_chunk_res( } /* + * Per-extent log reservation for the btree changes involved in freeing or + * allocating a realtime extent. We have to be able to log as many rtbitmap + * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents, + * as well as the realtime summary block. + */ +static unsigned int +xfs_rtalloc_log_count( + struct xfs_mount *mp, + unsigned int num_ops) +{ + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + unsigned int rtbmp_bytes; + + rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY; + return (howmany(rtbmp_bytes, blksz) + 1) * num_ops; +} + +/* * Various log reservation values. * * These are based on the size of the file system block because that is what @@ -218,13 +236,21 @@ xfs_calc_inode_chunk_res( /* * In a write transaction we can allocate a maximum of 2 - * extents. This gives: + * extents. This gives (t1): * the inode getting the new extents: inode size * the inode's bmap btree: max depth * block size * the agfs of the ags from which the extents are allocated: 2 * sector * the superblock free block counter: sector size * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - * And the bmap_finish transaction can free bmap blocks in a join: + * Or, if we're writing to a realtime file (t2): + * the inode getting the new extents: inode size + * the inode's bmap btree: max depth * block size + * the agfs of the ags from which the extents are allocated: 2 * sector + * the superblock free block counter: sector size + * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 1 block + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + * And the bmap_finish transaction can free bmap blocks in a join (t3): * the agfs of the ags containing the blocks: 2 * sector size * the agfls of the ags containing the blocks: 2 * sector size * the super block free block counter: sector size @@ -234,40 +260,72 @@ STATIC uint xfs_calc_write_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 1) + + unsigned int t1, t2, t3; + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + t1 = xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + + if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + t2 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), - XFS_FSB_TO_B(mp, 1)) + + blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), - XFS_FSB_TO_B(mp, 1)))); + xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz); + } else { + t2 = 0; + } + + t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); } /* - * In truncating a file we free up to two extents at once. We can modify: + * In truncating a file we free up to two extents at once. We can modify (t1): * the inode being truncated: inode size * the inode's bmap btree: (max depth + 1) * block size - * And the bmap_finish transaction can free the blocks and bmap blocks: + * And the bmap_finish transaction can free the blocks and bmap blocks (t2): * the agf for each of the ags: 4 * sector size * the agfl for each of the ags: 4 * sector size * the super block to reflect the freed blocks: sector size * worst case split in allocation btrees per extent assuming 4 extents: * 4 exts * 2 trees * (2 * max depth - 1) * block size + * Or, if it's a realtime file (t3): + * the agf for each of the ags: 2 * sector size + * the agfl for each of the ags: 2 * sector size + * the super block to reflect the freed blocks: sector size + * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 2 exts * 1 block + * worst case split in allocation btrees per extent assuming 2 extents: + * 2 exts * 2 trees * (2 * max depth - 1) * block size */ STATIC uint xfs_calc_itruncate_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), - XFS_FSB_TO_B(mp, 1)))); + unsigned int t1, t2, t3; + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + t1 = xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); + + t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz); + + if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + } else { + t3 = 0; + } + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); } /* diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2efd78a9719e..e62fb5216341 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -992,6 +992,7 @@ xfs_prepare_shift( struct xfs_inode *ip, loff_t offset) { + struct xfs_mount *mp = ip->i_mount; int error; /* @@ -1005,6 +1006,17 @@ xfs_prepare_shift( } /* + * Shift operations must stabilize the start block offset boundary along + * with the full range of the operation. If we don't, a COW writeback + * completion could race with an insert, front merge with the start + * extent (after split) during the shift and corrupt the file. Start + * with the block just prior to the start to stabilize the boundary. + */ + offset = round_down(offset, 1 << mp->m_sb.sb_blocklog); + if (offset) + offset -= (1 << mp->m_sb.sb_blocklog); + + /* * Writeback and invalidate cache for the remainder of the file as we're * about to shift down every extent from offset to EOF. */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 3458a1264a3f..3984779e5911 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -956,7 +956,7 @@ xfs_buf_item_relse( struct xfs_buf_log_item *bip = bp->b_log_item; trace_xfs_buf_item_relse(bp, _RET_IP_); - ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); bp->b_log_item = NULL; if (list_empty(&bp->b_li_list)) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index fca65109cf24..56efe140c923 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -31,7 +31,7 @@ #include "xfs_reflink.h" #include "xfs_extent_busy.h" #include "xfs_health.h" - +#include "xfs_trace.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -360,66 +360,119 @@ release_buf: } /* - * Update alignment values based on mount options and sb values + * If the sunit/swidth change would move the precomputed root inode value, we + * must reject the ondisk change because repair will stumble over that. + * However, we allow the mount to proceed because we never rejected this + * combination before. Returns true to update the sb, false otherwise. + */ +static inline int +xfs_check_new_dalign( + struct xfs_mount *mp, + int new_dalign, + bool *update_sb) +{ + struct xfs_sb *sbp = &mp->m_sb; + xfs_ino_t calc_ino; + + calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign); + trace_xfs_check_new_dalign(mp, new_dalign, calc_ino); + + if (sbp->sb_rootino == calc_ino) { + *update_sb = true; + return 0; + } + + xfs_warn(mp, +"Cannot change stripe alignment; would require moving root inode."); + + /* + * XXX: Next time we add a new incompat feature, this should start + * returning -EINVAL to fail the mount. Until then, spit out a warning + * that we're ignoring the administrator's instructions. + */ + xfs_warn(mp, "Skipping superblock stripe alignment update."); + *update_sb = false; + return 0; +} + +/* + * If we were provided with new sunit/swidth values as mount options, make sure + * that they pass basic alignment and superblock feature checks, and convert + * them into the same units (FSB) that everything else expects. This step + * /must/ be done before computing the inode geometry. */ STATIC int -xfs_update_alignment(xfs_mount_t *mp) +xfs_validate_new_dalign( + struct xfs_mount *mp) { - xfs_sb_t *sbp = &(mp->m_sb); + if (mp->m_dalign == 0) + return 0; - if (mp->m_dalign) { + /* + * If stripe unit and stripe width are not multiples + * of the fs blocksize turn off alignment. + */ + if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || + (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. blocksize(%d)", + mp->m_sb.sb_blocksize); + return -EINVAL; + } else { /* - * If stripe unit and stripe width are not multiples - * of the fs blocksize turn off alignment. + * Convert the stripe unit and width to FSBs. */ - if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || - (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); + if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { xfs_warn(mp, - "alignment check failed: sunit/swidth vs. blocksize(%d)", - sbp->sb_blocksize); + "alignment check failed: sunit/swidth vs. agsize(%d)", + mp->m_sb.sb_agblocks); return -EINVAL; - } else { - /* - * Convert the stripe unit and width to FSBs. - */ - mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); - if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { - xfs_warn(mp, - "alignment check failed: sunit/swidth vs. agsize(%d)", - sbp->sb_agblocks); - return -EINVAL; - } else if (mp->m_dalign) { - mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); - } else { - xfs_warn(mp, - "alignment check failed: sunit(%d) less than bsize(%d)", - mp->m_dalign, sbp->sb_blocksize); - return -EINVAL; - } - } - - /* - * Update superblock with new values - * and log changes - */ - if (xfs_sb_version_hasdalign(sbp)) { - if (sbp->sb_unit != mp->m_dalign) { - sbp->sb_unit = mp->m_dalign; - mp->m_update_sb = true; - } - if (sbp->sb_width != mp->m_swidth) { - sbp->sb_width = mp->m_swidth; - mp->m_update_sb = true; - } + } else if (mp->m_dalign) { + mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); } else { xfs_warn(mp, - "cannot change alignment: superblock does not support data alignment"); + "alignment check failed: sunit(%d) less than bsize(%d)", + mp->m_dalign, mp->m_sb.sb_blocksize); return -EINVAL; } + } + + if (!xfs_sb_version_hasdalign(&mp->m_sb)) { + xfs_warn(mp, +"cannot change alignment: superblock does not support data alignment"); + return -EINVAL; + } + + return 0; +} + +/* Update alignment values based on mount options and sb values. */ +STATIC int +xfs_update_alignment( + struct xfs_mount *mp) +{ + struct xfs_sb *sbp = &mp->m_sb; + + if (mp->m_dalign) { + bool update_sb; + int error; + + if (sbp->sb_unit == mp->m_dalign && + sbp->sb_width == mp->m_swidth) + return 0; + + error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb); + if (error || !update_sb) + return error; + + sbp->sb_unit = mp->m_dalign; + sbp->sb_width = mp->m_swidth; + mp->m_update_sb = true; } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && xfs_sb_version_hasdalign(&mp->m_sb)) { - mp->m_dalign = sbp->sb_unit; - mp->m_swidth = sbp->sb_width; + mp->m_dalign = sbp->sb_unit; + mp->m_swidth = sbp->sb_width; } return 0; @@ -648,12 +701,12 @@ xfs_mountfs( } /* - * Check if sb_agblocks is aligned at stripe boundary - * If sb_agblocks is NOT aligned turn off m_dalign since - * allocator alignment is within an ag, therefore ag has - * to be aligned at stripe boundary. + * If we were given new sunit/swidth options, do some basic validation + * checks and convert the incore dalign and swidth values to the + * same units (FSB) that everything else uses. This /must/ happen + * before computing the inode geometry. */ - error = xfs_update_alignment(mp); + error = xfs_validate_new_dalign(mp); if (error) goto out; @@ -664,6 +717,17 @@ xfs_mountfs( xfs_rmapbt_compute_maxlevels(mp); xfs_refcountbt_compute_maxlevels(mp); + /* + * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks + * is NOT aligned turn off m_dalign since allocator alignment is within + * an ag, therefore ag has to be aligned at stripe boundary. Note that + * we must compute the free space and rmap btree geometry before doing + * this. + */ + error = xfs_update_alignment(mp); + if (error) + goto out; + /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index c13bb3655e48..a86be7f807ee 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3573,6 +3573,27 @@ DEFINE_KMEM_EVENT(kmem_alloc_large); DEFINE_KMEM_EVENT(kmem_realloc); DEFINE_KMEM_EVENT(kmem_zone_alloc); +TRACE_EVENT(xfs_check_new_dalign, + TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), + TP_ARGS(mp, new_dalign, calc_rootino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, new_dalign) + __field(xfs_ino_t, sb_rootino) + __field(xfs_ino_t, calc_rootino) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->new_dalign = new_dalign; + __entry->sb_rootino = mp->m_sb.sb_rootino; + __entry->calc_rootino = calc_rootino; + ), + TP_printk("dev %d:%d new_dalign %d sb_rootino %llu calc_rootino %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->new_dalign, __entry->sb_rootino, + __entry->calc_rootino) +) + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH |