diff options
author | Linus Torvalds | 2020-08-15 08:26:55 -0700 |
---|---|---|
committer | Linus Torvalds | 2020-08-15 08:26:55 -0700 |
commit | 37711e5e2325535bf094bdc0a66790d659b52d5b (patch) | |
tree | 4014ab0a29e4ba9d03491fea7a06df03056d9cc2 /fs/nfs | |
parent | 6ffdcde4ee9a20beda096dec664da89002610d7d (diff) | |
parent | 563c53e73b8b6ec842828736f77e633f7b0911e9 (diff) |
Merge tag 'nfs-for-5.9-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust:
"Stable fixes:
- pNFS: Don't return layout segments that are being used for I/O
- pNFS: Don't move layout segments off the active list when being used for I/O
Features:
- NFS: Add support for user xattrs through the NFSv4.2 protocol
- NFS: Allow applications to speed up readdir+statx() using AT_STATX_DONT_SYNC
- NFSv4.0: Allow nconnect for v4.0
Bugfixes and cleanups:
- nfs: ensure correct writeback errors are returned on close()
- nfs: nfs_file_write() should check for writeback errors
- nfs: Fix getxattr kernel panic and memory overflow
- NFS: Fix the pNFS/flexfiles mirrored read failover code
- SUNRPC: don't update timeout value on connection reset
- freezer: Add unsafe versions of freezable_schedule_timeout_interruptible for NFS
- sunrpc: destroy rpc_inode_cachep after unregister_filesystem"
* tag 'nfs-for-5.9-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (32 commits)
NFS: Fix flexfiles read failover
fs: nfs: delete repeated words in comments
rpc_pipefs: convert comma to semicolon
nfs: Fix getxattr kernel panic and memory overflow
NFS: Don't return layout segments that are in use
NFS: Don't move layouts to plh_return_segs list while in use
NFS: Add layout segment info to pnfs read/write/commit tracepoints
NFS: Add tracepoints for layouterror and layoutstats.
NFS: Report the stateid + status in trace_nfs4_layoutreturn_on_close()
SUNRPC dont update timeout value on connection reset
nfs: nfs_file_write() should check for writeback errors
nfs: ensure correct writeback errors are returned on close()
NFSv4.2: xattr cache: get rid of cache discard work queue
NFS: remove redundant initialization of variable result
NFSv4.0 allow nconnect for v4.0
freezer: Add unsafe versions of freezable_schedule_timeout_interruptible for NFS
sunrpc: destroy rpc_inode_cachep after unregister_filesystem
NFSv4.2: add client side xattr caching.
NFSv4.2: hook in the user extended attribute handlers
NFSv4.2: add the extended attribute proc functions.
...
Diffstat (limited to 'fs/nfs')
-rw-r--r-- | fs/nfs/Makefile | 2 | ||||
-rw-r--r-- | fs/nfs/blocklayout/rpc_pipefs.c | 2 | ||||
-rw-r--r-- | fs/nfs/client.c | 22 | ||||
-rw-r--r-- | fs/nfs/dir.c | 24 | ||||
-rw-r--r-- | fs/nfs/direct.c | 2 | ||||
-rw-r--r-- | fs/nfs/file.c | 17 | ||||
-rw-r--r-- | fs/nfs/flexfilelayout/flexfilelayout.c | 64 | ||||
-rw-r--r-- | fs/nfs/fs_context.c | 2 | ||||
-rw-r--r-- | fs/nfs/inode.c | 20 | ||||
-rw-r--r-- | fs/nfs/nfs42.h | 24 | ||||
-rw-r--r-- | fs/nfs/nfs42proc.c | 258 | ||||
-rw-r--r-- | fs/nfs/nfs42xattr.c | 1056 | ||||
-rw-r--r-- | fs/nfs/nfs42xdr.c | 438 | ||||
-rw-r--r-- | fs/nfs/nfs4_fs.h | 35 | ||||
-rw-r--r-- | fs/nfs/nfs4client.c | 33 | ||||
-rw-r--r-- | fs/nfs/nfs4file.c | 5 | ||||
-rw-r--r-- | fs/nfs/nfs4proc.c | 239 | ||||
-rw-r--r-- | fs/nfs/nfs4super.c | 10 | ||||
-rw-r--r-- | fs/nfs/nfs4trace.h | 46 | ||||
-rw-r--r-- | fs/nfs/nfs4xdr.c | 39 | ||||
-rw-r--r-- | fs/nfs/nfstrace.h | 3 | ||||
-rw-r--r-- | fs/nfs/pnfs.c | 52 | ||||
-rw-r--r-- | fs/nfs/pnfs.h | 2 |
23 files changed, 2274 insertions, 121 deletions
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 2433c3e03cfa..22d11fdc6deb 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -30,7 +30,7 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o -nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o +nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o nfs42xattr.o obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index 9fb067a6f7e0..ef9db135c649 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -79,7 +79,7 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, goto out_free_data; bl_msg = msg->data; - bl_msg->type = BL_DEVICE_MOUNT, + bl_msg->type = BL_DEVICE_MOUNT; bl_msg->totallen = b->simple.len; nfs4_encode_simple(msg->data + sizeof(*bl_msg), b); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index f1ff3076e4a4..4b8cc93913f7 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -50,6 +50,7 @@ #include "nfs.h" #include "netns.h" #include "sysfs.h" +#include "nfs42.h" #define NFSDBG_FACILITY NFSDBG_CLIENT @@ -749,7 +750,7 @@ error: static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) { - unsigned long max_rpc_payload; + unsigned long max_rpc_payload, raw_max_rpc_payload; /* Work out a lot of parameters */ if (server->rsize == 0) @@ -762,7 +763,9 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax) server->wsize = nfs_block_size(fsinfo->wtmax, NULL); - max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL); + raw_max_rpc_payload = rpc_max_payload(server->client); + max_rpc_payload = nfs_block_size(raw_max_rpc_payload, NULL); + if (server->rsize > max_rpc_payload) 
server->rsize = max_rpc_payload; if (server->rsize > NFS_MAX_FILE_IO_SIZE) @@ -795,6 +798,21 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->clone_blksize = fsinfo->clone_blksize; /* We're airborne Set socket buffersize */ rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); + +#ifdef CONFIG_NFS_V4_2 + /* + * Defaults until limited by the session parameters. + */ + server->gxasize = min_t(unsigned int, raw_max_rpc_payload, + XATTR_SIZE_MAX); + server->sxasize = min_t(unsigned int, raw_max_rpc_payload, + XATTR_SIZE_MAX); + server->lxasize = min_t(unsigned int, raw_max_rpc_payload, + nfs42_listxattr_xdrsize(XATTR_LIST_MAX)); + + if (fsinfo->xattr_support) + server->caps |= NFS_CAP_XATTR; +#endif } /* diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5a331da5f55a..a12f42e7d8c7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2460,7 +2460,7 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, co return NULL; } -static int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res, bool may_block) +static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res, bool may_block) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_access_entry *cache; @@ -2533,6 +2533,20 @@ out: return err; } +int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct +nfs_access_entry *res, bool may_block) +{ + int status; + + status = nfs_access_get_cached_rcu(inode, cred, res); + if (status != 0) + status = nfs_access_get_cached_locked(inode, cred, res, + may_block); + + return status; +} +EXPORT_SYMBOL_GPL(nfs_access_get_cached); + static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set) { struct nfs_inode *nfsi = NFS_I(inode); @@ -2647,9 +2661,7 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) trace_nfs_access_enter(inode); - status = 
nfs_access_get_cached_rcu(inode, cred, &cache); - if (status != 0) - status = nfs_access_get_cached(inode, cred, &cache, may_block); + status = nfs_access_get_cached(inode, cred, &cache, may_block); if (status == 0) goto out_cached; @@ -2661,6 +2673,10 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) * Determine which access bits we want to ask for... */ cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND; + if (nfs_server_capable(inode, NFS_CAP_XATTR)) { + cache.mask |= NFS_ACCESS_XAREAD | NFS_ACCESS_XAWRITE | + NFS_ACCESS_XALIST; + } if (S_ISDIR(inode->i_mode)) cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP; else diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 1b79dd5cf661..2d30a4da49fa 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -896,7 +896,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, */ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) { - ssize_t result = -EINVAL, requested; + ssize_t result, requested; size_t count; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f96367a2463e..63940a7a70be 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -140,6 +140,7 @@ static int nfs_file_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); + errseq_t since; dprintk("NFS: flush(%pD2)\n", file); @@ -148,7 +149,9 @@ nfs_file_flush(struct file *file, fl_owner_t id) return 0; /* Flush writes to the server and return any errors */ - return nfs_wb_all(inode); + since = filemap_sample_wb_err(file->f_mapping); + nfs_wb_all(inode); + return filemap_check_wb_err(file->f_mapping, since); } ssize_t @@ -587,12 +590,14 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .page_mkwrite = nfs_vm_page_mkwrite, }; -static int nfs_need_check_write(struct file *filp, struct inode *inode) +static int nfs_need_check_write(struct file *filp, 
struct inode *inode, + int error) { struct nfs_open_context *ctx; ctx = nfs_file_open_context(filp); - if (nfs_ctx_key_to_expire(ctx, inode)) + if (nfs_error_is_fatal_on_server(error) || + nfs_ctx_key_to_expire(ctx, inode)) return 1; return 0; } @@ -603,6 +608,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(file); unsigned long written = 0; ssize_t result; + errseq_t since; + int error; result = nfs_key_timeout_notify(file, inode); if (result) @@ -627,6 +634,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_pos > i_size_read(inode)) nfs_revalidate_mapping(inode, file->f_mapping); + since = filemap_sample_wb_err(file->f_mapping); nfs_start_io_write(inode); result = generic_write_checks(iocb, from); if (result > 0) { @@ -645,7 +653,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) goto out; /* Return error values */ - if (nfs_need_check_write(file, inode)) { + error = filemap_check_wb_err(file->f_mapping, since); + if (nfs_need_check_write(file, inode, error)) { int err = nfs_wb_all(inode); if (err < 0) result = err; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index de03e440b7ee..965145592750 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -790,6 +790,19 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx); } +static struct nfs4_pnfs_ds * +ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, int *best_idx) +{ + struct pnfs_layout_segment *lseg = pgio->pg_lseg; + struct nfs4_pnfs_ds *ds; + + ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx, + best_idx); + if (ds || !pgio->pg_mirror_idx) + return ds; + return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx); +} + static void ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, struct nfs_page 
*req, @@ -840,12 +853,11 @@ retry: goto out_nolseg; } - ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx); + ds = ff_layout_get_ds_for_read(pgio, &ds_idx); if (!ds) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; + pnfs_generic_pg_cleanup(pgio); /* Sleep for 1 second before retrying */ ssleep(1); goto retry; @@ -871,8 +883,6 @@ out_mds: 0, NFS4_MAX_UINT64, IOMODE_READ, NFS_I(pgio->pg_inode)->layout, pgio->pg_lseg); - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; pgio->pg_maxretrans = 0; nfs_pageio_reset_read_mds(pgio); } @@ -916,8 +926,7 @@ retry: if (!ds) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; + pnfs_generic_pg_cleanup(pgio); /* Sleep for 1 second before retrying */ ssleep(1); goto retry; @@ -939,8 +948,6 @@ out_mds: 0, NFS4_MAX_UINT64, IOMODE_RW, NFS_I(pgio->pg_inode)->layout, pgio->pg_lseg); - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; pgio->pg_maxretrans = 0; nfs_pageio_reset_write_mds(pgio); pgio->pg_error = -EAGAIN; @@ -953,8 +960,8 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), - 0, - NFS4_MAX_UINT64, + req_offset(req), + req->wb_bytes, IOMODE_RW, false, GFP_NOFS); @@ -1028,11 +1035,24 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) } } +static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) +{ + u32 idx = hdr->pgio_mirror_idx + 1; + int new_idx = 0; + + if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx + 1, &new_idx)) + ff_layout_send_layouterror(hdr->lseg); + else + pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg); + pnfs_read_resend_pnfs(hdr, new_idx); +} + static void ff_layout_reset_read(struct nfs_pgio_header *hdr) { struct rpc_task *task = &hdr->task; 
pnfs_layoutcommit_inode(hdr->inode, false); + pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg); if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { dprintk("%s Reset task %5u for i/o through MDS " @@ -1234,6 +1254,12 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, break; case NFS4ERR_NXIO: ff_layout_mark_ds_unreachable(lseg, idx); + /* + * Don't return the layout if this is a read and we still + * have layouts to try + */ + if (opnum == OP_READ) + break; /* Fallthrough */ default: pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, @@ -1247,7 +1273,6 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, static int ff_layout_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { - int new_idx = hdr->pgio_mirror_idx; int err; if (task->tk_status < 0) { @@ -1267,10 +1292,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task, clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); switch (err) { case -NFS4ERR_RESET_TO_PNFS: - if (ff_layout_choose_best_ds_for_read(hdr->lseg, - hdr->pgio_mirror_idx + 1, - &new_idx)) - goto out_layouterror; set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: @@ -1281,10 +1302,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task, } return 0; -out_layouterror: - ff_layout_read_record_layoutstats_done(task, hdr); - ff_layout_send_layouterror(hdr->lseg); - hdr->pgio_mirror_idx = new_idx; out_eagain: rpc_restart_call_prepare(task); return -EAGAIN; @@ -1411,10 +1428,9 @@ static void ff_layout_read_release(void *data) struct nfs_pgio_header *hdr = data; ff_layout_read_record_layoutstats_done(&hdr->task, hdr); - if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) { - ff_layout_send_layouterror(hdr->lseg); - pnfs_read_resend_pnfs(hdr); - } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) + if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) + ff_layout_resend_pnfs_read(hdr); + else if 
(test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) ff_layout_reset_read(hdr); pnfs_generic_rw_release(data); } diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index ccc88be88d6a..66949da0e827 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -982,7 +982,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc, /* * The legacy version 6 binary mount data from userspace has a * field used only to transport selinux information into the - * the kernel. To continue to support that functionality we + * kernel. To continue to support that functionality we * have a touch of selinux knowledge here in the NFS code. The * userspace code converted context=blah to just blah so we are * converting back to the full string selinux understands. diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 0bf1f835de01..aa6493905bbe 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -193,6 +193,7 @@ bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) return nfs_check_cache_invalid_not_delegated(inode, flags); } +EXPORT_SYMBOL_GPL(nfs_check_cache_invalid); static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { @@ -204,7 +205,8 @@ static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) flags &= ~NFS_INO_INVALID_OTHER; flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE - | NFS_INO_REVAL_PAGECACHE); + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_INVALID_XATTR); } if (inode->i_mapping->nrpages == 0) @@ -233,11 +235,13 @@ static void nfs_zap_caches_locked(struct inode *inode) | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_XATTR | NFS_INO_REVAL_PAGECACHE); } else nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_XATTR | NFS_INO_REVAL_PAGECACHE); nfs_zap_label_cache_locked(nfsi); } @@ -542,6 +546,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st inode->i_gid = fattr->gid; else if 
(nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); + if (nfs_server_capable(inode, NFS_CAP_XATTR)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR); if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -794,8 +800,10 @@ int nfs_getattr(const struct path *path, struct kstat *stat, trace_nfs_getattr_enter(inode); - if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) + if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { + nfs_readdirplus_parent_cache_hit(path->dentry); goto out_no_update; + } /* Flush out writes to the server in order to update c/mtime. */ if ((request_mask & (STATX_CTIME|STATX_MTIME)) && @@ -1375,6 +1383,8 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode_set_iversion_raw(inode, fattr->change_attr); if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); + else if (nfs_server_capable(inode, NFS_CAP_XATTR)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR); } /* If we have atomic WCC data, we may update some attributes */ ts = inode->i_ctime; @@ -1892,7 +1902,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (!(have_writers || have_delegation)) { invalid |= NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL; + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_XATTR; /* Force revalidate of all attributes */ save_cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME @@ -2095,6 +2106,9 @@ struct inode *nfs_alloc_inode(struct super_block *sb) #if IS_ENABLED(CONFIG_NFS_V4) nfsi->nfs4_acl = NULL; #endif /* CONFIG_NFS_V4 */ +#ifdef CONFIG_NFS_V4_2 + nfsi->xattr_cache = NULL; +#endif return &nfsi->vfs_inode; } EXPORT_SYMBOL_GPL(nfs_alloc_inode); diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index c891af949886..0fe5aacbcfdf 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -6,6 +6,8 @@ #ifndef 
__LINUX_FS_NFS_NFS4_2_H #define __LINUX_FS_NFS_NFS4_2_H +#include <linux/xattr.h> + /* * FIXME: four LAYOUTSTATS calls per compound at most! Do we need to support * more? Need to consider not to pre-alloc too much for a compound. @@ -36,5 +38,27 @@ static inline bool nfs42_files_from_same_server(struct file *in, return nfs4_check_serverowner_major_id(c_in->cl_serverowner, c_out->cl_serverowner); } + +ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name, + void *buf, size_t buflen); +int nfs42_proc_setxattr(struct inode *inode, const char *name, + const void *buf, size_t buflen, int flags); +ssize_t nfs42_proc_listxattrs(struct inode *inode, void *buf, + size_t buflen, u64 *cookiep, bool *eofp); +int nfs42_proc_removexattr(struct inode *inode, const char *name); + +/* + * Maximum XDR buffer size needed for a listxattr buffer of buflen size. + * + * The upper boundary is a buffer with all 1-byte sized attribute names. + * They would be 7 bytes long in the eventual buffer ("user.x\0"), and + * 8 bytes long XDR-encoded. + * + * Include the trailing eof word as well. 
+ */ +static inline u32 nfs42_listxattr_xdrsize(u32 buflen) +{ + return ((buflen / (XATTR_USER_PREFIX_LEN + 2)) * 8) + 4; +} #endif /* CONFIG_NFS_V4_2 */ #endif /* __LINUX_FS_NFS_NFS4_2_H */ diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index e2ae54b35dfe..142225f0af59 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -17,6 +17,7 @@ #include "nfs4session.h" #include "internal.h" #include "delegation.h" +#include "nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PROC static int nfs42_do_offload_cancel_async(struct file *dst, nfs4_stateid *std); @@ -714,7 +715,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: - break; + return; case -NFS4ERR_BADHANDLE: case -ESTALE: pnfs_destroy_layout(NFS_I(inode)); @@ -760,6 +761,8 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) case -EOPNOTSUPP: NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS; } + + trace_nfs4_layoutstats(inode, &data->args.stateid, task->tk_status); } static void @@ -882,7 +885,7 @@ nfs42_layouterror_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: - break; + return; case -NFS4ERR_BADHANDLE: case -ESTALE: pnfs_destroy_layout(NFS_I(inode)); @@ -926,6 +929,9 @@ nfs42_layouterror_done(struct rpc_task *task, void *calldata) case -EOPNOTSUPP: NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTERROR; } + + trace_nfs4_layouterror(inode, &data->args.errors[0].stateid, + task->tk_status); } static void @@ -1088,3 +1094,251 @@ out_put_src_lock: nfs_put_lock_context(src_lock); return err; } + +#define NFS4XATTR_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE) + +static int _nfs42_proc_removexattr(struct inode *inode, const char *name) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs42_removexattrargs args = { + .fh = NFS_FH(inode), + .xattr_name = name, + }; + struct nfs42_removexattrres res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVEXATTR], + .rpc_argp = &args, + 
.rpc_resp = &res, + }; + int ret; + unsigned long timestamp = jiffies; + + ret = nfs4_call_sync(server->client, server, &msg, &args.seq_args, + &res.seq_res, 1); + if (!ret) + nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0); + + return ret; +} + +static int _nfs42_proc_setxattr(struct inode *inode, const char *name, + const void *buf, size_t buflen, int flags) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct page *pages[NFS4XATTR_MAXPAGES]; + struct nfs42_setxattrargs arg = { + .fh = NFS_FH(inode), + .xattr_pages = pages, + .xattr_len = buflen, + .xattr_name = name, + .xattr_flags = flags, + }; + struct nfs42_setxattrres res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETXATTR], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int ret, np; + unsigned long timestamp = jiffies; + + if (buflen > server->sxasize) + return -ERANGE; + + if (buflen > 0) { + np = nfs4_buf_to_pages_noslab(buf, buflen, arg.xattr_pages); + if (np < 0) + return np; + } else + np = 0; + + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, + &res.seq_res, 1); + + for (; np > 0; np--) + put_page(pages[np - 1]); + + if (!ret) + nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0); + + return ret; +} + +static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, + void *buf, size_t buflen) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct page *pages[NFS4XATTR_MAXPAGES] = {}; + struct nfs42_getxattrargs arg = { + .fh = NFS_FH(inode), + .xattr_pages = pages, + .xattr_len = buflen, + .xattr_name = name, + }; + struct nfs42_getxattrres res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETXATTR], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int ret, np; + + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, + &res.seq_res, 0); + if (ret < 0) + return ret; + + /* + * Normally, the caching is done one layer up, but for successful + * RPCS, always cache the result 
here, even if the caller was + * just querying the length, or if the reply was too big for + * the caller. This avoids a second RPC in the case of the + * common query-alloc-retrieve cycle for xattrs. + * + * Note that xattr_len is always capped to XATTR_SIZE_MAX. + */ + + nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len); + + if (buflen) { + if (res.xattr_len > buflen) + return -ERANGE; + _copy_from_pages(buf, pages, 0, res.xattr_len); + } + + np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE); + while (--np >= 0) + __free_page(pages[np]); + + return res.xattr_len; +} + +static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, + size_t buflen, u64 *cookiep, bool *eofp) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct page **pages; + struct nfs42_listxattrsargs arg = { + .fh = NFS_FH(inode), + .cookie = *cookiep, + }; + struct nfs42_listxattrsres res = { + .eof = false, + .xattr_buf = buf, + .xattr_len = buflen, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LISTXATTRS], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + u32 xdrlen; + int ret, np; + + + res.scratch = alloc_page(GFP_KERNEL); + if (!res.scratch) + return -ENOMEM; + + xdrlen = nfs42_listxattr_xdrsize(buflen); + if (xdrlen > server->lxasize) + xdrlen = server->lxasize; + np = xdrlen / PAGE_SIZE + 1; + + pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); + if (pages == NULL) { + __free_page(res.scratch); + return -ENOMEM; + } + + arg.xattr_pages = pages; + arg.count = xdrlen; + + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, + &res.seq_res, 0); + + if (ret >= 0) { + ret = res.copied; + *cookiep = res.cookie; + *eofp = res.eof; + } + + while (--np >= 0) { + if (pages[np]) + __free_page(pages[np]); + } + + __free_page(res.scratch); + kfree(pages); + + return ret; + +} + +ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name, + void *buf, size_t buflen) +{ + struct nfs4_exception exception = { }; + ssize_t err; + 
+ do { + err = _nfs42_proc_getxattr(inode, name, buf, buflen); + if (err >= 0) + break; + err = nfs4_handle_exception(NFS_SERVER(inode), err, + &exception); + } while (exception.retry); + + return err; +} + +int nfs42_proc_setxattr(struct inode *inode, const char *name, + const void *buf, size_t buflen, int flags) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs42_proc_setxattr(inode, name, buf, buflen, flags); + if (!err) + break; + err = nfs4_handle_exception(NFS_SERVER(inode), err, + &exception); + } while (exception.retry); + + return err; +} + +ssize_t nfs42_proc_listxattrs(struct inode *inode, void *buf, + size_t buflen, u64 *cookiep, bool *eofp) +{ + struct nfs4_exception exception = { }; + ssize_t err; + + do { + err = _nfs42_proc_listxattrs(inode, buf, buflen, + cookiep, eofp); + if (err >= 0) + break; + err = nfs4_handle_exception(NFS_SERVER(inode), err, + &exception); + } while (exception.retry); + + return err; +} + +int nfs42_proc_removexattr(struct inode *inode, const char *name) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs42_proc_removexattr(inode, name); + if (!err) + break; + err = nfs4_handle_exception(NFS_SERVER(inode), err, + &exception); + } while (exception.retry); + + return err; +} diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c new file mode 100644 index 000000000000..86777996cfec --- /dev/null +++ b/fs/nfs/nfs42xattr.c @@ -0,0 +1,1056 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2019, 2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * User extended attribute client side cache functions. 
+ * + * Author: Frank van der Linden <fllinden@amazon.com> + */ +#include <linux/errno.h> +#include <linux/nfs_fs.h> +#include <linux/hashtable.h> +#include <linux/refcount.h> +#include <uapi/linux/xattr.h> + +#include "nfs4_fs.h" +#include "internal.h" + +/* + * User extended attributes client side caching is implemented by having + * a cache structure attached to NFS inodes. This structure is allocated + * when needed, and freed when the cache is zapped. + * + * The cache structure contains as hash table of entries, and a pointer + * to a special-cased entry for the listxattr cache. + * + * Accessing and allocating / freeing the caches is done via reference + * counting. The cache entries use a similar refcounting scheme. + * + * This makes freeing a cache, both from the shrinker and from the + * zap cache path, easy. It also means that, in current use cases, + * the large majority of inodes will not waste any memory, as they + * will never have any user extended attributes assigned to them. + * + * Attribute entries are hashed in to a simple hash table. They are + * also part of an LRU. + * + * There are three shrinkers. + * + * Two shrinkers deal with the cache entries themselves: one for + * large entries (> PAGE_SIZE), and one for smaller entries. The + * shrinker for the larger entries works more aggressively than + * those for the smaller entries. + * + * The other shrinker frees the cache structures themselves. + */ + +/* + * 64 buckets is a good default. There is likely no reasonable + * workload that uses more than even 64 user extended attributes. + * You can certainly add a lot more - but you get what you ask for + * in those circumstances. 
+ */ +#define NFS4_XATTR_HASH_SIZE 64 + +#define NFSDBG_FACILITY NFSDBG_XATTRCACHE + +struct nfs4_xattr_cache; +struct nfs4_xattr_entry; + +struct nfs4_xattr_bucket { + spinlock_t lock; + struct hlist_head hlist; + struct nfs4_xattr_cache *cache; + bool draining; +}; + +struct nfs4_xattr_cache { + struct kref ref; + spinlock_t hash_lock; /* protects hashtable and lru */ + struct nfs4_xattr_bucket buckets[NFS4_XATTR_HASH_SIZE]; + struct list_head lru; + struct list_head dispose; + atomic_long_t nent; + spinlock_t listxattr_lock; + struct inode *inode; + struct nfs4_xattr_entry *listxattr; +}; + +struct nfs4_xattr_entry { + struct kref ref; + struct hlist_node hnode; + struct list_head lru; + struct list_head dispose; + char *xattr_name; + void *xattr_value; + size_t xattr_size; + struct nfs4_xattr_bucket *bucket; + uint32_t flags; +}; + +#define NFS4_XATTR_ENTRY_EXTVAL 0x0001 + +/* + * LRU list of NFS inodes that have xattr caches. + */ +static struct list_lru nfs4_xattr_cache_lru; +static struct list_lru nfs4_xattr_entry_lru; +static struct list_lru nfs4_xattr_large_entry_lru; + +static struct kmem_cache *nfs4_xattr_cache_cachep; + +/* + * Hashing helper functions. + */ +static void +nfs4_xattr_hash_init(struct nfs4_xattr_cache *cache) +{ + unsigned int i; + + for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) { + INIT_HLIST_HEAD(&cache->buckets[i].hlist); + spin_lock_init(&cache->buckets[i].lock); + cache->buckets[i].cache = cache; + cache->buckets[i].draining = false; + } +} + +/* + * Locking order: + * 1. inode i_lock or bucket lock + * 2. list_lru lock (taken by list_lru_* functions) + */ + +/* + * Wrapper functions to add a cache entry to the right LRU. + */ +static bool +nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry) +{ + struct list_lru *lru; + + lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? 
+ &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; + + return list_lru_add(lru, &entry->lru); +} + +static bool +nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry) +{ + struct list_lru *lru; + + lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? + &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; + + return list_lru_del(lru, &entry->lru); +} + +/* + * This function allocates cache entries. They are the normal + * extended attribute name/value pairs, but may also be a listxattr + * cache. Those allocations use the same entry so that they can be + * treated as one by the memory shrinker. + * + * xattr cache entries are allocated together with names. If the + * value fits in to one page with the entry structure and the name, + * it will also be part of the same allocation (kmalloc). This is + * expected to be the vast majority of cases. Larger allocations + * have a value pointer that is allocated separately by kvmalloc. + * + * Parameters: + * + * @name: Name of the extended attribute. NULL for listxattr cache + * entry. + * @value: Value of attribute, or listxattr cache. NULL if the + * value is to be copied from pages instead. + * @pages: Pages to copy the value from, if not NULL. Passed in to + * make it easier to copy the value after an RPC, even if + * the value will not be passed up to application (e.g. + * for a 'query' getxattr with NULL buffer). + * @len: Length of the value. Can be 0 for zero-length attribues. + * @value and @pages will be NULL if @len is 0. 
+ */ +static struct nfs4_xattr_entry * +nfs4_xattr_alloc_entry(const char *name, const void *value, + struct page **pages, size_t len) +{ + struct nfs4_xattr_entry *entry; + void *valp; + char *namep; + size_t alloclen, slen; + char *buf; + uint32_t flags; + + BUILD_BUG_ON(sizeof(struct nfs4_xattr_entry) + + XATTR_NAME_MAX + 1 > PAGE_SIZE); + + alloclen = sizeof(struct nfs4_xattr_entry); + if (name != NULL) { + slen = strlen(name) + 1; + alloclen += slen; + } else + slen = 0; + + if (alloclen + len <= PAGE_SIZE) { + alloclen += len; + flags = 0; + } else { + flags = NFS4_XATTR_ENTRY_EXTVAL; + } + + buf = kmalloc(alloclen, GFP_KERNEL_ACCOUNT | GFP_NOFS); + if (buf == NULL) + return NULL; + entry = (struct nfs4_xattr_entry *)buf; + + if (name != NULL) { + namep = buf + sizeof(struct nfs4_xattr_entry); + memcpy(namep, name, slen); + } else { + namep = NULL; + } + + + if (flags & NFS4_XATTR_ENTRY_EXTVAL) { + valp = kvmalloc(len, GFP_KERNEL_ACCOUNT | GFP_NOFS); + if (valp == NULL) { + kfree(buf); + return NULL; + } + } else if (len != 0) { + valp = buf + sizeof(struct nfs4_xattr_entry) + slen; + } else + valp = NULL; + + if (valp != NULL) { + if (value != NULL) + memcpy(valp, value, len); + else + _copy_from_pages(valp, pages, 0, len); + } + + entry->flags = flags; + entry->xattr_value = valp; + kref_init(&entry->ref); + entry->xattr_name = namep; + entry->xattr_size = len; + entry->bucket = NULL; + INIT_LIST_HEAD(&entry->lru); + INIT_LIST_HEAD(&entry->dispose); + INIT_HLIST_NODE(&entry->hnode); + + return entry; +} + +static void +nfs4_xattr_free_entry(struct nfs4_xattr_entry *entry) +{ + if (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) + kvfree(entry->xattr_value); + kfree(entry); +} + +static void +nfs4_xattr_free_entry_cb(struct kref *kref) +{ + struct nfs4_xattr_entry *entry; + + entry = container_of(kref, struct nfs4_xattr_entry, ref); + + if (WARN_ON(!list_empty(&entry->lru))) + return; + + nfs4_xattr_free_entry(entry); +} + +static void 
+nfs4_xattr_free_cache_cb(struct kref *kref) +{ + struct nfs4_xattr_cache *cache; + int i; + + cache = container_of(kref, struct nfs4_xattr_cache, ref); + + for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) { + if (WARN_ON(!hlist_empty(&cache->buckets[i].hlist))) + return; + cache->buckets[i].draining = false; + } + + cache->listxattr = NULL; + + kmem_cache_free(nfs4_xattr_cache_cachep, cache); + +} + +static struct nfs4_xattr_cache * +nfs4_xattr_alloc_cache(void) +{ + struct nfs4_xattr_cache *cache; + + cache = kmem_cache_alloc(nfs4_xattr_cache_cachep, + GFP_KERNEL_ACCOUNT | GFP_NOFS); + if (cache == NULL) + return NULL; + + kref_init(&cache->ref); + atomic_long_set(&cache->nent, 0); + + return cache; +} + +/* + * Set the listxattr cache, which is a special-cased cache entry. + * The special value ERR_PTR(-ESTALE) is used to indicate that + * the cache is being drained - this prevents a new listxattr + * cache from being added to what is now a stale cache. + */ +static int +nfs4_xattr_set_listcache(struct nfs4_xattr_cache *cache, + struct nfs4_xattr_entry *new) +{ + struct nfs4_xattr_entry *old; + int ret = 1; + + spin_lock(&cache->listxattr_lock); + + old = cache->listxattr; + + if (old == ERR_PTR(-ESTALE)) { + ret = 0; + goto out; + } + + cache->listxattr = new; + if (new != NULL && new != ERR_PTR(-ESTALE)) + nfs4_xattr_entry_lru_add(new); + + if (old != NULL) { + nfs4_xattr_entry_lru_del(old); + kref_put(&old->ref, nfs4_xattr_free_entry_cb); + } +out: + spin_unlock(&cache->listxattr_lock); + + return ret; +} + +/* + * Unlink a cache from its parent inode, clearing out an invalid + * cache. Must be called with i_lock held. 
+ */ +static struct nfs4_xattr_cache * +nfs4_xattr_cache_unlink(struct inode *inode) +{ + struct nfs_inode *nfsi; + struct nfs4_xattr_cache *oldcache; + + nfsi = NFS_I(inode); + + oldcache = nfsi->xattr_cache; + if (oldcache != NULL) { + list_lru_del(&nfs4_xattr_cache_lru, &oldcache->lru); + oldcache->inode = NULL; + } + nfsi->xattr_cache = NULL; + nfsi->cache_validity &= ~NFS_INO_INVALID_XATTR; + + return oldcache; + +} + +/* + * Discard a cache. Called by get_cache() if there was an old, + * invalid cache. Can also be called from a shrinker callback. + * + * The cache is dead, it has already been unlinked from its inode, + * and no longer appears on the cache LRU list. + * + * Mark all buckets as draining, so that no new entries are added. This + * could still happen in the unlikely, but possible case that another + * thread had grabbed a reference before it was unlinked from the inode, + * and is still holding it for an add operation. + * + * Remove all entries from the LRU lists, so that there is no longer + * any way to 'find' this cache. Then, remove the entries from the hash + * table. + * + * At that point, the cache will remain empty and can be freed when the final + * reference drops, which is very likely the kref_put at the end of + * this function, or the one called immediately afterwards in the + * shrinker callback. 
+ */
+static void
+nfs4_xattr_discard_cache(struct nfs4_xattr_cache *cache)
+{
+	unsigned int i;
+	struct nfs4_xattr_entry *entry;
+	struct nfs4_xattr_bucket *bucket;
+	struct hlist_node *n;
+
+	/* -ESTALE marker: drops any cached list and blocks new ones. */
+	nfs4_xattr_set_listcache(cache, ERR_PTR(-ESTALE));
+
+	for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) {
+		bucket = &cache->buckets[i];
+
+		spin_lock(&bucket->lock);
+		bucket->draining = true;	/* nfs4_xattr_hash_add() rejects draining buckets */
+		hlist_for_each_entry_safe(entry, n, &bucket->hlist, hnode) {
+			nfs4_xattr_entry_lru_del(entry);
+			hlist_del_init(&entry->hnode);
+			kref_put(&entry->ref, nfs4_xattr_free_entry_cb);	/* drop the cache's reference */
+		}
+		spin_unlock(&bucket->lock);
+	}
+
+	atomic_long_set(&cache->nent, 0);	/* every bucket is empty now */
+
+	kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+/*
+ * Get a referenced copy of the cache structure. Avoid doing allocs
+ * while holding i_lock. This means that we do some optimistic
+ * allocation, and might have to free the result in rare cases.
+ *
+ * This function only checks the NFS_INO_INVALID_XATTR cache validity bit
+ * and acts accordingly, replacing the cache when needed. For the read case
+ * (!add), this means that the caller must make sure that the cache
+ * is valid before calling this function. getxattr and listxattr call
+ * revalidate_inode to do this. The attribute cache timeout (for the
+ * non-delegated case) is expected to be dealt with in the revalidate
+ * call.
+ */ + +static struct nfs4_xattr_cache * +nfs4_xattr_get_cache(struct inode *inode, int add) +{ + struct nfs_inode *nfsi; + struct nfs4_xattr_cache *cache, *oldcache, *newcache; + + nfsi = NFS_I(inode); + + cache = oldcache = NULL; + + spin_lock(&inode->i_lock); + + if (nfsi->cache_validity & NFS_INO_INVALID_XATTR) + oldcache = nfs4_xattr_cache_unlink(inode); + else + cache = nfsi->xattr_cache; + + if (cache != NULL) + kref_get(&cache->ref); + + spin_unlock(&inode->i_lock); + + if (add && cache == NULL) { + newcache = NULL; + + cache = nfs4_xattr_alloc_cache(); + if (cache == NULL) + goto out; + + spin_lock(&inode->i_lock); + if (nfsi->cache_validity & NFS_INO_INVALID_XATTR) { + /* + * The cache was invalidated again. Give up, + * since what we want to enter is now likely + * outdated anyway. + */ + spin_unlock(&inode->i_lock); + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + cache = NULL; + goto out; + } + + /* + * Check if someone beat us to it. + */ + if (nfsi->xattr_cache != NULL) { + newcache = nfsi->xattr_cache; + kref_get(&newcache->ref); + } else { + kref_get(&cache->ref); + nfsi->xattr_cache = cache; + cache->inode = inode; + list_lru_add(&nfs4_xattr_cache_lru, &cache->lru); + } + + spin_unlock(&inode->i_lock); + + /* + * If there was a race, throw away the cache we just + * allocated, and use the new one allocated by someone + * else. + */ + if (newcache != NULL) { + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + cache = newcache; + } + } + +out: + /* + * Discard the now orphaned old cache. 
+ */ + if (oldcache != NULL) + nfs4_xattr_discard_cache(oldcache); + + return cache; +} + +static inline struct nfs4_xattr_bucket * +nfs4_xattr_hash_bucket(struct nfs4_xattr_cache *cache, const char *name) +{ + return &cache->buckets[jhash(name, strlen(name), 0) & + (ARRAY_SIZE(cache->buckets) - 1)]; +} + +static struct nfs4_xattr_entry * +nfs4_xattr_get_entry(struct nfs4_xattr_bucket *bucket, const char *name) +{ + struct nfs4_xattr_entry *entry; + + entry = NULL; + + hlist_for_each_entry(entry, &bucket->hlist, hnode) { + if (!strcmp(entry->xattr_name, name)) + break; + } + + return entry; +} + +static int +nfs4_xattr_hash_add(struct nfs4_xattr_cache *cache, + struct nfs4_xattr_entry *entry) +{ + struct nfs4_xattr_bucket *bucket; + struct nfs4_xattr_entry *oldentry = NULL; + int ret = 1; + + bucket = nfs4_xattr_hash_bucket(cache, entry->xattr_name); + entry->bucket = bucket; + + spin_lock(&bucket->lock); + + if (bucket->draining) { + ret = 0; + goto out; + } + + oldentry = nfs4_xattr_get_entry(bucket, entry->xattr_name); + if (oldentry != NULL) { + hlist_del_init(&oldentry->hnode); + nfs4_xattr_entry_lru_del(oldentry); + } else { + atomic_long_inc(&cache->nent); + } + + hlist_add_head(&entry->hnode, &bucket->hlist); + nfs4_xattr_entry_lru_add(entry); + +out: + spin_unlock(&bucket->lock); + + if (oldentry != NULL) + kref_put(&oldentry->ref, nfs4_xattr_free_entry_cb); + + return ret; +} + +static void +nfs4_xattr_hash_remove(struct nfs4_xattr_cache *cache, const char *name) +{ + struct nfs4_xattr_bucket *bucket; + struct nfs4_xattr_entry *entry; + + bucket = nfs4_xattr_hash_bucket(cache, name); + + spin_lock(&bucket->lock); + + entry = nfs4_xattr_get_entry(bucket, name); + if (entry != NULL) { + hlist_del_init(&entry->hnode); + nfs4_xattr_entry_lru_del(entry); + atomic_long_dec(&cache->nent); + } + + spin_unlock(&bucket->lock); + + if (entry != NULL) + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); +} + +static struct nfs4_xattr_entry * 
+nfs4_xattr_hash_find(struct nfs4_xattr_cache *cache, const char *name) +{ + struct nfs4_xattr_bucket *bucket; + struct nfs4_xattr_entry *entry; + + bucket = nfs4_xattr_hash_bucket(cache, name); + + spin_lock(&bucket->lock); + + entry = nfs4_xattr_get_entry(bucket, name); + if (entry != NULL) + kref_get(&entry->ref); + + spin_unlock(&bucket->lock); + + return entry; +} + +/* + * Entry point to retrieve an entry from the cache. + */ +ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name, char *buf, + ssize_t buflen) +{ + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry; + ssize_t ret; + + cache = nfs4_xattr_get_cache(inode, 0); + if (cache == NULL) + return -ENOENT; + + ret = 0; + entry = nfs4_xattr_hash_find(cache, name); + + if (entry != NULL) { + dprintk("%s: cache hit '%s', len %lu\n", __func__, + entry->xattr_name, (unsigned long)entry->xattr_size); + if (buflen == 0) { + /* Length probe only */ + ret = entry->xattr_size; + } else if (buflen < entry->xattr_size) + ret = -ERANGE; + else { + memcpy(buf, entry->xattr_value, entry->xattr_size); + ret = entry->xattr_size; + } + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + } else { + dprintk("%s: cache miss '%s'\n", __func__, name); + ret = -ENOENT; + } + + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + + return ret; +} + +/* + * Retrieve a cached list of xattrs from the cache. 
+ */ +ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf, ssize_t buflen) +{ + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry; + ssize_t ret; + + cache = nfs4_xattr_get_cache(inode, 0); + if (cache == NULL) + return -ENOENT; + + spin_lock(&cache->listxattr_lock); + + entry = cache->listxattr; + + if (entry != NULL && entry != ERR_PTR(-ESTALE)) { + if (buflen == 0) { + /* Length probe only */ + ret = entry->xattr_size; + } else if (entry->xattr_size > buflen) + ret = -ERANGE; + else { + memcpy(buf, entry->xattr_value, entry->xattr_size); + ret = entry->xattr_size; + } + } else { + ret = -ENOENT; + } + + spin_unlock(&cache->listxattr_lock); + + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + + return ret; +} + +/* + * Add an xattr to the cache. + * + * This also invalidates the xattr list cache. + */ +void nfs4_xattr_cache_add(struct inode *inode, const char *name, + const char *buf, struct page **pages, ssize_t buflen) +{ + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry; + + dprintk("%s: add '%s' len %lu\n", __func__, + name, (unsigned long)buflen); + + cache = nfs4_xattr_get_cache(inode, 1); + if (cache == NULL) + return; + + entry = nfs4_xattr_alloc_entry(name, buf, pages, buflen); + if (entry == NULL) + goto out; + + (void)nfs4_xattr_set_listcache(cache, NULL); + + if (!nfs4_xattr_hash_add(cache, entry)) + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + +out: + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); +} + + +/* + * Remove an xattr from the cache. + * + * This also invalidates the xattr list cache. 
+ */
+void nfs4_xattr_cache_remove(struct inode *inode, const char *name)
+{
+	struct nfs4_xattr_cache *cache;
+
+	dprintk("%s: remove '%s'\n", __func__, name);
+
+	cache = nfs4_xattr_get_cache(inode, 0);	/* lookup only, never allocates */
+	if (cache == NULL)
+		return;
+
+	(void)nfs4_xattr_set_listcache(cache, NULL);	/* cached name list is now outdated */
+	nfs4_xattr_hash_remove(cache, name);
+
+	kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+/*
+ * Cache listxattr output, replacing any possible old one.
+ *
+ * @buf/@buflen are copied into a nameless cache entry. If the cache
+ * is being drained, the new entry is dropped again instead of being
+ * installed.
+ */
+void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf,
+			       ssize_t buflen)
+{
+	struct nfs4_xattr_cache *cache;
+	struct nfs4_xattr_entry *entry;
+
+	cache = nfs4_xattr_get_cache(inode, 1);	/* allocate a cache if there is none */
+	if (cache == NULL)
+		return;
+
+	entry = nfs4_xattr_alloc_entry(NULL, buf, NULL, buflen);	/* NULL name: listxattr entry */
+	if (entry == NULL)
+		goto out;
+
+	/*
+	 * This is just there to be able to get to bucket->cache,
+	 * which is obviously the same for all buckets, so just
+	 * use bucket 0.
+	 */
+	entry->bucket = &cache->buckets[0];
+
+	if (!nfs4_xattr_set_listcache(cache, entry))
+		kref_put(&entry->ref, nfs4_xattr_free_entry_cb);	/* not installed: cache is stale */
+
+out:
+	kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+/*
+ * Zap the entire cache. Called when an inode is evicted.
+ *
+ * The cache is unlinked from the inode under i_lock and discarded
+ * after the lock has been dropped.
+ */
+void nfs4_xattr_cache_zap(struct inode *inode)
+{
+	struct nfs4_xattr_cache *oldcache;
+
+	spin_lock(&inode->i_lock);
+	oldcache = nfs4_xattr_cache_unlink(inode);
+	spin_unlock(&inode->i_lock);
+
+	if (oldcache)
+		nfs4_xattr_discard_cache(oldcache);
+}
+
+/*
+ * The large-entry LRU is shrunk more aggressively than the other
+ * LRUs, by setting @seeks to 1 on its shrinker; the regular entry
+ * and cache shrinkers use DEFAULT_SEEKS.
+ *
+ * Cache structures are only picked for reclaim once they hold at
+ * most one entry; whatever is left is dropped when the cache is
+ * discarded.
+ */ + +static unsigned long nfs4_xattr_cache_count(struct shrinker *shrink, + struct shrink_control *sc); +static unsigned long nfs4_xattr_entry_count(struct shrinker *shrink, + struct shrink_control *sc); +static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink, + struct shrink_control *sc); +static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink, + struct shrink_control *sc); + +static struct shrinker nfs4_xattr_cache_shrinker = { + .count_objects = nfs4_xattr_cache_count, + .scan_objects = nfs4_xattr_cache_scan, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_MEMCG_AWARE, +}; + +static struct shrinker nfs4_xattr_entry_shrinker = { + .count_objects = nfs4_xattr_entry_count, + .scan_objects = nfs4_xattr_entry_scan, + .seeks = DEFAULT_SEEKS, + .batch = 512, + .flags = SHRINKER_MEMCG_AWARE, +}; + +static struct shrinker nfs4_xattr_large_entry_shrinker = { + .count_objects = nfs4_xattr_entry_count, + .scan_objects = nfs4_xattr_entry_scan, + .seeks = 1, + .batch = 512, + .flags = SHRINKER_MEMCG_AWARE, +}; + +static enum lru_status +cache_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) +{ + struct list_head *dispose = arg; + struct inode *inode; + struct nfs4_xattr_cache *cache = container_of(item, + struct nfs4_xattr_cache, lru); + + if (atomic_long_read(&cache->nent) > 1) + return LRU_SKIP; + + /* + * If a cache structure is on the LRU list, we know that + * its inode is valid. Try to lock it to break the link. + * Since we're inverting the lock order here, only try. 
+ */ + inode = cache->inode; + + if (!spin_trylock(&inode->i_lock)) + return LRU_SKIP; + + kref_get(&cache->ref); + + cache->inode = NULL; + NFS_I(inode)->xattr_cache = NULL; + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_XATTR; + list_lru_isolate(lru, &cache->lru); + + spin_unlock(&inode->i_lock); + + list_add_tail(&cache->dispose, dispose); + return LRU_REMOVED; +} + +static unsigned long +nfs4_xattr_cache_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + LIST_HEAD(dispose); + unsigned long freed; + struct nfs4_xattr_cache *cache; + + freed = list_lru_shrink_walk(&nfs4_xattr_cache_lru, sc, + cache_lru_isolate, &dispose); + while (!list_empty(&dispose)) { + cache = list_first_entry(&dispose, struct nfs4_xattr_cache, + dispose); + list_del_init(&cache->dispose); + nfs4_xattr_discard_cache(cache); + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + } + + return freed; +} + + +static unsigned long +nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long count; + + count = list_lru_count(&nfs4_xattr_cache_lru); + return vfs_pressure_ratio(count); +} + +static enum lru_status +entry_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) +{ + struct list_head *dispose = arg; + struct nfs4_xattr_bucket *bucket; + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry = container_of(item, + struct nfs4_xattr_entry, lru); + + bucket = entry->bucket; + cache = bucket->cache; + + /* + * Unhook the entry from its parent (either a cache bucket + * or a cache structure if it's a listxattr buf), so that + * it's no longer found. Then add it to the isolate list, + * to be freed later. + * + * In both cases, we're reverting lock order, so use + * trylock and skip the entry if we can't get the lock. 
+ */ + if (entry->xattr_name != NULL) { + /* Regular cache entry */ + if (!spin_trylock(&bucket->lock)) + return LRU_SKIP; + + kref_get(&entry->ref); + + hlist_del_init(&entry->hnode); + atomic_long_dec(&cache->nent); + list_lru_isolate(lru, &entry->lru); + + spin_unlock(&bucket->lock); + } else { + /* Listxattr cache entry */ + if (!spin_trylock(&cache->listxattr_lock)) + return LRU_SKIP; + + kref_get(&entry->ref); + + cache->listxattr = NULL; + list_lru_isolate(lru, &entry->lru); + + spin_unlock(&cache->listxattr_lock); + } + + list_add_tail(&entry->dispose, dispose); + return LRU_REMOVED; +} + +static unsigned long +nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + LIST_HEAD(dispose); + unsigned long freed; + struct nfs4_xattr_entry *entry; + struct list_lru *lru; + + lru = (shrink == &nfs4_xattr_large_entry_shrinker) ? + &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; + + freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose); + + while (!list_empty(&dispose)) { + entry = list_first_entry(&dispose, struct nfs4_xattr_entry, + dispose); + list_del_init(&entry->dispose); + + /* + * Drop two references: the one that we just grabbed + * in entry_lru_isolate, and the one that was set + * when the entry was first allocated. + */ + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + } + + return freed; +} + +static unsigned long +nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long count; + struct list_lru *lru; + + lru = (shrink == &nfs4_xattr_large_entry_shrinker) ? 
+ &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; + + count = list_lru_count(lru); + return vfs_pressure_ratio(count); +} + + +static void nfs4_xattr_cache_init_once(void *p) +{ + struct nfs4_xattr_cache *cache = (struct nfs4_xattr_cache *)p; + + spin_lock_init(&cache->listxattr_lock); + atomic_long_set(&cache->nent, 0); + nfs4_xattr_hash_init(cache); + cache->listxattr = NULL; + INIT_LIST_HEAD(&cache->lru); + INIT_LIST_HEAD(&cache->dispose); +} + +int __init nfs4_xattr_cache_init(void) +{ + int ret = 0; + + nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache", + sizeof(struct nfs4_xattr_cache), 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT), + nfs4_xattr_cache_init_once); + if (nfs4_xattr_cache_cachep == NULL) + return -ENOMEM; + + ret = list_lru_init_memcg(&nfs4_xattr_large_entry_lru, + &nfs4_xattr_large_entry_shrinker); + if (ret) + goto out4; + + ret = list_lru_init_memcg(&nfs4_xattr_entry_lru, + &nfs4_xattr_entry_shrinker); + if (ret) + goto out3; + + ret = list_lru_init_memcg(&nfs4_xattr_cache_lru, + &nfs4_xattr_cache_shrinker); + if (ret) + goto out2; + + ret = register_shrinker(&nfs4_xattr_cache_shrinker); + if (ret) + goto out1; + + ret = register_shrinker(&nfs4_xattr_entry_shrinker); + if (ret) + goto out; + + ret = register_shrinker(&nfs4_xattr_large_entry_shrinker); + if (!ret) + return 0; + + unregister_shrinker(&nfs4_xattr_entry_shrinker); +out: + unregister_shrinker(&nfs4_xattr_cache_shrinker); +out1: + list_lru_destroy(&nfs4_xattr_cache_lru); +out2: + list_lru_destroy(&nfs4_xattr_entry_lru); +out3: + list_lru_destroy(&nfs4_xattr_large_entry_lru); +out4: + kmem_cache_destroy(nfs4_xattr_cache_cachep); + + return ret; +} + +void nfs4_xattr_cache_exit(void) +{ + unregister_shrinker(&nfs4_xattr_entry_shrinker); + unregister_shrinker(&nfs4_xattr_cache_shrinker); + list_lru_destroy(&nfs4_xattr_entry_lru); + list_lru_destroy(&nfs4_xattr_cache_lru); + kmem_cache_destroy(nfs4_xattr_cache_cachep); +} diff --git 
a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index c03f3246d6c5..cc50085e151c 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -169,6 +169,78 @@ decode_clone_maxsz + \ decode_getattr_maxsz) +/* Not limited by NFS itself, limited by the generic xattr code */ +#define nfs4_xattr_name_maxsz XDR_QUADLEN(XATTR_NAME_MAX) + +#define encode_getxattr_maxsz (op_encode_hdr_maxsz + 1 + \ + nfs4_xattr_name_maxsz) +#define decode_getxattr_maxsz (op_decode_hdr_maxsz + 1 + 1) +#define encode_setxattr_maxsz (op_encode_hdr_maxsz + \ + 1 + nfs4_xattr_name_maxsz + 1) +#define decode_setxattr_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) +#define encode_listxattrs_maxsz (op_encode_hdr_maxsz + 2 + 1) +#define decode_listxattrs_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1) +#define encode_removexattr_maxsz (op_encode_hdr_maxsz + 1 + \ + nfs4_xattr_name_maxsz) +#define decode_removexattr_maxsz (op_decode_hdr_maxsz + \ + decode_change_info_maxsz) + +#define NFS4_enc_getxattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getxattr_maxsz) +#define NFS4_dec_getxattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getxattr_maxsz) +#define NFS4_enc_setxattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_setxattr_maxsz) +#define NFS4_dec_setxattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_setxattr_maxsz) +#define NFS4_enc_listxattrs_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_listxattrs_maxsz) +#define NFS4_dec_listxattrs_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_listxattrs_maxsz) +#define NFS4_enc_removexattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_removexattr_maxsz) +#define NFS4_dec_removexattr_sz (compound_decode_hdr_maxsz + \ + 
decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_removexattr_maxsz) + +/* + * These values specify the maximum amount of data that is not + * associated with the extended attribute name or extended + * attribute list in the SETXATTR, GETXATTR and LISTXATTR + * respectively. + */ +const u32 nfs42_maxsetxattr_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_encode_hdr_maxsz + + encode_sequence_maxsz + + encode_putfh_maxsz + 1 + + nfs4_xattr_name_maxsz) + * XDR_UNIT); + +const u32 nfs42_maxgetxattr_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_decode_hdr_maxsz + + decode_sequence_maxsz + + decode_putfh_maxsz + 1) * XDR_UNIT); + +const u32 nfs42_maxlistxattrs_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_decode_hdr_maxsz + + decode_sequence_maxsz + + decode_putfh_maxsz + 3) * XDR_UNIT); + static void encode_fallocate(struct xdr_stream *xdr, const struct nfs42_falloc_args *args) { @@ -333,6 +405,210 @@ static void encode_layouterror(struct xdr_stream *xdr, encode_device_error(xdr, &args->errors[0]); } +static void encode_setxattr(struct xdr_stream *xdr, + const struct nfs42_setxattrargs *arg, + struct compound_hdr *hdr) +{ + __be32 *p; + + BUILD_BUG_ON(XATTR_CREATE != SETXATTR4_CREATE); + BUILD_BUG_ON(XATTR_REPLACE != SETXATTR4_REPLACE); + + encode_op_hdr(xdr, OP_SETXATTR, decode_setxattr_maxsz, hdr); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(arg->xattr_flags); + encode_string(xdr, strlen(arg->xattr_name), arg->xattr_name); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(arg->xattr_len); + if (arg->xattr_len) + xdr_write_pages(xdr, arg->xattr_pages, 0, arg->xattr_len); +} + +static int decode_setxattr(struct xdr_stream *xdr, + struct nfs4_change_info *cinfo) +{ + int status; + + status = decode_op_hdr(xdr, OP_SETXATTR); + if (status) + goto out; + status = decode_change_info(xdr, cinfo); +out: + return status; +} + + +static void encode_getxattr(struct xdr_stream *xdr, const char *name, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, 
OP_GETXATTR, decode_getxattr_maxsz, hdr); + encode_string(xdr, strlen(name), name); +} + +static int decode_getxattr(struct xdr_stream *xdr, + struct nfs42_getxattrres *res, + struct rpc_rqst *req) +{ + int status; + __be32 *p; + u32 len, rdlen; + + status = decode_op_hdr(xdr, OP_GETXATTR); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + + len = be32_to_cpup(p); + if (len > req->rq_rcv_buf.page_len) + return -ERANGE; + + res->xattr_len = len; + + if (len > 0) { + rdlen = xdr_read_pages(xdr, len); + if (rdlen < len) + return -EIO; + } + + return 0; +} + +static void encode_removexattr(struct xdr_stream *xdr, const char *name, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_REMOVEXATTR, decode_removexattr_maxsz, hdr); + encode_string(xdr, strlen(name), name); +} + + +static int decode_removexattr(struct xdr_stream *xdr, + struct nfs4_change_info *cinfo) +{ + int status; + + status = decode_op_hdr(xdr, OP_REMOVEXATTR); + if (status) + goto out; + + status = decode_change_info(xdr, cinfo); +out: + return status; +} + +static void encode_listxattrs(struct xdr_stream *xdr, + const struct nfs42_listxattrsargs *arg, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_LISTXATTRS, decode_listxattrs_maxsz + 1, hdr); + + p = reserve_space(xdr, 12); + if (unlikely(!p)) + return; + + p = xdr_encode_hyper(p, arg->cookie); + /* + * RFC 8276 says to specify the full max length of the LISTXATTRS + * XDR reply. Count is set to the XDR length of the names array + * plus the EOF marker. So, add the cookie and the names count. + */ + *p = cpu_to_be32(arg->count + 8 + 4); +} + +static int decode_listxattrs(struct xdr_stream *xdr, + struct nfs42_listxattrsres *res) +{ + int status; + __be32 *p; + u32 count, len, ulen; + size_t left, copied; + char *buf; + + status = decode_op_hdr(xdr, OP_LISTXATTRS); + if (status) { + /* + * Special case: for LISTXATTRS, NFS4ERR_TOOSMALL + * should be translated to ERANGE. 
+ */ + if (status == -ETOOSMALL) + status = -ERANGE; + goto out; + } + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + return -EIO; + + xdr_decode_hyper(p, &res->cookie); + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + + left = res->xattr_len; + buf = res->xattr_buf; + + count = be32_to_cpup(p); + copied = 0; + + /* + * We have asked for enough room to encode the maximum number + * of possible attribute names, so everything should fit. + * + * But, don't rely on that assumption. Just decode entries + * until they don't fit anymore, just in case the server did + * something odd. + */ + while (count--) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + + len = be32_to_cpup(p); + if (len > (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN)) { + status = -ERANGE; + goto out; + } + + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + return -EIO; + + ulen = len + XATTR_USER_PREFIX_LEN + 1; + if (buf) { + if (ulen > left) { + status = -ERANGE; + goto out; + } + + memcpy(buf, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); + memcpy(buf + XATTR_USER_PREFIX_LEN, p, len); + + buf[ulen - 1] = 0; + buf += ulen; + left -= ulen; + } + copied += ulen; + } + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + + res->eof = be32_to_cpup(p); + res->copied = copied; + +out: + if (status == -ERANGE && res->xattr_len == XATTR_LIST_MAX) + status = -E2BIG; + + return status; +} + /* * Encode ALLOCATE request */ @@ -988,4 +1264,166 @@ out: return status; } +#ifdef CONFIG_NFS_V4_2 +static void nfs4_xdr_enc_setxattr(struct rpc_rqst *req, struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_setxattrargs *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_setxattr(xdr, args, &hdr); + encode_nops(&hdr); +} + +static int 
nfs4_xdr_dec_setxattr(struct rpc_rqst *req, struct xdr_stream *xdr, + void *data) +{ + struct nfs42_setxattrres *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, req); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + + status = decode_setxattr(xdr, &res->cinfo); +out: + return status; +} + +static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_getxattrargs *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + size_t plen; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getxattr(xdr, args->xattr_name, &hdr); + + plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX; + + rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, + hdr.replen); + req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; + + encode_nops(&hdr); +} + +static int nfs4_xdr_dec_getxattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, void *data) +{ + struct nfs42_getxattrres *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_getxattr(xdr, res, rqstp); +out: + return status; +} + +static void nfs4_xdr_enc_listxattrs(struct rpc_rqst *req, + struct xdr_stream *xdr, const void *data) +{ + const struct nfs42_listxattrsargs *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_listxattrs(xdr, args, &hdr); + + 
rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->count, + hdr.replen); + req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; + + encode_nops(&hdr); +} + +static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, void *data) +{ + struct nfs42_listxattrsres *res = data; + struct compound_hdr hdr; + int status; + + xdr_set_scratch_buffer(xdr, page_address(res->scratch), PAGE_SIZE); + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_listxattrs(xdr, res); +out: + return status; +} + +static void nfs4_xdr_enc_removexattr(struct rpc_rqst *req, + struct xdr_stream *xdr, const void *data) +{ + const struct nfs42_removexattrargs *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_removexattr(xdr, args->xattr_name, &hdr); + encode_nops(&hdr); +} + +static int nfs4_xdr_dec_removexattr(struct rpc_rqst *req, + struct xdr_stream *xdr, void *data) +{ + struct nfs42_removexattrres *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, req); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + + status = decode_removexattr(xdr, &res->cinfo); +out: + return status; +} +#endif #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */ diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 210e590e1f71..0c9505dc852c 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -324,6 +324,13 @@ extern int update_open_stateid(struct nfs4_state *state, extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); +extern void nfs4_update_changeattr(struct 
inode *dir, + struct nfs4_change_info *cinfo, + unsigned long timestamp, + unsigned long cache_validity); +extern int nfs4_buf_to_pages_noslab(const void *buf, size_t buflen, + struct page **pages); + #if defined(CONFIG_NFS_V4_1) extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *); extern int nfs4_proc_create_session(struct nfs_client *, const struct cred *); @@ -557,6 +564,12 @@ static inline void nfs4_unregister_sysctl(void) /* nfs4xdr.c */ extern const struct rpc_procinfo nfs4_procedures[]; +#ifdef CONFIG_NFS_V4_2 +extern const u32 nfs42_maxsetxattr_overhead; +extern const u32 nfs42_maxgetxattr_overhead; +extern const u32 nfs42_maxlistxattrs_overhead; +#endif + struct nfs4_mount_data; /* callback_xdr.c */ @@ -613,12 +626,34 @@ static inline bool nfs4_state_match_open_stateid_other(const struct nfs4_state * nfs4_stateid_match_other(&state->open_stateid, stateid); } +/* nfs42xattr.c */ +#ifdef CONFIG_NFS_V4_2 +extern int __init nfs4_xattr_cache_init(void); +extern void nfs4_xattr_cache_exit(void); +extern void nfs4_xattr_cache_add(struct inode *inode, const char *name, + const char *buf, struct page **pages, + ssize_t buflen); +extern void nfs4_xattr_cache_remove(struct inode *inode, const char *name); +extern ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name, + char *buf, ssize_t buflen); +extern void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf, + ssize_t buflen); +extern ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf, + ssize_t buflen); +extern void nfs4_xattr_cache_zap(struct inode *inode); #else +static inline void nfs4_xattr_cache_zap(struct inode *inode) +{ +} +#endif /* CONFIG_NFS_V4_2 */ + +#else /* CONFIG_NFS_V4 */ #define nfs4_close_state(a, b) do { } while (0) #define nfs4_close_sync(a, b) do { } while (0) #define nfs4_state_protect(a, b, c, d) do { } while (0) #define nfs4_state_protect_write(a, b, c, d) do { } while (0) + #endif /* CONFIG_NFS_V4 */ #endif /* 
__LINUX_FS_NFS_NFS4_FS.H */ diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 0bd77cc1f639..daacc78a3d48 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -880,7 +880,7 @@ static int nfs4_set_client(struct nfs_server *server, if (minorversion == 0) __set_bit(NFS_CS_REUSEPORT, &cl_init.init_flags); - else if (proto == XPRT_TRANSPORT_TCP) + if (proto == XPRT_TRANSPORT_TCP) cl_init.nconnect = nconnect; if (server->flags & NFS_MOUNT_NORESVPORT) @@ -992,6 +992,36 @@ static void nfs4_session_limit_rwsize(struct nfs_server *server) #endif /* CONFIG_NFS_V4_1 */ } +/* + * Limit xattr sizes using the channel attributes. + */ +static void nfs4_session_limit_xasize(struct nfs_server *server) +{ +#ifdef CONFIG_NFS_V4_2 + struct nfs4_session *sess; + u32 server_gxa_sz; + u32 server_sxa_sz; + u32 server_lxa_sz; + + if (!nfs4_has_session(server->nfs_client)) + return; + + sess = server->nfs_client->cl_session; + + server_gxa_sz = sess->fc_attrs.max_resp_sz - nfs42_maxgetxattr_overhead; + server_sxa_sz = sess->fc_attrs.max_rqst_sz - nfs42_maxsetxattr_overhead; + server_lxa_sz = sess->fc_attrs.max_resp_sz - + nfs42_maxlistxattrs_overhead; + + if (server->gxasize > server_gxa_sz) + server->gxasize = server_gxa_sz; + if (server->sxasize > server_sxa_sz) + server->sxasize = server_sxa_sz; + if (server->lxasize > server_lxa_sz) + server->lxasize = server_lxa_sz; +#endif +} + static int nfs4_server_common_setup(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_probe) { @@ -1039,6 +1069,7 @@ static int nfs4_server_common_setup(struct nfs_server *server, goto out; nfs4_session_limit_rwsize(server); + nfs4_session_limit_xasize(server); if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 8e5d6223ddd3..a33970765467 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -110,6 +110,7 @@ static int nfs4_file_flush(struct file *file, fl_owner_t id) { struct 
inode *inode = file_inode(file); + errseq_t since; dprintk("NFS: flush(%pD2)\n", file); @@ -125,7 +126,9 @@ nfs4_file_flush(struct file *file, fl_owner_t id) return filemap_fdatawrite(file->f_mapping); /* Flush writes to the server and return any errors */ - return nfs_wb_all(inode); + since = filemap_sample_wb_err(file->f_mapping); + nfs_wb_all(inode); + return filemap_check_wb_err(file->f_mapping, since); } #ifdef CONFIG_NFS_V4_2 diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8963062da57e..dbd01548335b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -66,6 +66,7 @@ #include "nfs4idmap.h" #include "nfs4session.h" #include "fscache.h" +#include "nfs42.h" #include "nfs4trace.h" @@ -256,6 +257,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD1_FS_LAYOUT_TYPES, FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_CLONE_BLKSIZE + | FATTR4_WORD2_XATTR_SUPPORT }; const u32 nfs4_fs_locations_bitmap[3] = { @@ -1173,37 +1175,49 @@ nfs4_dec_nlink_locked(struct inode *inode) } static void -update_changeattr_locked(struct inode *dir, struct nfs4_change_info *cinfo, +nfs4_update_changeattr_locked(struct inode *inode, + struct nfs4_change_info *cinfo, unsigned long timestamp, unsigned long cache_validity) { - struct nfs_inode *nfsi = NFS_I(dir); + struct nfs_inode *nfsi = NFS_I(inode); nfsi->cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME - | NFS_INO_INVALID_DATA | cache_validity; - if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(dir)) { + + if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(inode)) { nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; nfsi->attrtimeo_timestamp = jiffies; } else { - nfs_force_lookup_revalidate(dir); - if (cinfo->before != inode_peek_iversion_raw(dir)) + if (S_ISDIR(inode->i_mode)) { + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + nfs_force_lookup_revalidate(inode); + } else { + if (!NFS_PROTO(inode)->have_delegation(inode, + FMODE_READ)) + nfsi->cache_validity |= 
NFS_INO_REVAL_PAGECACHE; + } + + if (cinfo->before != inode_peek_iversion_raw(inode)) nfsi->cache_validity |= NFS_INO_INVALID_ACCESS | - NFS_INO_INVALID_ACL; + NFS_INO_INVALID_ACL | + NFS_INO_INVALID_XATTR; } - inode_set_iversion_raw(dir, cinfo->after); + inode_set_iversion_raw(inode, cinfo->after); nfsi->read_cache_jiffies = timestamp; nfsi->attr_gencount = nfs_inc_attr_generation_counter(); nfsi->cache_validity &= ~NFS_INO_INVALID_CHANGE; - nfs_fscache_invalidate(dir); + + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + nfs_fscache_invalidate(inode); } -static void -update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo, +void +nfs4_update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo, unsigned long timestamp, unsigned long cache_validity) { spin_lock(&dir->i_lock); - update_changeattr_locked(dir, cinfo, timestamp, cache_validity); + nfs4_update_changeattr_locked(dir, cinfo, timestamp, cache_validity); spin_unlock(&dir->i_lock); } @@ -1356,6 +1370,12 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE; +#ifdef CONFIG_NFS_V4_2 + if (server->caps & NFS_CAP_XATTR) + p->o_arg.access |= NFS4_ACCESS_XAREAD | + NFS4_ACCESS_XAWRITE | + NFS4_ACCESS_XALIST; +#endif } } p->o_arg.clientid = server->nfs_client->cl_clientid; @@ -2653,8 +2673,9 @@ static int _nfs4_proc_open(struct nfs4_opendata *data, data->file_created = true; if (data->file_created || inode_peek_iversion_raw(dir) != o_res->cinfo.after) - update_changeattr(dir, &o_res->cinfo, - o_res->f_attr->time_start, 0); + nfs4_update_changeattr(dir, &o_res->cinfo, + o_res->f_attr->time_start, + NFS_INO_INVALID_DATA); } if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0) server->caps &= ~NFS_CAP_POSIX_LOCK; @@ -3756,7 +3777,7 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) #define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL) #define 
FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL) -#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_MODE_UMASK - 1UL) +#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_XATTR_SUPPORT - 1UL) static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -4540,7 +4561,8 @@ _nfs4_proc_remove(struct inode *dir, const struct qstr *name, u32 ftype) status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); if (status == 0) { spin_lock(&dir->i_lock); - update_changeattr_locked(dir, &res.cinfo, timestamp, 0); + nfs4_update_changeattr_locked(dir, &res.cinfo, timestamp, + NFS_INO_INVALID_DATA); /* Removing a directory decrements nlink in the parent */ if (ftype == NF4DIR && dir->i_nlink > 2) nfs4_dec_nlink_locked(dir); @@ -4624,8 +4646,9 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) &data->timeout) == -EAGAIN) return 0; if (task->tk_status == 0) - update_changeattr(dir, &res->cinfo, - res->dir_attr->time_start, 0); + nfs4_update_changeattr(dir, &res->cinfo, + res->dir_attr->time_start, + NFS_INO_INVALID_DATA); return 1; } @@ -4669,16 +4692,18 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, if (task->tk_status == 0) { if (new_dir != old_dir) { /* Note: If we moved a directory, nlink will change */ - update_changeattr(old_dir, &res->old_cinfo, + nfs4_update_changeattr(old_dir, &res->old_cinfo, res->old_fattr->time_start, - NFS_INO_INVALID_OTHER); - update_changeattr(new_dir, &res->new_cinfo, + NFS_INO_INVALID_OTHER | + NFS_INO_INVALID_DATA); + nfs4_update_changeattr(new_dir, &res->new_cinfo, res->new_fattr->time_start, - NFS_INO_INVALID_OTHER); + NFS_INO_INVALID_OTHER | + NFS_INO_INVALID_DATA); } else - update_changeattr(old_dir, &res->old_cinfo, + nfs4_update_changeattr(old_dir, &res->old_cinfo, res->old_fattr->time_start, - 0); + NFS_INO_INVALID_DATA); } return 1; } @@ -4719,7 +4744,8 @@ static int _nfs4_proc_link(struct inode *inode, 
struct inode *dir, const struct status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { - update_changeattr(dir, &res.cinfo, res.fattr->time_start, 0); + nfs4_update_changeattr(dir, &res.cinfo, res.fattr->time_start, + NFS_INO_INVALID_DATA); status = nfs_post_op_update_inode(inode, res.fattr); if (!status) nfs_setsecurity(inode, res.fattr, res.label); @@ -4797,8 +4823,9 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ &data->arg.seq_args, &data->res.seq_res, 1); if (status == 0) { spin_lock(&dir->i_lock); - update_changeattr_locked(dir, &data->res.dir_cinfo, - data->res.fattr->time_start, 0); + nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo, + data->res.fattr->time_start, + NFS_INO_INVALID_DATA); /* Creating a directory bumps nlink in the parent */ if (data->arg.ftype == NF4DIR) nfs4_inc_nlink_locked(dir); @@ -5531,7 +5558,7 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server) */ #define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE) -static int buf_to_pages_noslab(const void *buf, size_t buflen, +int nfs4_buf_to_pages_noslab(const void *buf, size_t buflen, struct page **pages) { struct page *newpage, **spages; @@ -5773,7 +5800,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl return -EOPNOTSUPP; if (npages > ARRAY_SIZE(pages)) return -ERANGE; - i = buf_to_pages_noslab(buf, buflen, arg.acl_pages); + i = nfs4_buf_to_pages_noslab(buf, buflen, arg.acl_pages); if (i < 0) return i; nfs4_inode_make_writeable(inode); @@ -5845,8 +5872,6 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf, return ret; if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL)) return -ENOENT; - if (buflen < label.len) - return -ERANGE; return 0; } @@ -7430,6 +7455,133 @@ nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len) #endif +#ifdef CONFIG_NFS_V4_2 +static int nfs4_xattr_set_nfs4_user(const 
struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) +{ + struct nfs_access_entry cache; + int ret; + + if (!nfs_server_capable(inode, NFS_CAP_XATTR)) + return -EOPNOTSUPP; + + /* + * There is no mapping from the MAY_* flags to the NFS_ACCESS_XA* + * flags right now. Handling of xattr operations use the normal + * file read/write permissions. + * + * Just in case the server has other ideas (which RFC 8276 allows), + * do a cached access check for the XA* flags to possibly avoid + * doing an RPC and getting EACCES back. + */ + if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) { + if (!(cache.mask & NFS_ACCESS_XAWRITE)) + return -EACCES; + } + + if (buf == NULL) { + ret = nfs42_proc_removexattr(inode, key); + if (!ret) + nfs4_xattr_cache_remove(inode, key); + } else { + ret = nfs42_proc_setxattr(inode, key, buf, buflen, flags); + if (!ret) + nfs4_xattr_cache_add(inode, key, buf, NULL, buflen); + } + + return ret; +} + +static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *key, void *buf, size_t buflen) +{ + struct nfs_access_entry cache; + ssize_t ret; + + if (!nfs_server_capable(inode, NFS_CAP_XATTR)) + return -EOPNOTSUPP; + + if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) { + if (!(cache.mask & NFS_ACCESS_XAREAD)) + return -EACCES; + } + + ret = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (ret) + return ret; + + ret = nfs4_xattr_cache_get(inode, key, buf, buflen); + if (ret >= 0 || (ret < 0 && ret != -ENOENT)) + return ret; + + ret = nfs42_proc_getxattr(inode, key, buf, buflen); + + return ret; +} + +static ssize_t +nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len) +{ + u64 cookie; + bool eof; + ssize_t ret, size; + char *buf; + size_t buflen; + struct nfs_access_entry cache; + + if (!nfs_server_capable(inode, NFS_CAP_XATTR)) + return 
0; + + if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) { + if (!(cache.mask & NFS_ACCESS_XALIST)) + return 0; + } + + ret = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (ret) + return ret; + + ret = nfs4_xattr_cache_list(inode, list, list_len); + if (ret >= 0 || (ret < 0 && ret != -ENOENT)) + return ret; + + cookie = 0; + eof = false; + buflen = list_len ? list_len : XATTR_LIST_MAX; + buf = list_len ? list : NULL; + size = 0; + + while (!eof) { + ret = nfs42_proc_listxattrs(inode, buf, buflen, + &cookie, &eof); + if (ret < 0) + return ret; + + if (list_len) { + buf += ret; + buflen -= ret; + } + size += ret; + } + + if (list_len) + nfs4_xattr_cache_set_list(inode, list, size); + + return size; +} + +#else + +static ssize_t +nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len) +{ + return 0; +} +#endif /* CONFIG_NFS_V4_2 */ + /* * nfs_fhget will use either the mounted_on_fileid or the fileid */ @@ -10035,7 +10187,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) { - ssize_t error, error2; + ssize_t error, error2, error3; error = generic_listxattr(dentry, list, size); if (error < 0) @@ -10048,7 +10200,17 @@ static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size); if (error2 < 0) return error2; - return error + error2; + + if (list) { + list += error2; + size -= error2; + } + + error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, size); + if (error3 < 0) + return error3; + + return error + error2 + error3; } static const struct inode_operations nfs4_dir_inode_operations = { @@ -10136,11 +10298,22 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { .set = nfs4_xattr_set_nfs4_acl, }; +#ifdef CONFIG_NFS_V4_2 +static const struct xattr_handler nfs4_xattr_nfs4_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = 
nfs4_xattr_get_nfs4_user, + .set = nfs4_xattr_set_nfs4_user, +}; +#endif + const struct xattr_handler *nfs4_xattr_handlers[] = { &nfs4_xattr_nfs4_acl_handler, #ifdef CONFIG_NFS_V4_SECURITY_LABEL &nfs4_xattr_nfs4_label_handler, #endif +#ifdef CONFIG_NFS_V4_2 + &nfs4_xattr_nfs4_user_handler, +#endif NULL }; diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 1475f932d7da..0c1ab846b83d 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -69,6 +69,7 @@ static void nfs4_evict_inode(struct inode *inode) pnfs_destroy_layout(NFS_I(inode)); /* First call standard NFS clear_inode() code */ nfs_clear_inode(inode); + nfs4_xattr_cache_zap(inode); } struct nfs_referral_count { @@ -268,6 +269,12 @@ static int __init init_nfs_v4(void) if (err) goto out1; +#ifdef CONFIG_NFS_V4_2 + err = nfs4_xattr_cache_init(); + if (err) + goto out2; +#endif + err = nfs4_register_sysctl(); if (err) goto out2; @@ -288,6 +295,9 @@ static void __exit exit_nfs_v4(void) nfs4_pnfs_v3_ds_connect_unload(); unregister_nfs_version(&nfs_v4); +#ifdef CONFIG_NFS_V4_2 + nfs4_xattr_cache_exit(); +#endif nfs4_unregister_sysctl(); nfs_idmap_quit(); nfs_dns_resolver_destroy(); diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 543541173a3d..b4f852d4d099 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -1727,6 +1727,13 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_group_to_gid); DEFINE_NFS4_IDMAP_EVENT(nfs4_map_uid_to_name); DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group); +#ifdef CONFIG_NFS_V4_1 +#define NFS4_LSEG_LAYOUT_STATEID_HASH(lseg) \ + (lseg ? 
nfs_stateid_hash(&lseg->pls_layout->plh_stateid) : 0) +#else +#define NFS4_LSEG_LAYOUT_STATEID_HASH(lseg) (0) +#endif + DECLARE_EVENT_CLASS(nfs4_read_event, TP_PROTO( const struct nfs_pgio_header *hdr, @@ -1745,6 +1752,8 @@ DECLARE_EVENT_CLASS(nfs4_read_event, __field(unsigned long, error) __field(int, stateid_seq) __field(u32, stateid_hash) + __field(int, layoutstateid_seq) + __field(u32, layoutstateid_hash) ), TP_fast_assign( @@ -1754,6 +1763,7 @@ DECLARE_EVENT_CLASS(nfs4_read_event, hdr->args.fh : &nfsi->fh; const struct nfs4_state *state = hdr->args.context->state; + const struct pnfs_layout_segment *lseg = hdr->lseg; __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; @@ -1766,11 +1776,15 @@ DECLARE_EVENT_CLASS(nfs4_read_event, be32_to_cpu(state->stateid.seqid); __entry->stateid_hash = nfs_stateid_hash(&state->stateid); + __entry->layoutstateid_seq = lseg ? lseg->pls_seq : 0; + __entry->layoutstateid_hash = + NFS4_LSEG_LAYOUT_STATEID_HASH(lseg); ), TP_printk( "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%u res=%u stateid=%d:0x%08x", + "offset=%lld count=%u res=%u stateid=%d:0x%08x " + "layoutstateid=%d:0x%08x", -__entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), @@ -1778,7 +1792,8 @@ DECLARE_EVENT_CLASS(nfs4_read_event, __entry->fhandle, (long long)__entry->offset, __entry->arg_count, __entry->res_count, - __entry->stateid_seq, __entry->stateid_hash + __entry->stateid_seq, __entry->stateid_hash, + __entry->layoutstateid_seq, __entry->layoutstateid_hash ) ); #define DEFINE_NFS4_READ_EVENT(name) \ @@ -1811,6 +1826,8 @@ DECLARE_EVENT_CLASS(nfs4_write_event, __field(unsigned long, error) __field(int, stateid_seq) __field(u32, stateid_hash) + __field(int, layoutstateid_seq) + __field(u32, layoutstateid_hash) ), TP_fast_assign( @@ -1820,6 +1837,7 @@ DECLARE_EVENT_CLASS(nfs4_write_event, hdr->args.fh : &nfsi->fh; const struct nfs4_state *state = hdr->args.context->state; + const 
struct pnfs_layout_segment *lseg = hdr->lseg; __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; @@ -1832,11 +1850,15 @@ DECLARE_EVENT_CLASS(nfs4_write_event, be32_to_cpu(state->stateid.seqid); __entry->stateid_hash = nfs_stateid_hash(&state->stateid); + __entry->layoutstateid_seq = lseg ? lseg->pls_seq : 0; + __entry->layoutstateid_hash = + NFS4_LSEG_LAYOUT_STATEID_HASH(lseg); ), TP_printk( "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%u res=%u stateid=%d:0x%08x", + "offset=%lld count=%u res=%u stateid=%d:0x%08x " + "layoutstateid=%d:0x%08x", -__entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), @@ -1844,7 +1866,8 @@ DECLARE_EVENT_CLASS(nfs4_write_event, __entry->fhandle, (long long)__entry->offset, __entry->arg_count, __entry->res_count, - __entry->stateid_seq, __entry->stateid_hash + __entry->stateid_seq, __entry->stateid_hash, + __entry->layoutstateid_seq, __entry->layoutstateid_hash ) ); @@ -1875,6 +1898,8 @@ DECLARE_EVENT_CLASS(nfs4_commit_event, __field(unsigned long, error) __field(loff_t, offset) __field(u32, count) + __field(int, layoutstateid_seq) + __field(u32, layoutstateid_hash) ), TP_fast_assign( @@ -1882,6 +1907,7 @@ DECLARE_EVENT_CLASS(nfs4_commit_event, const struct nfs_inode *nfsi = NFS_I(inode); const struct nfs_fh *fh = data->args.fh ? data->args.fh : &nfsi->fh; + const struct pnfs_layout_segment *lseg = data->lseg; __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; @@ -1889,18 +1915,22 @@ DECLARE_EVENT_CLASS(nfs4_commit_event, __entry->offset = data->args.offset; __entry->count = data->args.count; __entry->error = error < 0 ? -error : 0; + __entry->layoutstateid_seq = lseg ? 
lseg->pls_seq : 0; + __entry->layoutstateid_hash = + NFS4_LSEG_LAYOUT_STATEID_HASH(lseg); ), TP_printk( "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%u", + "offset=%lld count=%u layoutstateid=%d:0x%08x", -__entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, - __entry->count + __entry->count, + __entry->layoutstateid_seq, __entry->layoutstateid_hash ) ); #define DEFINE_NFS4_COMMIT_EVENT(name) \ @@ -1993,7 +2023,9 @@ TRACE_EVENT(nfs4_layoutget, DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutcommit); DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn); -DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn_on_close); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layouterror); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutstats); TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_UNKNOWN); TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_NO_PNFS); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 47817ef0aadb..0b3510f62623 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4166,7 +4166,11 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, return -EIO; if (len < NFS4_MAXLABELLEN) { if (label) { - memcpy(label->label, p, len); + if (label->len) { + if (label->len < len) + return -ERANGE; + memcpy(label->label, p, len); + } label->len = len; label->pi = pi; label->lfs = lfs; @@ -4201,6 +4205,26 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str return status; } +static int decode_attr_xattrsupport(struct xdr_stream *xdr, uint32_t *bitmap, + uint32_t *res) +{ + __be32 *p; + + *res = 0; + if (unlikely(bitmap[2] & (FATTR4_WORD2_XATTR_SUPPORT - 1U))) + return -EIO; + if (likely(bitmap[2] & FATTR4_WORD2_XATTR_SUPPORT)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + *res = be32_to_cpup(p); + bitmap[2] &= 
~FATTR4_WORD2_XATTR_SUPPORT; + } + dprintk("%s: XATTR support=%s\n", __func__, + *res == 0 ? "false" : "true"); + return 0; +} + static int verify_attr_len(struct xdr_stream *xdr, unsigned int savep, uint32_t attrlen) { unsigned int attrwords = XDR_QUADLEN(attrlen); @@ -4855,6 +4879,11 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) if (status) goto xdr_error; + status = decode_attr_xattrsupport(xdr, bitmap, + &fsinfo->xattr_support); + if (status) + goto xdr_error; + status = verify_attr_len(xdr, savep, attrlen); xdr_error: dprintk("%s: xdr returned %d!\n", __func__, -status); @@ -5227,7 +5256,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) * The XDR encode routine has set things up so that * the link text will be copied directly into the * buffer. We just have to do overflow-checking, - * and and null-terminate the text (the VFS expects + * and null-terminate the text (the VFS expects * null-termination). */ xdr_terminate_string(rcvbuf, len); @@ -7456,6 +7485,8 @@ static struct { { NFS4ERR_SYMLINK, -ELOOP }, { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, { NFS4ERR_DEADLOCK, -EDEADLK }, + { NFS4ERR_NOXATTR, -ENODATA }, + { NFS4ERR_XATTR2BIG, -E2BIG }, { -1, -EIO } }; @@ -7584,6 +7615,10 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC42(COPY_NOTIFY, enc_copy_notify, dec_copy_notify), PROC(LOOKUPP, enc_lookupp, dec_lookupp), PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror), + PROC42(GETXATTR, enc_getxattr, dec_getxattr), + PROC42(SETXATTR, enc_setxattr, dec_setxattr), + PROC42(LISTXATTRS, enc_listxattrs, dec_listxattrs), + PROC42(REMOVEXATTR, enc_removexattr, dec_removexattr), }; static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)]; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 547cec79899f..5a59dcdce0b2 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -59,7 +59,8 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_OTHER); { NFS_INO_INVALID_CTIME, "INVALID_CTIME" }, \ { 
NFS_INO_INVALID_MTIME, "INVALID_MTIME" }, \ { NFS_INO_INVALID_SIZE, "INVALID_SIZE" }, \ - { NFS_INO_INVALID_OTHER, "INVALID_OTHER" }) + { NFS_INO_INVALID_OTHER, "INVALID_OTHER" }, \ + { NFS_INO_INVALID_XATTR, "INVALID_XATTR" }) TRACE_DEFINE_ENUM(NFS_INO_ADVISE_RDPLUS); TRACE_DEFINE_ENUM(NFS_INO_STALE); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index dd2e14f5875d..40332c758d84 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1226,31 +1226,27 @@ out: return status; } +static bool +pnfs_layout_segments_returnable(struct pnfs_layout_hdr *lo, + enum pnfs_iomode iomode, + u32 seq) +{ + struct pnfs_layout_range recall_range = { + .length = NFS4_MAX_UINT64, + .iomode = iomode, + }; + return pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, + &recall_range, seq) != -EBUSY; +} + /* Return true if layoutreturn is needed */ static bool pnfs_layout_need_return(struct pnfs_layout_hdr *lo) { - struct pnfs_layout_segment *s; - enum pnfs_iomode iomode; - u32 seq; - if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) return false; - - seq = lo->plh_return_seq; - iomode = lo->plh_return_iomode; - - /* Defer layoutreturn until all recalled lsegs are done */ - list_for_each_entry(s, &lo->plh_segs, pls_list) { - if (seq && pnfs_seqid_is_newer(s->pls_seq, seq)) - continue; - if (iomode != IOMODE_ANY && s->pls_range.iomode != iomode) - continue; - if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags)) - return false; - } - - return true; + return pnfs_layout_segments_returnable(lo, lo->plh_return_iomode, + lo->plh_return_seq); } static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) @@ -1549,12 +1545,12 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args, default: arg_stateid = &args->stateid; } + trace_nfs4_layoutreturn_on_close(args->inode, &args->stateid, ret); pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range, res_stateid); if (ld_private && ld_private->ops && ld_private->ops->free) ld_private->ops->free(ld_private); 
pnfs_put_layout_hdr(lo); - trace_nfs4_layoutreturn_on_close(args->inode, 0); } bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) @@ -2392,16 +2388,6 @@ out_forget: return ERR_PTR(-EAGAIN); } -static int -mark_lseg_invalid_or_return(struct pnfs_layout_segment *lseg, - struct list_head *tmp_list) -{ - if (!mark_lseg_invalid(lseg, tmp_list)) - return 0; - pnfs_cache_lseg_for_layoutreturn(lseg->pls_layout, lseg); - return 1; -} - /** * pnfs_mark_matching_lsegs_return - Free or return matching layout segments * @lo: pointer to layout header @@ -2438,7 +2424,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, lseg, lseg->pls_range.iomode, lseg->pls_range.offset, lseg->pls_range.length); - if (mark_lseg_invalid_or_return(lseg, tmp_list)) + if (mark_lseg_invalid(lseg, tmp_list)) continue; remaining++; set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); @@ -2953,7 +2939,8 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr, } /* Resend all requests through pnfs. 
*/ -void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) +void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr, + unsigned int mirror_idx) { struct nfs_pageio_descriptor pgio; @@ -2964,6 +2951,7 @@ void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops); + pgio.pg_mirror_idx = mirror_idx; hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr); } } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 8e0ada581b92..2661c44c62db 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -311,7 +311,7 @@ int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_pgio_header *); void pnfs_ld_read_done(struct nfs_pgio_header *); -void pnfs_read_resend_pnfs(struct nfs_pgio_header *); +void pnfs_read_resend_pnfs(struct nfs_pgio_header *, unsigned int mirror_idx); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, |