diff options
author | Linus Torvalds | 2022-05-23 12:42:33 -0700 |
---|---|---|
committer | Linus Torvalds | 2022-05-23 12:42:33 -0700 |
commit | 368da430d04dbe31aded44e5f5c255ff0899ad1d (patch) | |
tree | 0a66b646d202e5daf31b1fca031bcec17e40a192 | |
parent | 09beaff75e4c4cee590b0e547c7587ff6cadb5db (diff) | |
parent | 033b87d24f7257c45506bd043ad85ed24a9925e2 (diff) |
Merge tag 'for-5.19/io_uring-socket-2022-05-22' of git://git.kernel.dk/linux-block
Pull io_uring socket() support from Jens Axboe:
"This adds support for socket(2) for io_uring. This is handy when using
direct / registered file descriptors with io_uring.
Outside of those two patches, a small series from Dylan on top that
improves the tracing by providing a text representation of the opcode
rather than needing to decode this by reading the header file every
time.
That sits in this branch as it was the last opcode added (until it
wasn't...)"
* tag 'for-5.19/io_uring-socket-2022-05-22' of git://git.kernel.dk/linux-block:
io_uring: use the text representation of ops in trace
io_uring: rename op -> opcode
io_uring: add io_uring_get_opcode
io_uring: add type to op enum
io_uring: add socket(2) support
net: add __sys_socket_file()
-rw-r--r-- | fs/io_uring.c | 177 | ||||
-rw-r--r-- | include/linux/io_uring.h | 5 | ||||
-rw-r--r-- | include/linux/socket.h | 1 | ||||
-rw-r--r-- | include/trace/events/io_uring.h | 36 | ||||
-rw-r--r-- | include/uapi/linux/io_uring.h | 3 | ||||
-rw-r--r-- | net/socket.c | 52 |
6 files changed, 248 insertions, 26 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c index e4c9f776919b..d9529275a030 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -600,6 +600,16 @@ struct io_accept { unsigned long nofile; }; +struct io_socket { + struct file *file; + int domain; + int type; + int protocol; + int flags; + u32 file_slot; + unsigned long nofile; +}; + struct io_sync { struct file *file; loff_t len; @@ -981,6 +991,7 @@ struct io_kiocb { struct io_hardlink hardlink; struct io_msg msg; struct io_xattr xattr; + struct io_socket sock; }; u8 opcode; @@ -1297,6 +1308,9 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1 }, [IORING_OP_GETXATTR] = {}, + [IORING_OP_SOCKET] = { + .audit_skip = 1, + }, }; /* requests with any of those set should undergo io_disarm_next() */ @@ -1341,6 +1355,107 @@ static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; +const char *io_uring_get_opcode(u8 opcode) +{ + switch ((enum io_uring_op)opcode) { + case IORING_OP_NOP: + return "NOP"; + case IORING_OP_READV: + return "READV"; + case IORING_OP_WRITEV: + return "WRITEV"; + case IORING_OP_FSYNC: + return "FSYNC"; + case IORING_OP_READ_FIXED: + return "READ_FIXED"; + case IORING_OP_WRITE_FIXED: + return "WRITE_FIXED"; + case IORING_OP_POLL_ADD: + return "POLL_ADD"; + case IORING_OP_POLL_REMOVE: + return "POLL_REMOVE"; + case IORING_OP_SYNC_FILE_RANGE: + return "SYNC_FILE_RANGE"; + case IORING_OP_SENDMSG: + return "SENDMSG"; + case IORING_OP_RECVMSG: + return "RECVMSG"; + case IORING_OP_TIMEOUT: + return "TIMEOUT"; + case IORING_OP_TIMEOUT_REMOVE: + return "TIMEOUT_REMOVE"; + case IORING_OP_ACCEPT: + return "ACCEPT"; + case IORING_OP_ASYNC_CANCEL: + return "ASYNC_CANCEL"; + case IORING_OP_LINK_TIMEOUT: + return "LINK_TIMEOUT"; + case IORING_OP_CONNECT: + return "CONNECT"; + case IORING_OP_FALLOCATE: + return "FALLOCATE"; + case IORING_OP_OPENAT: + return "OPENAT"; + case IORING_OP_CLOSE: + return "CLOSE"; + case IORING_OP_FILES_UPDATE: + return "FILES_UPDATE"; + case IORING_OP_STATX: + return "STATX"; + case IORING_OP_READ: + return "READ"; + case IORING_OP_WRITE: + return "WRITE"; + case IORING_OP_FADVISE: + return "FADVISE"; + case IORING_OP_MADVISE: + return "MADVISE"; + case IORING_OP_SEND: + return "SEND"; + case IORING_OP_RECV: + return "RECV"; + case IORING_OP_OPENAT2: + return "OPENAT2"; + case IORING_OP_EPOLL_CTL: + return "EPOLL_CTL"; + case IORING_OP_SPLICE: + return "SPLICE"; + case IORING_OP_PROVIDE_BUFFERS: + return "PROVIDE_BUFFERS"; + case IORING_OP_REMOVE_BUFFERS: + return "REMOVE_BUFFERS"; + case IORING_OP_TEE: + return "TEE"; + case IORING_OP_SHUTDOWN: + return "SHUTDOWN"; + case IORING_OP_RENAMEAT: + return "RENAMEAT"; + case IORING_OP_UNLINKAT: + return "UNLINKAT"; + case IORING_OP_MKDIRAT: + return "MKDIRAT"; + case IORING_OP_SYMLINKAT: + return "SYMLINKAT"; + case IORING_OP_LINKAT: + return "LINKAT"; + case IORING_OP_MSG_RING: + return "MSG_RING"; + case IORING_OP_FSETXATTR: + return "FSETXATTR"; + case IORING_OP_SETXATTR: + return "SETXATTR"; + case IORING_OP_FGETXATTR: + return "FGETXATTR"; + case IORING_OP_GETXATTR: + return "GETXATTR"; + case IORING_OP_SOCKET: + return "SOCKET"; + case IORING_OP_LAST: + return "INVALID"; + } + return "INVALID"; +} + struct sock *io_uring_get_socket(struct file *file) { #if defined(CONFIG_UNIX) @@ -6237,6 +6352,62 @@ retry: return ret; } +static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_socket *sock = &req->sock; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->addr || sqe->rw_flags || sqe->buf_index) + return -EINVAL; + + sock->domain = READ_ONCE(sqe->fd); + sock->type = READ_ONCE(sqe->off); + sock->protocol = READ_ONCE(sqe->len); + sock->file_slot = READ_ONCE(sqe->file_index); + sock->nofile = rlimit(RLIMIT_NOFILE); + + sock->flags = sock->type & ~SOCK_TYPE_MASK; + if (sock->file_slot && (sock->flags & SOCK_CLOEXEC)) + return -EINVAL; + if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + return 0; +} + +static int io_socket(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_socket *sock = &req->sock; + bool fixed = !!sock->file_slot; + struct file *file; + int ret, fd; + + if (!fixed) { + fd = __get_unused_fd_flags(sock->flags, sock->nofile); + if (unlikely(fd < 0)) + return fd; + } + file = __sys_socket_file(sock->domain, sock->type, sock->protocol); + if (IS_ERR(file)) { + if (!fixed) + put_unused_fd(fd); + ret = PTR_ERR(file); + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } else if (!fixed) { + fd_install(fd, file); + ret = fd; + } else { + ret = io_install_fixed_file(req, file, issue_flags, + sock->file_slot - 1); + } + __io_req_complete(req, issue_flags, ret, 0); + return 0; +} + static int io_connect_prep_async(struct io_kiocb *req) { struct io_async_connect *io = req->async_data; @@ -6322,6 +6493,7 @@ IO_NETOP_PREP_ASYNC(sendmsg); IO_NETOP_PREP_ASYNC(recvmsg); IO_NETOP_PREP_ASYNC(connect); IO_NETOP_PREP(accept); +IO_NETOP_PREP(socket); IO_NETOP_FN(send); IO_NETOP_FN(recv); #endif /* CONFIG_NET */ @@ -7651,6 +7823,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_fgetxattr_prep(req, sqe); case IORING_OP_GETXATTR: return io_getxattr_prep(req, sqe); + case IORING_OP_SOCKET: + return io_socket_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -7974,6 +8148,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_GETXATTR: ret = io_getxattr(req, issue_flags); break; + case IORING_OP_SOCKET: + ret = io_socket(req, issue_flags); + break; default: ret = -EINVAL; break; diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 1814e698d861..24651c229ed2 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -10,6 +10,7 @@ struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); void io_uring_unreg_ringfd(void); +const char *io_uring_get_opcode(u8 opcode); static inline void io_uring_files_cancel(void) { @@ -42,6 +43,10 @@ static inline void io_uring_files_cancel(void) static inline void io_uring_free(struct task_struct *tsk) { } +static inline const char *io_uring_get_opcode(u8 opcode) +{ + return ""; +} #endif #endif diff --git a/include/linux/socket.h b/include/linux/socket.h index 6f85f5d957ef..a1882e1e71d2 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -434,6 +434,7 @@ extern struct file *do_accept(struct file *file, unsigned file_flags, extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_socket(int family, int type, int protocol); +extern struct file *__sys_socket_file(int family, int type, int protocol); extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, int addrlen, int file_flags); diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index f74edfc75f41..610e493234db 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -7,6 +7,7 @@ #include <linux/tracepoint.h> #include <uapi/linux/io_uring.h> +#include <linux/io_uring.h> struct io_wq_work; @@ -169,8 +170,9 @@ TRACE_EVENT(io_uring_queue_async_work, __entry->rw = rw; ), - TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d, flags 0x%x, %s queue, work %p", - __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, + TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p", + __entry->ctx, __entry->req, __entry->user_data, + io_uring_get_opcode(__entry->opcode), __entry->flags, __entry->rw ? "hashed" : "normal", __entry->work) ); @@ -205,8 +207,9 @@ TRACE_EVENT(io_uring_defer, __entry->opcode = opcode; ), - TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d", - __entry->ctx, __entry->req, __entry->data, __entry->opcode) + TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s", + __entry->ctx, __entry->req, __entry->data, + io_uring_get_opcode(__entry->opcode)) ); /** @@ -305,9 +308,9 @@ TRACE_EVENT(io_uring_fail_link, __entry->link = link; ), - TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d, link %p", - __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, - __entry->link) + TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p", + __entry->ctx, __entry->req, __entry->user_data, + io_uring_get_opcode(__entry->opcode), __entry->link) ); /** @@ -389,9 +392,9 @@ TRACE_EVENT(io_uring_submit_sqe, __entry->sq_thread = sq_thread; ), - TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, flags 0x%x, " + TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, " "non block %d, sq_thread %d", __entry->ctx, __entry->req, - __entry->user_data, __entry->opcode, + __entry->user_data, io_uring_get_opcode(__entry->opcode), __entry->flags, __entry->force_nonblock, __entry->sq_thread) ); @@ -433,8 +436,9 @@ TRACE_EVENT(io_uring_poll_arm, __entry->events = events; ), - TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, mask 0x%x, events 0x%x", - __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, + TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x", + __entry->ctx, __entry->req, __entry->user_data, + io_uring_get_opcode(__entry->opcode), __entry->mask, __entry->events) ); @@ -470,8 +474,9 @@ TRACE_EVENT(io_uring_task_add, __entry->mask = mask; ), - TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, mask %x", - __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, + TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x", + __entry->ctx, __entry->req, __entry->user_data, + io_uring_get_opcode(__entry->opcode), __entry->mask) ); @@ -530,12 +535,13 @@ TRACE_EVENT(io_uring_req_failed, ), TP_printk("ring %p, req %p, user_data 0x%llx, " - "op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, " + "opcode %s, flags 0x%x, prio=%d, off=%llu, addr=%llu, " "len=%u, rw_flags=0x%x, buf_index=%d, " "personality=%d, file_index=%d, pad=0x%llx, addr3=%llx, " "error=%d", __entry->ctx, __entry->req, __entry->user_data, - __entry->opcode, __entry->flags, __entry->ioprio, + io_uring_get_opcode(__entry->opcode), + __entry->flags, __entry->ioprio, (unsigned long long)__entry->off, (unsigned long long) __entry->addr, __entry->len, __entry->op_flags, diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 199bbf73a752..804916814e51 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -128,7 +128,7 @@ enum { */ #define IORING_SETUP_TASKRUN_FLAG (1U << 9) -enum { +enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, IORING_OP_WRITEV, @@ -174,6 +174,7 @@ enum { IORING_OP_SETXATTR, IORING_OP_FGETXATTR, IORING_OP_GETXATTR, + IORING_OP_SOCKET, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/net/socket.c b/net/socket.c index 6887840682bb..bb6a1a12fbde 100644 --- a/net/socket.c +++ b/net/socket.c @@ -504,7 +504,7 @@ static int sock_map_fd(struct socket *sock, int flags) struct socket *sock_from_file(struct file *file) { if (file->f_op == &socket_file_ops) - return file->private_data; /* set in sock_map_fd */ + return file->private_data; /* set in sock_alloc_file */ return NULL; } @@ -1538,11 +1538,10 @@ int sock_create_kern(struct net *net, int family, int type, int protocol, struct } EXPORT_SYMBOL(sock_create_kern); -int __sys_socket(int family, int type, int protocol) +static struct socket *__sys_socket_create(int family, int type, int protocol) { - int retval; struct socket *sock; - int flags; + int retval; /* Check the SOCK_* constants for consistency. */ BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC); @@ -1550,17 +1549,50 @@ int __sys_socket(int family, int type, int protocol) BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK); - flags = type & ~SOCK_TYPE_MASK; - if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; + if ((type & ~SOCK_TYPE_MASK) & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return ERR_PTR(-EINVAL); type &= SOCK_TYPE_MASK; + retval = sock_create(family, type, protocol, &sock); + if (retval < 0) + return ERR_PTR(retval); + + return sock; +} + +struct file *__sys_socket_file(int family, int type, int protocol) +{ + struct socket *sock; + struct file *file; + int flags; + + sock = __sys_socket_create(family, type, protocol); + if (IS_ERR(sock)) + return ERR_CAST(sock); + + flags = type & ~SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - retval = sock_create(family, type, protocol, &sock); - if (retval < 0) - return retval; + file = sock_alloc_file(sock, flags, NULL); + if (IS_ERR(file)) + sock_release(sock); + + return file; +} + +int __sys_socket(int family, int type, int protocol) +{ + struct socket *sock; + int flags; + + sock = __sys_socket_create(family, type, protocol); + if (IS_ERR(sock)) + return PTR_ERR(sock); + + flags = type & ~SOCK_TYPE_MASK; + if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) + flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); } |