From 3d8fa456d5ed22ce8db085a89a037b87568b2b64 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Tue, 30 Apr 2013 19:14:25 -0700 Subject: ipc: clamp with min() Signed-off-by: Peter Hurley Acked-by: Stanislav Kinsbursky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msgutil.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 5df8e4bf1db0..98b1c2b476cc 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -41,8 +41,8 @@ struct msg_msgseg { /* the next part of the message follows immediately */ }; -#define DATALEN_MSG (PAGE_SIZE-sizeof(struct msg_msg)) -#define DATALEN_SEG (PAGE_SIZE-sizeof(struct msg_msgseg)) +#define DATALEN_MSG (int)(PAGE_SIZE-sizeof(struct msg_msg)) +#define DATALEN_SEG (int)(PAGE_SIZE-sizeof(struct msg_msgseg)) struct msg_msg *load_msg(const void __user *src, int len) { @@ -51,10 +51,7 @@ struct msg_msg *load_msg(const void __user *src, int len) int err; int alen; - alen = len; - if (alen > DATALEN_MSG) - alen = DATALEN_MSG; - + alen = min(len, DATALEN_MSG); msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL); if (msg == NULL) return ERR_PTR(-ENOMEM); @@ -72,9 +69,7 @@ struct msg_msg *load_msg(const void __user *src, int len) pseg = &msg->next; while (len > 0) { struct msg_msgseg *seg; - alen = len; - if (alen > DATALEN_SEG) - alen = DATALEN_SEG; + alen = min(len, DATALEN_SEG); seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL); if (seg == NULL) { @@ -113,19 +108,14 @@ struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst) if (src->m_ts > dst->m_ts) return ERR_PTR(-EINVAL); - alen = len; - if (alen > DATALEN_MSG) - alen = DATALEN_MSG; - + alen = min(len, DATALEN_MSG); memcpy(dst + 1, src + 1, alen); len -= alen; dst_pseg = dst->next; src_pseg = src->next; while (len > 0) { - alen = len; - if (alen > DATALEN_SEG) - alen = DATALEN_SEG; + alen = min(len, DATALEN_SEG); memcpy(dst_pseg + 1, src_pseg + 1, alen); dst_pseg = dst_pseg->next; len -= alen; @@ -148,9 +138,7 @@ int store_msg(void __user *dest, struct msg_msg *msg, int len) int alen; struct msg_msgseg *seg; - alen = len; - if (alen > DATALEN_MSG) - alen = DATALEN_MSG; + alen = min(len, DATALEN_MSG); if (copy_to_user(dest, msg + 1, alen)) return -1; @@ -158,9 +146,7 @@ int store_msg(void __user *dest, struct msg_msg *msg, int len) dest = ((char __user *)dest) + alen; seg = msg->next; while (len > 0) { - alen = len; - if (alen > DATALEN_SEG) - alen = DATALEN_SEG; + alen = min(len, DATALEN_SEG); if (copy_to_user(dest, seg + 1, alen)) return -1; len -= alen; -- cgit v1.2.3 From be5f4b335f6e05df1b5c24b7e7d79ff52d7b8dbc Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Tue, 30 Apr 2013 19:14:31 -0700 Subject: ipc: separate msg allocation from userspace copy Separating msg allocation enables single-block vmalloc allocation instead. 
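A note on the clamp patch above: the (int) casts on DATALEN_MSG and DATALEN_SEG are what make the min() conversion legal. 'len' is a signed int, while PAGE_SIZE - sizeof(...) is unsigned (size_t arithmetic), and the kernel's min() macro refuses to compare operands of different types. A minimal user-space sketch of that type discipline; the macro and struct below are illustrative stand-ins, not kernel code:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL                         /* stand-in value */

    struct msg_hdr { void *next; void *security; };  /* illustrative */

    /* Without the (int) cast this expression stays unsigned, and the
     * type-checked min() below would warn when comparing it to an int. */
    #define DATALEN_MSG (int)(PAGE_SIZE - sizeof(struct msg_hdr))

    /* Same-type min() in the spirit of the kernel macro. */
    #define MIN_CHECKED(x, y) __extension__ ({              \
            __typeof__(x) _x = (x);                         \
            __typeof__(y) _y = (y);                         \
            (void)(&_x == &_y); /* warns on mismatched types */ \
            _x < _y ? _x : _y; })

    int main(void)
    {
            int len = 70000;
            printf("first chunk: %d bytes\n", MIN_CHECKED(len, DATALEN_MSG));
            return 0;
    }

The same clamped page-sized chunk layout is what the next patch splits into an allocation phase and a copy phase.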
Signed-off-by: Peter Hurley Acked-by: Stanislav Kinsbursky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msgutil.c | 52 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 98b1c2b476cc..0a5c8a95c257 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -44,21 +44,54 @@ struct msg_msgseg { #define DATALEN_MSG (int)(PAGE_SIZE-sizeof(struct msg_msg)) #define DATALEN_SEG (int)(PAGE_SIZE-sizeof(struct msg_msgseg)) -struct msg_msg *load_msg(const void __user *src, int len) + +static struct msg_msg *alloc_msg(int len) { struct msg_msg *msg; struct msg_msgseg **pseg; - int err; int alen; alen = min(len, DATALEN_MSG); msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL); if (msg == NULL) - return ERR_PTR(-ENOMEM); + return NULL; msg->next = NULL; msg->security = NULL; + len -= alen; + pseg = &msg->next; + while (len > 0) { + struct msg_msgseg *seg; + alen = min(len, DATALEN_SEG); + seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL); + if (seg == NULL) + goto out_err; + *pseg = seg; + seg->next = NULL; + pseg = &seg->next; + len -= alen; + } + + return msg; + +out_err: + free_msg(msg); + return NULL; +} + +struct msg_msg *load_msg(const void __user *src, int len) +{ + struct msg_msg *msg; + struct msg_msgseg *seg; + int err; + int alen; + + msg = alloc_msg(len); + if (msg == NULL) + return ERR_PTR(-ENOMEM); + + alen = min(len, DATALEN_MSG); if (copy_from_user(msg + 1, src, alen)) { err = -EFAULT; goto out_err; @@ -66,23 +99,14 @@ struct msg_msg *load_msg(const void __user *src, int len) len -= alen; src = ((char __user *)src) + alen; - pseg = &msg->next; + seg = msg->next; while (len > 0) { - struct msg_msgseg *seg; alen = min(len, DATALEN_SEG); - seg = kmalloc(sizeof(*seg) + alen, - GFP_KERNEL); - if (seg == NULL) { - err = -ENOMEM; - goto out_err; - } - *pseg = seg; - seg->next = NULL; if (copy_from_user(seg + 1, src, alen)) { err = -EFAULT; goto out_err; } - pseg = &seg->next; + seg = seg->next; len -= alen; src = ((char __user *)src) + alen; } -- cgit v1.2.3 From da085d4591a6fe11eac2e1f659f25b655e9f2e53 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Tue, 30 Apr 2013 19:14:37 -0700 Subject: ipc: tighten msg copy loops Signed-off-by: Peter Hurley Acked-by: Stanislav Kinsbursky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msgutil.c | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 0a5c8a95c257..b79582d461a4 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -97,18 +97,14 @@ struct msg_msg *load_msg(const void __user *src, int len) goto out_err; } - len -= alen; - src = ((char __user *)src) + alen; - seg = msg->next; - while (len > 0) { + for (seg = msg->next; seg != NULL; seg = seg->next) { + len -= alen; + src = (char __user *)src + alen; alen = min(len, DATALEN_SEG); if (copy_from_user(seg + 1, src, alen)) { err = -EFAULT; goto out_err; } - seg = seg->next; - len -= alen; - src = ((char __user *)src) + alen; } err = security_msg_msg_alloc(msg); @@ -135,15 +131,13 @@ struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst) alen = min(len, DATALEN_MSG); memcpy(dst + 1, src + 1, alen); - len -= alen; - dst_pseg = dst->next; - src_pseg = src->next; - while (len > 0) { + for (dst_pseg = dst->next, src_pseg = src->next; + src_pseg != NULL; + dst_pseg = dst_pseg->next, src_pseg = src_pseg->next) { + + len -= alen; alen = min(len, DATALEN_SEG); memcpy(dst_pseg + 1, src_pseg + 1, 
alen); - dst_pseg = dst_pseg->next; - len -= alen; - src_pseg = src_pseg->next; } dst->m_type = src->m_type; @@ -166,16 +160,12 @@ int store_msg(void __user *dest, struct msg_msg *msg, int len) if (copy_to_user(dest, msg + 1, alen)) return -1; - len -= alen; - dest = ((char __user *)dest) + alen; - seg = msg->next; - while (len > 0) { + for (seg = msg->next; seg != NULL; seg = seg->next) { + len -= alen; + dest = (char __user *)dest + alen; alen = min(len, DATALEN_SEG); if (copy_to_user(dest, seg + 1, alen)) return -1; - len -= alen; - dest = ((char __user *)dest) + alen; - seg = seg->next; } return 0; } -- cgit v1.2.3 From 2b3097a294b6daaf390010de14ca50bfccbc6fb6 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Tue, 30 Apr 2013 19:14:42 -0700 Subject: ipc: set EFAULT as default error in load_msg() Signed-off-by: Peter Hurley Acked-by: Stanislav Kinsbursky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msgutil.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/ipc/msgutil.c b/ipc/msgutil.c index b79582d461a4..d33fbb2743bd 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -84,7 +84,7 @@ struct msg_msg *load_msg(const void __user *src, int len) { struct msg_msg *msg; struct msg_msgseg *seg; - int err; + int err = -EFAULT; int alen; msg = alloc_msg(len); @@ -92,19 +92,15 @@ struct msg_msg *load_msg(const void __user *src, int len) return ERR_PTR(-ENOMEM); alen = min(len, DATALEN_MSG); - if (copy_from_user(msg + 1, src, alen)) { - err = -EFAULT; + if (copy_from_user(msg + 1, src, alen)) goto out_err; - } for (seg = msg->next; seg != NULL; seg = seg->next) { len -= alen; src = (char __user *)src + alen; alen = min(len, DATALEN_SEG); - if (copy_from_user(seg + 1, src, alen)) { - err = -EFAULT; + if (copy_from_user(seg + 1, src, alen)) goto out_err; - } } err = security_msg_msg_alloc(msg); -- cgit v1.2.3 From 852028af861ed6c7ab7e73053dd664eb28e55200 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Tue, 30 Apr 2013 19:14:48 -0700 Subject: ipc: remove msg handling from queue scan In preparation for refactoring the queue scan into a separate function, relocate msg copying. Signed-off-by: Peter Hurley Acked-by: Stanislav Kinsbursky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index fede1d06ef30..b46473074662 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -862,16 +862,8 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, walk_msg->m_type != 1) { msgtyp = walk_msg->m_type - 1; } else if (msgflg & MSG_COPY) { - if (copy_number == msg_counter) { - /* - * Found requested message. - * Copy it. - */ - msg = copy_msg(msg, copy); - if (IS_ERR(msg)) - goto out_unlock; + if (copy_number == msg_counter) break; - } msg = ERR_PTR(-EAGAIN); } else break; @@ -892,8 +884,10 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, * If we are copying, then do not unlink message and do * not update queue parameters. */ - if (msgflg & MSG_COPY) + if (msgflg & MSG_COPY) { + msg = copy_msg(msg, copy); goto out_unlock; + } list_del(&msg->m_list); msq->q_qnum--; msq->q_rtime = get_seconds(); -- cgit v1.2.3 From 8ac6ed5857c8d583e0dc2ab2165966ab143930ad Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Tue, 30 Apr 2013 19:14:54 -0700 Subject: ipc: implement MSG_COPY as a new receive mode Teach the helper routines about MSG_COPY so that msgtyp is preserved as the message number to copy. 
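For context on how the new mode is driven from user space: with MSG_COPY set, msgtyp is no longer a message-type filter but the zero-based position of the message to read, and the message is copied out rather than dequeued. A hedged usage sketch (MSG_COPY is Linux-specific, is only available when the kernel has CONFIG_CHECKPOINT_RESTORE, and must be combined with IPC_NOWAIT; the wrapper below is illustrative, not part of the patch):

    #include <sys/types.h>
    #include <sys/ipc.h>
    #include <sys/msg.h>

    #ifndef MSG_COPY
    #define MSG_COPY 040000         /* copy, do not remove (linux/msg.h) */
    #endif

    struct peek_buf { long mtype; char mtext[8192]; };

    /* Copy out message number n without removing it from the queue. */
    static ssize_t peek_msg(int msqid, long n, struct peek_buf *buf)
    {
            return msgrcv(msqid, buf, sizeof(buf->mtext), n,
                          MSG_COPY | IPC_NOWAIT | MSG_NOERROR);
    }

Repeated calls with n = 0, 1, 2, ... walk the queue non-destructively, which is how CRIU checkpoints message queues.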
The security functions affected by this change were audited and no additional changes are necessary. Signed-off-by: Peter Hurley Acked-by: Stanislav Kinsbursky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index b46473074662..7c209b4f5e38 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -66,6 +66,7 @@ struct msg_sender { #define SEARCH_EQUAL 2 #define SEARCH_NOTEQUAL 3 #define SEARCH_LESSEQUAL 4 +#define SEARCH_NUMBER 5 #define msg_ids(ns) ((ns)->ids[IPC_MSG_IDS]) @@ -583,6 +584,7 @@ static int testmsg(struct msg_msg *msg, long type, int mode) switch(mode) { case SEARCH_ANY: + case SEARCH_NUMBER: return 1; case SEARCH_LESSEQUAL: if (msg->m_type <=type) @@ -738,6 +740,8 @@ SYSCALL_DEFINE4(msgsnd, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, static inline int convert_mode(long *msgtyp, int msgflg) { + if (msgflg & MSG_COPY) + return SEARCH_NUMBER; /* * find message of correct type. * msgtyp = 0 => get first. @@ -774,14 +778,10 @@ static long do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) * This function creates new kernel message structure, large enough to store * bufsz message bytes. */ -static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz, - int msgflg, long *msgtyp, - unsigned long *copy_number) +static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz) { struct msg_msg *copy; - *copy_number = *msgtyp; - *msgtyp = 0; /* * Create dummy message to copy real message to. */ @@ -797,9 +797,7 @@ static inline void free_copy(struct msg_msg *copy) free_msg(copy); } #else -static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz, - int msgflg, long *msgtyp, - unsigned long *copy_number) +static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz) { return ERR_PTR(-ENOSYS); } @@ -818,15 +816,13 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int mode; struct ipc_namespace *ns; struct msg_msg *copy = NULL; - unsigned long copy_number = 0; ns = current->nsproxy->ipc_ns; if (msqid < 0 || (long) bufsz < 0) return -EINVAL; if (msgflg & MSG_COPY) { - copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax), - msgflg, &msgtyp, ©_number); + copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax)); if (IS_ERR(copy)) return PTR_ERR(copy); } @@ -861,8 +857,8 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, if (mode == SEARCH_LESSEQUAL && walk_msg->m_type != 1) { msgtyp = walk_msg->m_type - 1; - } else if (msgflg & MSG_COPY) { - if (copy_number == msg_counter) + } else if (mode == SEARCH_NUMBER) { + if (msgtyp == msg_counter) break; msg = ERR_PTR(-EAGAIN); } else -- cgit v1.2.3 From d076ac9112797884c0be35f4c93c1517aa352c0c Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Tue, 30 Apr 2013 19:14:59 -0700 Subject: ipc: simplify msg list search Signed-off-by: Peter Hurley Acked-by: Stanislav Kinsbursky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index 7c209b4f5e38..a92275023134 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -836,7 +836,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, for (;;) { struct msg_receiver msr_d; - struct list_head *tmp; + struct msg_msg *walk_msg; long msg_counter = 0; msg = ERR_PTR(-EACCES); @@ -844,11 +844,8 @@ long do_msgrcv(int msqid, void __user 
*buf, size_t bufsz, long msgtyp, goto out_unlock; msg = ERR_PTR(-EAGAIN); - tmp = msq->q_messages.next; - while (tmp != &msq->q_messages) { - struct msg_msg *walk_msg; + list_for_each_entry(walk_msg, &msq->q_messages, m_list) { - walk_msg = list_entry(tmp, struct msg_msg, m_list); if (testmsg(walk_msg, msgtyp, mode) && !security_msg_queue_msgrcv(msq, walk_msg, current, msgtyp, mode)) { @@ -865,7 +862,6 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, break; msg_counter++; } - tmp = tmp->next; } if (!IS_ERR(msg)) { /* -- cgit v1.2.3 From daaf74cf0867e3042090d56d10b194d6265b4684 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Tue, 30 Apr 2013 19:15:04 -0700 Subject: ipc: refactor msg list search into separate function [fengguang.wu@intel.com: find_msg can be static] Signed-off-by: Peter Hurley Cc: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index a92275023134..a80aaf463d9c 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -807,6 +807,30 @@ static inline void free_copy(struct msg_msg *copy) } #endif +static struct msg_msg *find_msg(struct msg_queue *msq, long *msgtyp, int mode) +{ + struct msg_msg *msg; + long count = 0; + + list_for_each_entry(msg, &msq->q_messages, m_list) { + if (testmsg(msg, *msgtyp, mode) && + !security_msg_queue_msgrcv(msq, msg, current, + *msgtyp, mode)) { + if (mode == SEARCH_LESSEQUAL && msg->m_type != 1) { + *msgtyp = msg->m_type - 1; + } else if (mode == SEARCH_NUMBER) { + if (*msgtyp == count) + return msg; + } else + return msg; + count++; + } + } + + return ERR_PTR(-EAGAIN); +} + + long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg, long (*msg_handler)(void __user *, struct msg_msg *, size_t)) @@ -836,33 +860,13 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, for (;;) { struct msg_receiver msr_d; - struct msg_msg *walk_msg; - long msg_counter = 0; msg = ERR_PTR(-EACCES); if (ipcperms(ns, &msq->q_perm, S_IRUGO)) goto out_unlock; - msg = ERR_PTR(-EAGAIN); - list_for_each_entry(walk_msg, &msq->q_messages, m_list) { - - if (testmsg(walk_msg, msgtyp, mode) && - !security_msg_queue_msgrcv(msq, walk_msg, current, - msgtyp, mode)) { - - msg = walk_msg; - if (mode == SEARCH_LESSEQUAL && - walk_msg->m_type != 1) { - msgtyp = walk_msg->m_type - 1; - } else if (mode == SEARCH_NUMBER) { - if (msgtyp == msg_counter) - break; - msg = ERR_PTR(-EAGAIN); - } else - break; - msg_counter++; - } - } + msg = find_msg(msq, &msgtyp, mode); + if (!IS_ERR(msg)) { /* * Found a suitable message. 
-- cgit v1.2.3

From 1e3c941c52eab70c8acb2f77829c24673445c858 Mon Sep 17 00:00:00 2001
From: HoSung Jung
Date: Tue, 30 Apr 2013 19:15:09 -0700
Subject: ipc/msgutil.c: use linux/uaccess.h

Signed-off-by: HoSung Jung
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 ipc/msgutil.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index d33fbb2743bd..d43439e6eb47 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -17,7 +17,7 @@
 #include
 #include
 #include
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>

 #include "util.h"

@@ -37,7 +37,7 @@ struct ipc_namespace init_ipc_ns = {
 atomic_t nr_ipc_ns = ATOMIC_INIT(1);

 struct msg_msgseg {
-	struct msg_msgseg* next;
+	struct msg_msgseg *next;
 	/* the next part of the message follows immediately */
 };
-- cgit v1.2.3

From 7bb4deff61bdab3338534841cb6d0508314a41d6 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Tue, 30 Apr 2013 19:15:14 -0700
Subject: ipc: remove bogus lock comment for ipc_checkid

This series makes the sysv semaphore code more scalable, by reducing the
time the semaphore lock is held, and making the locking more scalable for
semaphore arrays with multiple semaphores.

The first four patches were written by Davidlohr Bueso, and reduce the
hold time of the semaphore lock.

The last three patches change the sysv semaphore code locking to be more
fine grained, providing a performance boost when multiple semaphores in a
semaphore array are being manipulated simultaneously.

On a 24 CPU system, performance numbers with the semop-multi test with N
threads and N semaphores, look like this:

	          vanilla   Davidlohr's   Davidlohr's +   Davidlohr's +
	threads             patches       rwlock patches  v3 patches
	10        610652    726325        1783589         2142206
	20        341570    365699        1520453         1977878
	30        288102    307037        1498167         2037995
	40        290714    305955        1612665         2256484
	50        288620    312890        1733453         2650292
	60        289987    306043        1649360         2388008
	70        291298    306347        1723167         2717486
	80        290948    305662        1729545         2763582
	90        290996    306680        1736021         2757524
	100       292243    306700        1773700         3059159

This patch:

There is no reason to be holding the ipc lock while reading ipcp->seq,
hence remove misleading comment.

Also simplify the return value for the function.

Signed-off-by: Davidlohr Bueso
Signed-off-by: Rik van Riel
Cc: Chegu Vinod
Cc: Emmanuel Benisty
Cc: Jason Low
Cc: Michel Lespinasse
Cc: Peter Hurley
Cc: Stanislav Kinsbursky
Tested-by: Sedat Dilek
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 ipc/util.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/ipc/util.h b/ipc/util.h
index eeb79a1fbd83..ac1480a4efd1 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -150,14 +150,9 @@ static inline int ipc_buildid(int id, int seq)
 	return SEQ_MULTIPLIER * seq + id;
 }

-/*
- * Must be called with ipcp locked
- */
 static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int uid)
 {
-	if (uid / SEQ_MULTIPLIER != ipcp->seq)
-		return 1;
-	return 0;
+	return uid / SEQ_MULTIPLIER != ipcp->seq;
 }

 static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm)
-- cgit v1.2.3

From 4d2bff5eb86e8d7b4a20934cccb93bdeebed3558 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Tue, 30 Apr 2013 19:15:19 -0700
Subject: ipc: introduce obtaining a lockless ipc object

Through ipc_lock() and therefore ipc_lock_check() we currently return
the locked ipc object. This is not necessary for all situations and
can, therefore, cause unnecessary ipc lock contention.

Introduce analogous ipc_obtain_object() and ipc_obtain_object_check()
functions that only lookup and return the ipc object.
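As a usage sketch, a read-mostly permission check built on the new helper could look like the function below; example_stat() and its body are illustrative assumptions, not part of the series:

    static int example_stat(struct ipc_namespace *ns, struct ipc_ids *ids,
                            int id)
    {
            struct kern_ipc_perm *ipcp;
            int err = 0;

            rcu_read_lock();
            ipcp = ipc_obtain_object_check(ids, id);
            if (IS_ERR(ipcp)) {
                    err = PTR_ERR(ipcp);
                    goto out;
            }
            /* Permission checks need no spinlock, only RCU. */
            if (ipcperms(ns, ipcp, S_IRUGO))
                    err = -EACCES;
    out:
            rcu_read_unlock();
            return err;
    }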
Both these functions must be called within the RCU read critical section. [akpm@linux-foundation.org: propagate the ipc_obtain_object() errno from ipc_lock()] Signed-off-by: Davidlohr Bueso Signed-off-by: Rik van Riel Reviewed-by: Chegu Vinod Acked-by: Michel Lespinasse Cc: Emmanuel Benisty Cc: Jason Low Cc: Peter Hurley Cc: Stanislav Kinsbursky Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++------------- ipc/util.h | 2 ++ 2 files changed, 59 insertions(+), 14 deletions(-) diff --git a/ipc/util.c b/ipc/util.c index 03eadd8fb0fd..813804ebdeba 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -668,6 +668,28 @@ void ipc64_perm_to_ipc_perm (struct ipc64_perm *in, struct ipc_perm *out) out->seq = in->seq; } +/** + * ipc_obtain_object + * @ids: ipc identifier set + * @id: ipc id to look for + * + * Look for an id in the ipc ids idr and return associated ipc object. + * + * Call inside the RCU critical section. + * The ipc object is *not* locked on exit. + */ +struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id) +{ + struct kern_ipc_perm *out; + int lid = ipcid_to_idx(id); + + out = idr_find(&ids->ipcs_idr, lid); + if (!out) + return ERR_PTR(-EINVAL); + + return out; +} + /** * ipc_lock - Lock an ipc structure without rw_mutex held * @ids: IPC identifier set @@ -675,32 +697,53 @@ void ipc64_perm_to_ipc_perm (struct ipc64_perm *in, struct ipc_perm *out) * * Look for an id in the ipc ids idr and lock the associated ipc object. * - * The ipc object is locked on exit. + * The ipc object is locked on successful exit. */ - struct kern_ipc_perm *ipc_lock(struct ipc_ids *ids, int id) { struct kern_ipc_perm *out; - int lid = ipcid_to_idx(id); rcu_read_lock(); - out = idr_find(&ids->ipcs_idr, lid); - if (out == NULL) { - rcu_read_unlock(); - return ERR_PTR(-EINVAL); - } + out = ipc_obtain_object(ids, id); + if (IS_ERR(out)) + goto err1; spin_lock(&out->lock); - + /* ipc_rmid() may have already freed the ID while ipc_lock * was spinning: here verify that the structure is still valid */ - if (out->deleted) { - spin_unlock(&out->lock); - rcu_read_unlock(); - return ERR_PTR(-EINVAL); - } + if (!out->deleted) + return out; + spin_unlock(&out->lock); + out = ERR_PTR(-EINVAL); +err1: + rcu_read_unlock(); + return out; +} + +/** + * ipc_obtain_object_check + * @ids: ipc identifier set + * @id: ipc id to look for + * + * Similar to ipc_obtain_object() but also checks + * the ipc object reference counter. + * + * Call inside the RCU critical section. + * The ipc object is *not* locked on exit. 
+ */ +struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id) +{ + struct kern_ipc_perm *out = ipc_obtain_object(ids, id); + + if (IS_ERR(out)) + goto out; + + if (ipc_checkid(out, id)) + return ERR_PTR(-EIDRM); +out: return out; } diff --git a/ipc/util.h b/ipc/util.h index ac1480a4efd1..bfc8d4ea6e46 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -123,6 +123,7 @@ void ipc_rcu_getref(void *ptr); void ipc_rcu_putref(void *ptr); struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int); +struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id); void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out); void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out); @@ -168,6 +169,7 @@ static inline void ipc_unlock(struct kern_ipc_perm *perm) } struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id); +struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id); int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, struct ipc_ops *ops, struct ipc_params *params); void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, -- cgit v1.2.3 From 444d0f621b64716f7868dcbde448e0c66ece4e61 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 30 Apr 2013 19:15:24 -0700 Subject: ipc: introduce lockless pre_down ipcctl Various forms of ipc use ipcctl_pre_down() to retrieve an ipc object and check permissions, mostly for IPC_RMID and IPC_SET commands. Introduce ipcctl_pre_down_nolock(), a lockless version of this function. The locking version is retained, yet modified to call the nolock version without affecting its semantics, thus transparent to all ipc callers. Signed-off-by: Davidlohr Bueso Signed-off-by: Rik van Riel Suggested-by: Linus Torvalds Cc: Chegu Vinod Cc: Emmanuel Benisty Cc: Jason Low Cc: Michel Lespinasse Cc: Peter Hurley Cc: Stanislav Kinsbursky Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 31 ++++++++++++++++++++++++++----- ipc/util.h | 3 +++ 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/ipc/util.c b/ipc/util.c index 813804ebdeba..3df0af3158a5 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -824,11 +824,28 @@ struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, struct ipc64_perm *perm, int extra_perm) { struct kern_ipc_perm *ipcp; + + ipcp = ipcctl_pre_down_nolock(ns, ids, id, cmd, perm, extra_perm); + if (IS_ERR(ipcp)) + goto out; + + spin_lock(&ipcp->lock); +out: + return ipcp; +} + +struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, + struct ipc_ids *ids, int id, int cmd, + struct ipc64_perm *perm, int extra_perm) +{ kuid_t euid; - int err; + int err = -EPERM; + struct kern_ipc_perm *ipcp; down_write(&ids->rw_mutex); - ipcp = ipc_lock_check(ids, id); + rcu_read_lock(); + + ipcp = ipc_obtain_object_check(ids, id); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); goto out_up; @@ -837,17 +854,21 @@ struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, audit_ipc_obj(ipcp); if (cmd == IPC_SET) audit_ipc_set_perm(extra_perm, perm->uid, - perm->gid, perm->mode); + perm->gid, perm->mode); euid = current_euid(); if (uid_eq(euid, ipcp->cuid) || uid_eq(euid, ipcp->uid) || ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return ipcp; - err = -EPERM; - ipc_unlock(ipcp); out_up: + /* + * Unsuccessful lookup, unlock and return + * the corresponding error. 
+ */ + rcu_read_unlock(); up_write(&ids->rw_mutex); + return ERR_PTR(err); } diff --git a/ipc/util.h b/ipc/util.h index bfc8d4ea6e46..13d92fea15a3 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -128,6 +128,9 @@ struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id); void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out); void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out); int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out); +struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, + struct ipc_ids *ids, int id, int cmd, + struct ipc64_perm *perm, int extra_perm); struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, struct ipc64_perm *perm, int extra_perm); -- cgit v1.2.3 From 16df3674efe39f3ab63e7052f1244dd3d50e7f84 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 30 Apr 2013 19:15:29 -0700 Subject: ipc,sem: do not hold ipc lock more than necessary Instead of holding the ipc lock for permissions and security checks, among others, only acquire it when necessary. Some numbers.... 1) With Rik's semop-multi.c microbenchmark we can see the following results: Baseline (3.9-rc1): cpus 4, threads: 256, semaphores: 128, test duration: 30 secs total operations: 151452270, ops/sec 5048409 + 59.40% a.out [kernel.kallsyms] [k] _raw_spin_lock + 6.14% a.out [kernel.kallsyms] [k] sys_semtimedop + 3.84% a.out [kernel.kallsyms] [k] avc_has_perm_flags + 3.64% a.out [kernel.kallsyms] [k] __audit_syscall_exit + 2.06% a.out [kernel.kallsyms] [k] copy_user_enhanced_fast_string + 1.86% a.out [kernel.kallsyms] [k] ipc_lock With this patchset: cpus 4, threads: 256, semaphores: 128, test duration: 30 secs total operations: 273156400, ops/sec 9105213 + 18.54% a.out [kernel.kallsyms] [k] _raw_spin_lock + 11.72% a.out [kernel.kallsyms] [k] sys_semtimedop + 7.70% a.out [kernel.kallsyms] [k] ipc_has_perm.isra.21 + 6.58% a.out [kernel.kallsyms] [k] avc_has_perm_flags + 6.54% a.out [kernel.kallsyms] [k] __audit_syscall_exit + 4.71% a.out [kernel.kallsyms] [k] ipc_obtain_object_check 2) While on an Oracle swingbench DSS (data mining) workload the improvements are not as exciting as with Rik's benchmark, we can see some positive numbers. 
For an 8 socket machine the following are the percentages of %sys time incurred in the ipc lock: Baseline (3.9-rc1): 100 swingbench users: 8,74% 400 swingbench users: 21,86% 800 swingbench users: 84,35% With this patchset: 100 swingbench users: 8,11% 400 swingbench users: 19,93% 800 swingbench users: 77,69% [riel@redhat.com: fix two locking bugs] [sasha.levin@oracle.com: prevent releasing RCU read lock twice in semctl_main] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Davidlohr Bueso Signed-off-by: Rik van Riel Reviewed-by: Chegu Vinod Acked-by: Michel Lespinasse Cc: Rik van Riel Cc: Jason Low Cc: Emmanuel Benisty Cc: Peter Hurley Cc: Stanislav Kinsbursky Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 161 +++++++++++++++++++++++++++++++++++++++++++------------------ ipc/util.h | 5 ++ 2 files changed, 118 insertions(+), 48 deletions(-) diff --git a/ipc/sem.c b/ipc/sem.c index 5b167d00efa6..cd1093cf7e8f 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -204,13 +204,34 @@ static inline struct sem_array *sem_lock(struct ipc_namespace *ns, int id) return container_of(ipcp, struct sem_array, sem_perm); } +static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_obtain_object(&sem_ids(ns), id); + + if (IS_ERR(ipcp)) + return ERR_CAST(ipcp); + + return container_of(ipcp, struct sem_array, sem_perm); +} + static inline struct sem_array *sem_lock_check(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_lock_check(&sem_ids(ns), id); if (IS_ERR(ipcp)) - return (struct sem_array *)ipcp; + return ERR_CAST(ipcp); + + return container_of(ipcp, struct sem_array, sem_perm); +} + +static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns, + int id) +{ + struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&sem_ids(ns), id); + + if (IS_ERR(ipcp)) + return ERR_CAST(ipcp); return container_of(ipcp, struct sem_array, sem_perm); } @@ -234,6 +255,16 @@ static inline void sem_putref(struct sem_array *sma) ipc_unlock(&(sma)->sem_perm); } +/* + * Call inside the rcu read section. 
+ */ +static inline void sem_getref(struct sem_array *sma) +{ + spin_lock(&(sma)->sem_perm.lock); + ipc_rcu_getref(sma); + ipc_unlock(&(sma)->sem_perm); +} + static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) { ipc_rmid(&sem_ids(ns), &s->sem_perm); @@ -842,18 +873,25 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, case SEM_STAT: { struct semid64_ds tbuf; - int id; + int id = 0; + + memset(&tbuf, 0, sizeof(tbuf)); if (cmd == SEM_STAT) { - sma = sem_lock(ns, semid); - if (IS_ERR(sma)) - return PTR_ERR(sma); + rcu_read_lock(); + sma = sem_obtain_object(ns, semid); + if (IS_ERR(sma)) { + err = PTR_ERR(sma); + goto out_unlock; + } id = sma->sem_perm.id; } else { - sma = sem_lock_check(ns, semid); - if (IS_ERR(sma)) - return PTR_ERR(sma); - id = 0; + rcu_read_lock(); + sma = sem_obtain_object_check(ns, semid); + if (IS_ERR(sma)) { + err = PTR_ERR(sma); + goto out_unlock; + } } err = -EACCES; @@ -864,13 +902,11 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, if (err) goto out_unlock; - memset(&tbuf, 0, sizeof(tbuf)); - kernel_to_ipc64_perm(&sma->sem_perm, &tbuf.sem_perm); tbuf.sem_otime = sma->sem_otime; tbuf.sem_ctime = sma->sem_ctime; tbuf.sem_nsems = sma->sem_nsems; - sem_unlock(sma); + rcu_read_unlock(); if (copy_semid_to_user(p, &tbuf, version)) return -EFAULT; return id; @@ -879,7 +915,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, return -EINVAL; } out_unlock: - sem_unlock(sma); + rcu_read_unlock(); return err; } @@ -947,27 +983,34 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, { struct sem_array *sma; struct sem* curr; - int err; + int err, nsems; ushort fast_sem_io[SEMMSL_FAST]; ushort* sem_io = fast_sem_io; - int nsems; struct list_head tasks; - sma = sem_lock_check(ns, semid); - if (IS_ERR(sma)) + INIT_LIST_HEAD(&tasks); + + rcu_read_lock(); + sma = sem_obtain_object_check(ns, semid); + if (IS_ERR(sma)) { + rcu_read_unlock(); return PTR_ERR(sma); + } - INIT_LIST_HEAD(&tasks); nsems = sma->sem_nsems; err = -EACCES; if (ipcperms(ns, &sma->sem_perm, - cmd == SETALL ? S_IWUGO : S_IRUGO)) - goto out_unlock; + cmd == SETALL ? 
S_IWUGO : S_IRUGO)) { + rcu_read_unlock(); + goto out_wakeup; + } err = security_sem_semctl(sma, cmd); - if (err) - goto out_unlock; + if (err) { + rcu_read_unlock(); + goto out_wakeup; + } err = -EACCES; switch (cmd) { @@ -977,7 +1020,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, int i; if(nsems > SEMMSL_FAST) { - sem_getref_and_unlock(sma); + sem_getref(sma); sem_io = ipc_alloc(sizeof(ushort)*nsems); if(sem_io == NULL) { @@ -993,6 +1036,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, } } + spin_lock(&sma->sem_perm.lock); for (i = 0; i < sma->sem_nsems; i++) sem_io[i] = sma->sem_base[i].semval; sem_unlock(sma); @@ -1006,7 +1050,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, int i; struct sem_undo *un; - sem_getref_and_unlock(sma); + ipc_rcu_getref(sma); + rcu_read_unlock(); if(nsems > SEMMSL_FAST) { sem_io = ipc_alloc(sizeof(ushort)*nsems); @@ -1053,9 +1098,12 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, /* GETVAL, GETPID, GETNCTN, GETZCNT: fall-through */ } err = -EINVAL; - if(semnum < 0 || semnum >= nsems) - goto out_unlock; + if (semnum < 0 || semnum >= nsems) { + rcu_read_unlock(); + goto out_wakeup; + } + spin_lock(&sma->sem_perm.lock); curr = &sma->sem_base[semnum]; switch (cmd) { @@ -1072,10 +1120,11 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, err = count_semzcnt(sma,semnum); goto out_unlock; } + out_unlock: sem_unlock(sma); +out_wakeup: wake_up_sem_queue_do(&tasks); - out_free: if(sem_io != fast_sem_io) ipc_free(sem_io, sizeof(ushort)*nsems); @@ -1126,29 +1175,35 @@ static int semctl_down(struct ipc_namespace *ns, int semid, return -EFAULT; } - ipcp = ipcctl_pre_down(ns, &sem_ids(ns), semid, cmd, - &semid64.sem_perm, 0); + ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd, + &semid64.sem_perm, 0); if (IS_ERR(ipcp)) return PTR_ERR(ipcp); sma = container_of(ipcp, struct sem_array, sem_perm); err = security_sem_semctl(sma, cmd); - if (err) + if (err) { + rcu_read_unlock(); goto out_unlock; + } switch(cmd){ case IPC_RMID: + ipc_lock_object(&sma->sem_perm); freeary(ns, ipcp); goto out_up; case IPC_SET: + ipc_lock_object(&sma->sem_perm); err = ipc_update_perm(&semid64.sem_perm, ipcp); if (err) goto out_unlock; sma->sem_ctime = get_seconds(); break; default: + rcu_read_unlock(); err = -EINVAL; + goto out_up; } out_unlock: @@ -1277,16 +1332,18 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) spin_unlock(&ulp->lock); if (likely(un!=NULL)) goto out; - rcu_read_unlock(); /* no undo structure around - allocate one. 
*/ /* step 1: figure out the size of the semaphore array */ - sma = sem_lock_check(ns, semid); - if (IS_ERR(sma)) + sma = sem_obtain_object_check(ns, semid); + if (IS_ERR(sma)) { + rcu_read_unlock(); return ERR_CAST(sma); + } nsems = sma->sem_nsems; - sem_getref_and_unlock(sma); + ipc_rcu_getref(sma); + rcu_read_unlock(); /* step 2: allocate new undo structure */ new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); @@ -1421,7 +1478,8 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, INIT_LIST_HEAD(&tasks); - sma = sem_lock_check(ns, semid); + rcu_read_lock(); + sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { if (un) rcu_read_unlock(); @@ -1429,6 +1487,24 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, goto out_free; } + error = -EFBIG; + if (max >= sma->sem_nsems) { + rcu_read_unlock(); + goto out_wakeup; + } + + error = -EACCES; + if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) { + rcu_read_unlock(); + goto out_wakeup; + } + + error = security_sem_semop(sma, sops, nsops, alter); + if (error) { + rcu_read_unlock(); + goto out_wakeup; + } + /* * semid identifiers are not unique - find_alloc_undo may have * allocated an undo structure, it was invalidated by an RMID @@ -1437,6 +1513,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, * "un" itself is guaranteed by rcu. */ error = -EIDRM; + ipc_lock_object(&sma->sem_perm); if (un) { if (un->semid == -1) { rcu_read_unlock(); @@ -1454,18 +1531,6 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, } } - error = -EFBIG; - if (max >= sma->sem_nsems) - goto out_unlock_free; - - error = -EACCES; - if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) - goto out_unlock_free; - - error = security_sem_semop(sma, sops, nsops, alter); - if (error) - goto out_unlock_free; - error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current)); if (error <= 0) { if (alter && error == 0) @@ -1568,7 +1633,7 @@ sleep_again: out_unlock_free: sem_unlock(sma); - +out_wakeup: wake_up_sem_queue_do(&tasks); out_free: if(sops != fast_sops) diff --git a/ipc/util.h b/ipc/util.h index 13d92fea15a3..c36b9977c957 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -171,6 +171,11 @@ static inline void ipc_unlock(struct kern_ipc_perm *perm) rcu_read_unlock(); } +static inline void ipc_lock_object(struct kern_ipc_perm *perm) +{ + spin_lock(&perm->lock); +} + struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id); struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id); int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, -- cgit v1.2.3 From c460b662d5cae467f1c341c59b02a5c5e68fed0b Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 30 Apr 2013 19:15:35 -0700 Subject: ipc,sem: open code and rename sem_lock Rename sem_lock() to sem_obtain_lock(), so we can introduce a sem_lock() later that only locks the sem_array and does nothing else. Open code the locking from ipc_lock() in sem_obtain_lock() so we can introduce finer grained locking for the sem_array in the next patch. 
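The sequence being open coded follows a pattern that recurs through the rest of the series: look the object up under RCU, take its lock, then re-check ->deleted, because ipc_rmid() may free the id between the idr lookup and the spin_lock(). A condensed sketch of that pattern (the exact code is in the diff below; on success it returns with both the RCU read lock and the spinlock held):

    static struct sem_array *obtain_and_lock(struct ipc_namespace *ns, int id)
    {
            struct kern_ipc_perm *ipcp;

            rcu_read_lock();
            ipcp = ipc_obtain_object(&sem_ids(ns), id); /* lookup, no lock */
            if (IS_ERR(ipcp)) {
                    rcu_read_unlock();
                    return ERR_CAST(ipcp);
            }

            spin_lock(&ipcp->lock);
            if (!ipcp->deleted)     /* no IPC_RMID raced with the lookup */
                    return container_of(ipcp, struct sem_array, sem_perm);

            /* ipc_rmid() freed the id while we spun; back out. */
            spin_unlock(&ipcp->lock);
            rcu_read_unlock();
            return ERR_PTR(-EINVAL);
    }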
[akpm@linux-foundation.org: propagate the ipc_obtain_object() errno out of sem_obtain_lock()] Signed-off-by: Rik van Riel Acked-by: Davidlohr Bueso Cc: Chegu Vinod Cc: Emmanuel Benisty Cc: Jason Low Cc: Michel Lespinasse Cc: Peter Hurley Cc: Stanislav Kinsbursky Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/ipc/sem.c b/ipc/sem.c index cd1093cf7e8f..70020066ac0d 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -194,14 +194,31 @@ void __init sem_init (void) * sem_lock_(check_) routines are called in the paths where the rw_mutex * is not held. */ -static inline struct sem_array *sem_lock(struct ipc_namespace *ns, int id) +static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns, int id) { - struct kern_ipc_perm *ipcp = ipc_lock(&sem_ids(ns), id); + struct kern_ipc_perm *ipcp; + struct sem_array *sma; - if (IS_ERR(ipcp)) - return (struct sem_array *)ipcp; + rcu_read_lock(); + ipcp = ipc_obtain_object(&sem_ids(ns), id); + if (IS_ERR(ipcp)) { + sma = ERR_CAST(ipcp); + goto err; + } - return container_of(ipcp, struct sem_array, sem_perm); + spin_lock(&ipcp->lock); + + /* ipc_rmid() may have already freed the ID while sem_lock + * was spinning: verify that the structure is still valid + */ + if (!ipcp->deleted) + return container_of(ipcp, struct sem_array, sem_perm); + + spin_unlock(&ipcp->lock); + sma = ERR_PTR(-EINVAL); +err: + rcu_read_unlock(); + return sma; } static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id) @@ -1593,7 +1610,7 @@ sleep_again: goto out_free; } - sma = sem_lock(ns, semid); + sma = sem_obtain_lock(ns, semid); /* * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing. -- cgit v1.2.3 From 9f1bc2c9022c1d4944c4a1a44c2f365487420aca Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 30 Apr 2013 19:15:39 -0700 Subject: ipc,sem: have only one list in struct sem_queue Having only one list in struct sem_queue, and only queueing simple semaphore operations on the list for the semaphore involved, allows us to introduce finer grained locking for semtimedop. Signed-off-by: Rik van Riel Acked-by: Davidlohr Bueso Cc: Chegu Vinod Cc: Emmanuel Benisty Cc: Jason Low Cc: Michel Lespinasse Cc: Peter Hurley Cc: Stanislav Kinsbursky Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 65 +++++++++++++++++++++++++++++++++------------------------------ 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/ipc/sem.c b/ipc/sem.c index 70020066ac0d..f68b61749a85 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -99,7 +99,6 @@ struct sem { /* One queue for each sleeping process in the system. */ struct sem_queue { - struct list_head simple_list; /* queue of pending operations */ struct list_head list; /* queue of pending operations */ struct task_struct *sleeper; /* this process */ struct sem_undo *undo; /* undo structure */ @@ -519,7 +518,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt, q->status = IN_WAKEUP; q->pid = error; - list_add_tail(&q->simple_list, pt); + list_add_tail(&q->list, pt); } /** @@ -537,7 +536,7 @@ static void wake_up_sem_queue_do(struct list_head *pt) int did_something; did_something = !list_empty(pt); - list_for_each_entry_safe(q, t, pt, simple_list) { + list_for_each_entry_safe(q, t, pt, list) { wake_up_process(q->sleeper); /* q can disappear immediately after writing q->status. 
*/ smp_wmb(); @@ -550,9 +549,7 @@ static void wake_up_sem_queue_do(struct list_head *pt) static void unlink_queue(struct sem_array *sma, struct sem_queue *q) { list_del(&q->list); - if (q->nsops == 1) - list_del(&q->simple_list); - else + if (q->nsops > 1) sma->complex_count--; } @@ -605,9 +602,9 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q) } /* * semval is 0. Check if there are wait-for-zero semops. - * They must be the first entries in the per-semaphore simple queue + * They must be the first entries in the per-semaphore queue */ - h = list_first_entry(&curr->sem_pending, struct sem_queue, simple_list); + h = list_first_entry(&curr->sem_pending, struct sem_queue, list); BUG_ON(h->nsops != 1); BUG_ON(h->sops[0].sem_num != q->sops[0].sem_num); @@ -627,8 +624,9 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q) * @pt: list head for the tasks that must be woken up. * * update_queue must be called after a semaphore in a semaphore array - * was modified. If multiple semaphore were modified, then @semnum - * must be set to -1. + * was modified. If multiple semaphores were modified, update_queue must + * be called with semnum = -1, as well as with the number of each modified + * semaphore. * The tasks that must be woken up are added to @pt. The return code * is stored in q->pid. * The function return 1 if at least one semop was completed successfully. @@ -638,30 +636,19 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) struct sem_queue *q; struct list_head *walk; struct list_head *pending_list; - int offset; int semop_completed = 0; - /* if there are complex operations around, then knowing the semaphore - * that was modified doesn't help us. Assume that multiple semaphores - * were modified. - */ - if (sma->complex_count) - semnum = -1; - - if (semnum == -1) { + if (semnum == -1) pending_list = &sma->sem_pending; - offset = offsetof(struct sem_queue, list); - } else { + else pending_list = &sma->sem_base[semnum].sem_pending; - offset = offsetof(struct sem_queue, simple_list); - } again: walk = pending_list->next; while (walk != pending_list) { int error, restart; - q = (struct sem_queue *)((char *)walk - offset); + q = container_of(walk, struct sem_queue, list); walk = walk->next; /* If we are scanning the single sop, per-semaphore list of @@ -720,9 +707,18 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop if (sma->complex_count || sops == NULL) { if (update_queue(sma, -1, pt)) otime = 1; + } + + if (!sops) { + /* No semops; something special is going on. */ + for (i = 0; i < sma->sem_nsems; i++) { + if (update_queue(sma, i, pt)) + otime = 1; + } goto done; } + /* Check the semaphores that were modified. */ for (i = 0; i < nsops; i++) { if (sops[i].sem_op > 0 || (sops[i].sem_op < 0 && @@ -793,6 +789,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) struct sem_queue *q, *tq; struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); struct list_head tasks; + int i; /* Free the existing undo structures for this semaphore set. 
*/ assert_spin_locked(&sma->sem_perm.lock); @@ -811,6 +808,13 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) unlink_queue(sma, q); wake_up_sem_queue_prepare(&tasks, q, -EIDRM); } + for (i = 0; i < sma->sem_nsems; i++) { + struct sem *sem = sma->sem_base + i; + list_for_each_entry_safe(q, tq, &sem->sem_pending, list) { + unlink_queue(sma, q); + wake_up_sem_queue_prepare(&tasks, q, -EIDRM); + } + } /* Remove the semaphore set from the IDR */ sem_rmid(ns, sma); @@ -1565,21 +1569,20 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, queue.undo = un; queue.pid = task_tgid_vnr(current); queue.alter = alter; - if (alter) - list_add_tail(&queue.list, &sma->sem_pending); - else - list_add(&queue.list, &sma->sem_pending); if (nsops == 1) { struct sem *curr; curr = &sma->sem_base[sops->sem_num]; if (alter) - list_add_tail(&queue.simple_list, &curr->sem_pending); + list_add_tail(&queue.list, &curr->sem_pending); else - list_add(&queue.simple_list, &curr->sem_pending); + list_add(&queue.list, &curr->sem_pending); } else { - INIT_LIST_HEAD(&queue.simple_list); + if (alter) + list_add_tail(&queue.list, &sma->sem_pending); + else + list_add(&queue.list, &sma->sem_pending); sma->complex_count++; } -- cgit v1.2.3 From 6062a8dc0517bce23e3c2f7d2fea5e22411269a3 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 30 Apr 2013 19:15:44 -0700 Subject: ipc,sem: fine grained locking for semtimedop Introduce finer grained locking for semtimedop, to handle the common case of a program wanting to manipulate one semaphore from an array with multiple semaphores. If the call is a semop manipulating just one semaphore in an array with multiple semaphores, only take the lock for that semaphore itself. If the call needs to manipulate multiple semaphores, or another caller is in a transaction that manipulates multiple semaphores, the sem_array lock is taken, as well as all the locks for the individual semaphores. 
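Reduced to its decision structure, the lock choice looks like the sketch below. This condenses the sem_lock() added in the diff that follows; the real function also re-checks complex_count after taking the per-semaphore lock, and spins on the global lock to avoid racing a complex operation:

    static int sem_lock_sketch(struct sem_array *sma, struct sembuf *sops,
                               int nsops)
    {
            int i;

            if (nsops == 1 && !sma->complex_count) {
                    /* Common case: lock only the semaphore operated on. */
                    spin_lock(&sma->sem_base[sops->sem_num].lock);
                    return sops->sem_num;
            }

            /* Complex case: take the array lock, then wait for every
             * per-semaphore lock holder to drain. */
            spin_lock(&sma->sem_perm.lock);
            for (i = 0; i < sma->sem_nsems; i++)
                    spin_unlock_wait(&sma->sem_base[i].lock);
            return -1;      /* tells sem_unlock() to drop the array lock */
    }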
On a 24 CPU system, performance numbers with the semop-multi test with N threads and N semaphores, look like this: vanilla Davidlohr's Davidlohr's + Davidlohr's + threads patches rwlock patches v3 patches 10 610652 726325 1783589 2142206 20 341570 365699 1520453 1977878 30 288102 307037 1498167 2037995 40 290714 305955 1612665 2256484 50 288620 312890 1733453 2650292 60 289987 306043 1649360 2388008 70 291298 306347 1723167 2717486 80 290948 305662 1729545 2763582 90 290996 306680 1736021 2757524 100 292243 306700 1773700 3059159 [davidlohr.bueso@hp.com: do not call sem_lock when bogus sma] [davidlohr.bueso@hp.com: make refcounter atomic] Signed-off-by: Rik van Riel Suggested-by: Linus Torvalds Acked-by: Davidlohr Bueso Cc: Chegu Vinod Cc: Jason Low Reviewed-by: Michel Lespinasse Cc: Peter Hurley Cc: Stanislav Kinsbursky Tested-by: Emmanuel Benisty Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 7 +- ipc/sem.c | 271 ++++++++++++++++++++++++++++++++++++++----------------------- ipc/util.c | 48 +++++------ ipc/util.h | 2 +- 4 files changed, 203 insertions(+), 125 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index a80aaf463d9c..09a1f41e6595 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -687,7 +687,12 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, goto out_unlock_free; } ss_add(msq, &s); - ipc_rcu_getref(msq); + + if (!ipc_rcu_getref(msq)) { + err = -EIDRM; + goto out_unlock_free; + } + msg_unlock(msq); schedule(); diff --git a/ipc/sem.c b/ipc/sem.c index f68b61749a85..e78ee3186d1f 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -94,6 +94,7 @@ struct sem { int semval; /* current value */ int sempid; /* pid of last operation */ + spinlock_t lock; /* spinlock for fine-grained semtimedop */ struct list_head sem_pending; /* pending single-sop operations */ }; @@ -137,7 +138,6 @@ struct sem_undo_list { #define sem_ids(ns) ((ns)->ids[IPC_SEM_IDS]) -#define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm) #define sem_checkid(sma, semid) ipc_checkid(&sma->sem_perm, semid) static int newary(struct ipc_namespace *, struct ipc_params *); @@ -189,11 +189,90 @@ void __init sem_init (void) IPC_SEM_IDS, sysvipc_sem_proc_show); } +/* + * If the request contains only one semaphore operation, and there are + * no complex transactions pending, lock only the semaphore involved. + * Otherwise, lock the entire semaphore array, since we either have + * multiple semaphores in our own semops, or we need to look at + * semaphores from other pending complex operations. + * + * Carefully guard against sma->complex_count changing between zero + * and non-zero while we are spinning for the lock. The value of + * sma->complex_count cannot change while we are holding the lock, + * so sem_unlock should be fine. + * + * The global lock path checks that all the local locks have been released, + * checking each local lock once. This means that the local lock paths + * cannot start their critical sections while the global lock is held. + */ +static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, + int nsops) +{ + int locknum; + again: + if (nsops == 1 && !sma->complex_count) { + struct sem *sem = sma->sem_base + sops->sem_num; + + /* Lock just the semaphore we are interested in. */ + spin_lock(&sem->lock); + + /* + * If sma->complex_count was set while we were spinning, + * we may need to look at things we did not lock here. 
+ */ + if (unlikely(sma->complex_count)) { + spin_unlock(&sem->lock); + goto lock_array; + } + + /* + * Another process is holding the global lock on the + * sem_array; we cannot enter our critical section, + * but have to wait for the global lock to be released. + */ + if (unlikely(spin_is_locked(&sma->sem_perm.lock))) { + spin_unlock(&sem->lock); + spin_unlock_wait(&sma->sem_perm.lock); + goto again; + } + + locknum = sops->sem_num; + } else { + int i; + /* + * Lock the semaphore array, and wait for all of the + * individual semaphore locks to go away. The code + * above ensures no new single-lock holders will enter + * their critical section while the array lock is held. + */ + lock_array: + spin_lock(&sma->sem_perm.lock); + for (i = 0; i < sma->sem_nsems; i++) { + struct sem *sem = sma->sem_base + i; + spin_unlock_wait(&sem->lock); + } + locknum = -1; + } + return locknum; +} + +static inline void sem_unlock(struct sem_array *sma, int locknum) +{ + if (locknum == -1) { + spin_unlock(&sma->sem_perm.lock); + } else { + struct sem *sem = sma->sem_base + locknum; + spin_unlock(&sem->lock); + } + rcu_read_unlock(); +} + /* * sem_lock_(check_) routines are called in the paths where the rw_mutex * is not held. */ -static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns, int id) +static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns, + int id, struct sembuf *sops, int nsops, int *locknum) { struct kern_ipc_perm *ipcp; struct sem_array *sma; @@ -205,7 +284,8 @@ static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns, int id goto err; } - spin_lock(&ipcp->lock); + sma = container_of(ipcp, struct sem_array, sem_perm); + *locknum = sem_lock(sma, sops, nsops); /* ipc_rmid() may have already freed the ID while sem_lock * was spinning: verify that the structure is still valid @@ -213,7 +293,7 @@ static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns, int id if (!ipcp->deleted) return container_of(ipcp, struct sem_array, sem_perm); - spin_unlock(&ipcp->lock); + sem_unlock(sma, *locknum); sma = ERR_PTR(-EINVAL); err: rcu_read_unlock(); @@ -230,17 +310,6 @@ static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int return container_of(ipcp, struct sem_array, sem_perm); } -static inline struct sem_array *sem_lock_check(struct ipc_namespace *ns, - int id) -{ - struct kern_ipc_perm *ipcp = ipc_lock_check(&sem_ids(ns), id); - - if (IS_ERR(ipcp)) - return ERR_CAST(ipcp); - - return container_of(ipcp, struct sem_array, sem_perm); -} - static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns, int id) { @@ -254,21 +323,21 @@ static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns static inline void sem_lock_and_putref(struct sem_array *sma) { - ipc_lock_by_ptr(&sma->sem_perm); + rcu_read_lock(); + sem_lock(sma, NULL, -1); ipc_rcu_putref(sma); } static inline void sem_getref_and_unlock(struct sem_array *sma) { - ipc_rcu_getref(sma); - ipc_unlock(&(sma)->sem_perm); + WARN_ON_ONCE(!ipc_rcu_getref(sma)); + sem_unlock(sma, -1); } static inline void sem_putref(struct sem_array *sma) { - ipc_lock_by_ptr(&sma->sem_perm); - ipc_rcu_putref(sma); - ipc_unlock(&(sma)->sem_perm); + sem_lock_and_putref(sma); + sem_unlock(sma, -1); } /* @@ -276,9 +345,9 @@ static inline void sem_putref(struct sem_array *sma) */ static inline void sem_getref(struct sem_array *sma) { - spin_lock(&(sma)->sem_perm.lock); - ipc_rcu_getref(sma); - ipc_unlock(&(sma)->sem_perm); + sem_lock(sma, NULL, -1); + 
WARN_ON_ONCE(!ipc_rcu_getref(sma)); + sem_unlock(sma, -1); } static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) @@ -371,15 +440,17 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) sma->sem_base = (struct sem *) &sma[1]; - for (i = 0; i < nsems; i++) + for (i = 0; i < nsems; i++) { INIT_LIST_HEAD(&sma->sem_base[i].sem_pending); + spin_lock_init(&sma->sem_base[i].lock); + } sma->complex_count = 0; INIT_LIST_HEAD(&sma->sem_pending); INIT_LIST_HEAD(&sma->list_id); sma->sem_nsems = nsems; sma->sem_ctime = get_seconds(); - sem_unlock(sma); + sem_unlock(sma, -1); return sma->sem_perm.id; } @@ -818,7 +889,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) /* Remove the semaphore set from the IDR */ sem_rmid(ns, sma); - sem_unlock(sma); + sem_unlock(sma, -1); wake_up_sem_queue_do(&tasks); ns->used_sems -= sma->sem_nsems; @@ -947,7 +1018,6 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, struct sem_array *sma; struct sem* curr; int err; - int nsems; struct list_head tasks; int val; #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) @@ -958,31 +1028,39 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, val = arg; #endif - sma = sem_lock_check(ns, semid); - if (IS_ERR(sma)) - return PTR_ERR(sma); + if (val > SEMVMX || val < 0) + return -ERANGE; INIT_LIST_HEAD(&tasks); - nsems = sma->sem_nsems; - err = -EACCES; - if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) - goto out_unlock; + rcu_read_lock(); + sma = sem_obtain_object_check(ns, semid); + if (IS_ERR(sma)) { + rcu_read_unlock(); + return PTR_ERR(sma); + } + + if (semnum < 0 || semnum >= sma->sem_nsems) { + rcu_read_unlock(); + return -EINVAL; + } + + + if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) { + rcu_read_unlock(); + return -EACCES; + } err = security_sem_semctl(sma, SETVAL); - if (err) - goto out_unlock; + if (err) { + rcu_read_unlock(); + return -EACCES; + } - err = -EINVAL; - if(semnum < 0 || semnum >= nsems) - goto out_unlock; + sem_lock(sma, NULL, -1); curr = &sma->sem_base[semnum]; - err = -ERANGE; - if (val > SEMVMX || val < 0) - goto out_unlock; - assert_spin_locked(&sma->sem_perm.lock); list_for_each_entry(un, &sma->list_id, list_id) un->semadj[semnum] = 0; @@ -992,11 +1070,9 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, sma->sem_ctime = get_seconds(); /* maybe some queued-up processes were waiting for this */ do_smart_update(sma, NULL, 0, 0, &tasks); - err = 0; -out_unlock: - sem_unlock(sma); + sem_unlock(sma, -1); wake_up_sem_queue_do(&tasks); - return err; + return 0; } static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, @@ -1051,16 +1127,16 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, sem_lock_and_putref(sma); if (sma->sem_perm.deleted) { - sem_unlock(sma); + sem_unlock(sma, -1); err = -EIDRM; goto out_free; } - } + } else + sem_lock(sma, NULL, -1); - spin_lock(&sma->sem_perm.lock); for (i = 0; i < sma->sem_nsems; i++) sem_io[i] = sma->sem_base[i].semval; - sem_unlock(sma); + sem_unlock(sma, -1); err = 0; if(copy_to_user(array, sem_io, nsems*sizeof(ushort))) err = -EFAULT; @@ -1071,7 +1147,10 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, int i; struct sem_undo *un; - ipc_rcu_getref(sma); + if (!ipc_rcu_getref(sma)) { + rcu_read_unlock(); + return -EIDRM; + } rcu_read_unlock(); if(nsems > SEMMSL_FAST) { @@ -1097,7 +1176,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, } 
 		sem_lock_and_putref(sma);
 		if (sma->sem_perm.deleted) {
-			sem_unlock(sma);
+			sem_unlock(sma, -1);
 			err = -EIDRM;
 			goto out_free;
 		}
@@ -1124,7 +1203,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
 		goto out_wakeup;
 	}
 
-	spin_lock(&sma->sem_perm.lock);
+	sem_lock(sma, NULL, -1);
 	curr = &sma->sem_base[semnum];
 
 	switch (cmd) {
@@ -1143,7 +1222,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
 	}
 
 out_unlock:
-	sem_unlock(sma);
+	sem_unlock(sma, -1);
 out_wakeup:
 	wake_up_sem_queue_do(&tasks);
 out_free:
@@ -1211,11 +1290,11 @@ static int semctl_down(struct ipc_namespace *ns, int semid,
 
 	switch(cmd){
 	case IPC_RMID:
-		ipc_lock_object(&sma->sem_perm);
+		sem_lock(sma, NULL, -1);
 		freeary(ns, ipcp);
 		goto out_up;
 	case IPC_SET:
-		ipc_lock_object(&sma->sem_perm);
+		sem_lock(sma, NULL, -1);
 		err = ipc_update_perm(&semid64.sem_perm, ipcp);
 		if (err)
 			goto out_unlock;
@@ -1228,7 +1307,7 @@ static int semctl_down(struct ipc_namespace *ns, int semid,
 	}
 
 out_unlock:
-	sem_unlock(sma);
+	sem_unlock(sma, -1);
 out_up:
 	up_write(&sem_ids(ns).rw_mutex);
 	return err;
@@ -1340,8 +1419,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
 	struct sem_array *sma;
 	struct sem_undo_list *ulp;
 	struct sem_undo *un, *new;
-	int nsems;
-	int error;
+	int nsems, error;
 
 	error = get_undo_list(&ulp);
 	if (error)
@@ -1363,7 +1441,11 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
 	}
 
 	nsems = sma->sem_nsems;
-	ipc_rcu_getref(sma);
+	if (!ipc_rcu_getref(sma)) {
+		rcu_read_unlock();
+		un = ERR_PTR(-EIDRM);
+		goto out;
+	}
 	rcu_read_unlock();
 
 	/* step 2: allocate new undo structure */
@@ -1376,7 +1458,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
 	/* step 3: Acquire the lock on semaphore array */
 	sem_lock_and_putref(sma);
 	if (sma->sem_perm.deleted) {
-		sem_unlock(sma);
+		sem_unlock(sma, -1);
 		kfree(new);
 		un = ERR_PTR(-EIDRM);
 		goto out;
@@ -1404,7 +1486,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
 success:
 	spin_unlock(&ulp->lock);
 	rcu_read_lock();
-	sem_unlock(sma);
+	sem_unlock(sma, -1);
 out:
 	return un;
 }
@@ -1444,7 +1526,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	struct sembuf fast_sops[SEMOPM_FAST];
 	struct sembuf* sops = fast_sops, *sop;
 	struct sem_undo *un;
-	int undos = 0, alter = 0, max;
+	int undos = 0, alter = 0, max, locknum;
 	struct sem_queue queue;
 	unsigned long jiffies_left = 0;
 	struct ipc_namespace *ns;
@@ -1488,22 +1570,23 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 			alter = 1;
 	}
 
+	INIT_LIST_HEAD(&tasks);
+
 	if (undos) {
+		/* On success, find_alloc_undo takes the rcu_read_lock */
 		un = find_alloc_undo(ns, semid);
 		if (IS_ERR(un)) {
 			error = PTR_ERR(un);
 			goto out_free;
 		}
-	} else
+	} else {
 		un = NULL;
+		rcu_read_lock();
+	}
 
-	INIT_LIST_HEAD(&tasks);
-
-	rcu_read_lock();
 	sma = sem_obtain_object_check(ns, semid);
 	if (IS_ERR(sma)) {
-		if (un)
-			rcu_read_unlock();
+		rcu_read_unlock();
 		error = PTR_ERR(sma);
 		goto out_free;
 	}
@@ -1534,23 +1617,9 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	 * "un" itself is guaranteed by rcu.
 	 */
 	error = -EIDRM;
-	ipc_lock_object(&sma->sem_perm);
-	if (un) {
-		if (un->semid == -1) {
-			rcu_read_unlock();
-			goto out_unlock_free;
-		} else {
-			/*
-			 * rcu lock can be released, "un" cannot disappear:
-			 * - sem_lock is acquired, thus IPC_RMID is
-			 *   impossible.
-			 * - exit_sem is impossible, it always operates on
-			 *   current (or a dead task).
-			 */
-
-			rcu_read_unlock();
-		}
-	}
+	locknum = sem_lock(sma, sops, nsops);
+	if (un && un->semid == -1)
+		goto out_unlock_free;
 
 	error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current));
 	if (error <= 0) {
@@ -1591,7 +1660,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 
 sleep_again:
 	current->state = TASK_INTERRUPTIBLE;
-	sem_unlock(sma);
+	sem_unlock(sma, locknum);
 
 	if (timeout)
 		jiffies_left = schedule_timeout(jiffies_left);
@@ -1613,7 +1682,7 @@ sleep_again:
 		goto out_free;
 	}
 
-	sma = sem_obtain_lock(ns, semid);
+	sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum);
 
 	/*
 	 * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing.
@@ -1652,7 +1721,7 @@ sleep_again:
 		unlink_queue(sma, &queue);
 
 out_unlock_free:
-	sem_unlock(sma);
+	sem_unlock(sma, locknum);
 out_wakeup:
 	wake_up_sem_queue_do(&tasks);
 out_free:
@@ -1716,8 +1785,7 @@ void exit_sem(struct task_struct *tsk)
 		struct sem_array *sma;
 		struct sem_undo *un;
 		struct list_head tasks;
-		int semid;
-		int i;
+		int semid, i;
 
 		rcu_read_lock();
 		un = list_entry_rcu(ulp->list_proc.next,
@@ -1726,23 +1794,26 @@ void exit_sem(struct task_struct *tsk)
 			semid = -1;
 		else
 			semid = un->semid;
-		rcu_read_unlock();
 
-		if (semid == -1)
+		if (semid == -1) {
+			rcu_read_unlock();
 			break;
+		}
 
-		sma = sem_lock_check(tsk->nsproxy->ipc_ns, un->semid);
-
+		sma = sem_obtain_object_check(tsk->nsproxy->ipc_ns, un->semid);
 		/* exit_sem raced with IPC_RMID, nothing to do */
-		if (IS_ERR(sma))
+		if (IS_ERR(sma)) {
+			rcu_read_unlock();
 			continue;
+		}
 
+		sem_lock(sma, NULL, -1);
 		un = __lookup_undo(ulp, semid);
 		if (un == NULL) {
 			/* exit_sem raced with IPC_RMID+semget() that created
 			 * exactly the same semid. Nothing to do.
 			 */
-			sem_unlock(sma);
+			sem_unlock(sma, -1);
 			continue;
 		}
 
@@ -1782,7 +1853,7 @@ void exit_sem(struct task_struct *tsk)
 		/* maybe some queued-up processes were waiting for this */
 		INIT_LIST_HEAD(&tasks);
 		do_smart_update(sma, NULL, 0, 1, &tasks);
-		sem_unlock(sma);
+		sem_unlock(sma, -1);
 		wake_up_sem_queue_do(&tasks);
 
 		kfree_rcu(un, rcu);
diff --git a/ipc/util.c b/ipc/util.c
index 3df0af3158a5..579201e4bc01 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -439,9 +439,9 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
  *	NULL is returned if the allocation fails
  */
 
-void* ipc_alloc(int size)
+void *ipc_alloc(int size)
 {
-	void* out;
+	void *out;
 	if(size > PAGE_SIZE)
 		out = vmalloc(size);
 	else
@@ -478,7 +478,7 @@ void ipc_free(void* ptr, int size)
  */
 struct ipc_rcu_hdr {
-	int refcount;
+	atomic_t refcount;
 	int is_vmalloc;
 	void *data[0];
 };
@@ -516,39 +516,41 @@ static inline int rcu_use_vmalloc(int size)
  *	@size: size desired
  *
  *	Allocate memory for the rcu header structure + the object.
- *	Returns the pointer to the object.
- *	NULL is returned if the allocation fails.
+ *	Returns the pointer to the object or NULL upon failure.
  */
-
-void* ipc_rcu_alloc(int size)
+void *ipc_rcu_alloc(int size)
 {
-	void* out;
-	/*
+	void *out;
+
+	/*
 	 * We prepend the allocation with the rcu struct, and
-	 * workqueue if necessary (for vmalloc). 
+	 * workqueue if necessary (for vmalloc).
 	 */
 	if (rcu_use_vmalloc(size)) {
 		out = vmalloc(HDRLEN_VMALLOC + size);
-		if (out) {
-			out += HDRLEN_VMALLOC;
-			container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 1;
-			container_of(out, struct ipc_rcu_hdr, data)->refcount = 1;
-		}
+		if (!out)
+			goto done;
+
+		out += HDRLEN_VMALLOC;
+		container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 1;
 	} else {
 		out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL);
-		if (out) {
-			out += HDRLEN_KMALLOC;
-			container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 0;
-			container_of(out, struct ipc_rcu_hdr, data)->refcount = 1;
-		}
+		if (!out)
+			goto done;
+
+		out += HDRLEN_KMALLOC;
+		container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 0;
 	}
 
+	/* set reference counter no matter what kind of allocation was done */
+	atomic_set(&container_of(out, struct ipc_rcu_hdr, data)->refcount, 1);
+done:
 	return out;
 }
 
-void ipc_rcu_getref(void *ptr)
+int ipc_rcu_getref(void *ptr)
 {
-	container_of(ptr, struct ipc_rcu_hdr, data)->refcount++;
+	return atomic_inc_not_zero(&container_of(ptr, struct ipc_rcu_hdr, data)->refcount);
 }
 
 static void ipc_do_vfree(struct work_struct *work)
@@ -578,7 +580,7 @@ static void ipc_schedule_free(struct rcu_head *head)
 
 void ipc_rcu_putref(void *ptr)
 {
-	if (--container_of(ptr, struct ipc_rcu_hdr, data)->refcount > 0)
+	if (!atomic_dec_and_test(&container_of(ptr, struct ipc_rcu_hdr, data)->refcount))
 		return;
 
 	if (container_of(ptr, struct ipc_rcu_hdr, data)->is_vmalloc) {
diff --git a/ipc/util.h b/ipc/util.h
index c36b9977c957..2b0bdd5d92ce 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -119,7 +119,7 @@ void ipc_free(void* ptr, int size);
  * to 0 schedules the rcu destruction. Caller must guarantee locking.
  */
 void* ipc_rcu_alloc(int size);
-void ipc_rcu_getref(void *ptr);
+int ipc_rcu_getref(void *ptr);
 void ipc_rcu_putref(void *ptr);
 
 struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);
-- 
cgit v1.2.3
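
[editorial aside] The ipc_rcu_getref() change above turns "take a reference" into "take a reference only if the object is still alive": once the count has hit zero, no new reference may resurrect the object. Below is a minimal userspace sketch of that pattern using C11 atomics; inc_not_zero() is a stand-in for the kernel's atomic_inc_not_zero(), not the kernel implementation itself.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Succeed only while refcount != 0, mirroring atomic_inc_not_zero().
 * Returns true if a reference was taken, like the new int-returning
 * ipc_rcu_getref().
 */
static bool inc_not_zero(atomic_int *refcount)
{
	int old = atomic_load(refcount);

	while (old != 0) {
		/* On failure the CAS reloads 'old', so we simply retry. */
		if (atomic_compare_exchange_weak(refcount, &old, old + 1))
			return true;
	}
	return false;	/* already dead; caller must not touch the object */
}

int main(void)
{
	atomic_int live = 1;	/* object still referenced */
	atomic_int dying = 0;	/* final put already happened */

	printf("live object : %s\n", inc_not_zero(&live) ? "ref taken" : "refused");
	printf("dying object: %s\n", inc_not_zero(&dying) ? "ref taken" : "refused");
	return 0;
}

The CAS loop is what makes the check-then-increment atomic; a plain load followed by an increment would race with the final ipc_rcu_putref() exactly as the old refcount++ did.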
Biederman" Cc: Peter Hurley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 35 ++++++++--------------------------- 1 file changed, 8 insertions(+), 27 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index 09a1f41e6595..d0c6d967b390 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -238,14 +238,9 @@ static inline void ss_del(struct msg_sender *mss) static void ss_wakeup(struct list_head *h, int kill) { - struct list_head *tmp; + struct msg_sender *mss, *t; - tmp = h->next; - while (tmp != h) { - struct msg_sender *mss; - - mss = list_entry(tmp, struct msg_sender, list); - tmp = tmp->next; + list_for_each_entry_safe(mss, t, h, list) { if (kill) mss->list.next = NULL; wake_up_process(mss->tsk); @@ -254,14 +249,9 @@ static void ss_wakeup(struct list_head *h, int kill) static void expunge_all(struct msg_queue *msq, int res) { - struct list_head *tmp; - - tmp = msq->q_receivers.next; - while (tmp != &msq->q_receivers) { - struct msg_receiver *msr; + struct msg_receiver *msr, *t; - msr = list_entry(tmp, struct msg_receiver, r_list); - tmp = tmp->next; + list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) { msr->r_msg = NULL; wake_up_process(msr->r_tsk); smp_mb(); @@ -279,7 +269,7 @@ static void expunge_all(struct msg_queue *msq, int res) */ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { - struct list_head *tmp; + struct msg_msg *msg, *t; struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); expunge_all(msq, -EIDRM); @@ -287,11 +277,7 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) msg_rmid(ns, msq); msg_unlock(msq); - tmp = msq->q_messages.next; - while (tmp != &msq->q_messages) { - struct msg_msg *msg = list_entry(tmp, struct msg_msg, m_list); - - tmp = tmp->next; + list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { atomic_dec(&ns->msg_hdrs); free_msg(msg); } @@ -604,14 +590,9 @@ static int testmsg(struct msg_msg *msg, long type, int mode) static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg) { - struct list_head *tmp; - - tmp = msq->q_receivers.next; - while (tmp != &msq->q_receivers) { - struct msg_receiver *msr; + struct msg_receiver *msr, *t; - msr = list_entry(tmp, struct msg_receiver, r_list); - tmp = tmp->next; + list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) { if (testmsg(msg, msr->r_msgtype, msr->r_mode) && !security_msg_queue_msgrcv(msq, msg, msr->r_tsk, msr->r_msgtype, msr->r_mode)) { -- cgit v1.2.3 From d69f3bad4675ac519d41ca2b11e1c00ca115cecd Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Tue, 30 Apr 2013 19:15:54 -0700 Subject: ipc: sysv shared memory limited to 8TiB Trying to run an application which was trying to put data into half of memory using shmget(), we found that having a shmall value below 8EiB-8TiB would prevent us from using anything more than 8TiB. By setting kernel.shmall greater than 8EiB-8TiB would make the job work. In the newseg() function, ns->shm_tot which, at 8TiB is INT_MAX. ipc/shm.c: 458 static int newseg(struct ipc_namespace *ns, struct ipc_params *params) 459 { ... 465 int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; ... 
From d69f3bad4675ac519d41ca2b11e1c00ca115cecd Mon Sep 17 00:00:00 2001
From: Robin Holt
Date: Tue, 30 Apr 2013 19:15:54 -0700
Subject: ipc: sysv shared memory limited to 8TiB

Trying to run an application which was trying to put data into half of
memory using shmget(), we found that having a shmall value below
8EiB-8TiB would prevent us from using anything more than 8TiB.  Setting
kernel.shmall greater than 8EiB-8TiB made the job work.

The cause is in the newseg() function: ns->shm_tot is an int, and 8TiB
worth of 4KiB pages is 2^31, which overflows INT_MAX.

ipc/shm.c:

458 static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
459 {
...
465	int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT;
...
474	if (ns->shm_tot + numpages > ns->shm_ctlall)
475		return -ENOSPC;

[akpm@linux-foundation.org: make ipc/shm.c:newseg()'s numpages size_t, not int]
Signed-off-by: Robin Holt
Reported-by: Alex Thorlton
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/ipc_namespace.h | 2 +-
 ipc/shm.c                     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index ae221a7b5092..c4d870b0d5e6 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -43,8 +43,8 @@ struct ipc_namespace {
 	size_t		shm_ctlmax;
 	size_t		shm_ctlall;
+	unsigned long	shm_tot;
 	int		shm_ctlmni;
-	int		shm_tot;
 	/*
 	 * Defines whether IPC_RMID is forced for _all_ shm segments regardless
 	 * of shmctl()
diff --git a/ipc/shm.c b/ipc/shm.c
index cb858df061d3..8247c49ec073 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -462,7 +462,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 	size_t size = params->u.size;
 	int error;
 	struct shmid_kernel *shp;
-	int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT;
+	size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	struct file * file;
 	char name[13];
 	int id;
-- 
cgit v1.2.3
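
[editorial aside] A minimal userspace sketch of the overflow the last patch fixes, assuming 4KiB pages (PAGE_SHIFT of 12, as on x86); this is plain C, not kernel code. Counting 8TiB in 4KiB pages needs 2^31, one more than INT_MAX, so an int page counter cannot represent it while a size_t holds it comfortably.

#include <limits.h>
#include <stdio.h>

#define PAGE_SHIFT 12			/* assumed: 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	size_t size = 8ULL << 40;	/* 8 TiB, as a shmget() request */

	/* Fixed code: size_t pages summed into an unsigned long total. */
	size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	/* Old code: numpages (and ns->shm_tot) were plain ints. The
	 * conversion is implementation-defined; on the usual
	 * two's-complement ABIs 2^31 wraps to a large negative value.
	 */
	int numpages_int = (int)numpages;

	printf("pages for 8 TiB : %zu (INT_MAX = %d)\n", numpages, INT_MAX);
	printf("same count as int: %d\n", numpages_int);
	return 0;
}

Once the int sum goes negative, its conversion to the unsigned type of ns->shm_ctlall for the comparison on line 474 produces an enormous value, so the -ENOSPC check fires regardless of how much memory is actually in use; that matches the reported symptom that only an extremely large shmall setting let the job run.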