From ce7231e92dac381f6e4f9cfdfdf9e0ea055223ad Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Tue, 13 May 2008 13:45:14 -0700 Subject: [PATCH 1/2] ocfs2: Add CONFIG_OCFS2_FS_STATS config option This patch adds config option CONFIG_OCFS2_FS_STATS to allow building the fs with instrumentation enabled. An upcoming patch will provide support to instrument cluster locking, which is a crucial overhead in a cluster file system. This config option allows users to avoid the cpu and memory overhead that is involved in gathering such statistics. Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/Kconfig | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/Kconfig b/fs/Kconfig index 2694648cbd1b..9c07a4f49e67 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -470,6 +470,14 @@ config OCFS2_FS_USERSPACE_CLUSTER It is safe to say Y, as the clustering method is run-time selectable. +config OCFS2_FS_STATS + bool "OCFS2 statistics" + depends on OCFS2_FS + default y + help + This option allows some fs statistics to be captured. Enabling + this option may increase the memory consumption. + config OCFS2_DEBUG_MASKLOG bool "OCFS2 logging support" depends on OCFS2_FS -- cgit v1.2.3 From 8ddb7b004dfa1832a750e199df8bff4b75b73565 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Tue, 13 May 2008 13:45:15 -0700 Subject: [PATCH 2/2] ocfs2: Instrument fs cluster locks This patch adds code to track the number of times the fs takes various cluster locks as well as the times associated with it. The information is made available to users via debugfs. This patch was originally written by Jan Kara . Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/ocfs2/dlmglue.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/ocfs2/ocfs2.h | 12 ++++++ 2 files changed, 133 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 80e20d9f2780..80537b769e41 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -31,6 +31,7 @@ #include #include #include +#include #define MLOG_MASK_PREFIX ML_DLM_GLUE #include @@ -59,6 +60,9 @@ struct ocfs2_mask_waiter { struct completion mw_complete; unsigned long mw_mask; unsigned long mw_goal; +#ifdef CONFIG_OCFS2_FS_STATS + unsigned long long mw_lock_start; +#endif }; static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); @@ -366,6 +370,75 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) spin_unlock(&ocfs2_dlm_tracking_lock); } +#ifdef CONFIG_OCFS2_FS_STATS +static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) +{ + res->l_lock_num_prmode = 0; + res->l_lock_num_prmode_failed = 0; + res->l_lock_total_prmode = 0; + res->l_lock_max_prmode = 0; + res->l_lock_num_exmode = 0; + res->l_lock_num_exmode_failed = 0; + res->l_lock_total_exmode = 0; + res->l_lock_max_exmode = 0; + res->l_lock_refresh = 0; +} + +static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level, + struct ocfs2_mask_waiter *mw, int ret) +{ + unsigned long long *num, *sum; + unsigned int *max, *failed; + struct timespec ts = current_kernel_time(); + unsigned long long time = timespec_to_ns(&ts) - mw->mw_lock_start; + + if (level == LKM_PRMODE) { + num = &res->l_lock_num_prmode; + sum = &res->l_lock_total_prmode; + max = &res->l_lock_max_prmode; + failed = &res->l_lock_num_prmode_failed; + } else if (level == LKM_EXMODE) { + num = &res->l_lock_num_exmode; + sum = &res->l_lock_total_exmode; + max = &res->l_lock_max_exmode; + failed = &res->l_lock_num_exmode_failed; + } else + return; + + (*num)++; + (*sum) += time; + if (time > *max) + *max = time; + if (ret) + (*failed)++; +} + +static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) +{ + lockres->l_lock_refresh++; +} + +static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) +{ + struct timespec ts = current_kernel_time(); + mw->mw_lock_start = timespec_to_ns(&ts); +} +#else +static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) +{ +} +static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, + int level, struct ocfs2_mask_waiter *mw, int ret) +{ +} +static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) +{ +} +static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) +{ +} +#endif + static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, struct ocfs2_lock_res *res, enum ocfs2_lock_type type, @@ -385,6 +458,8 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, res->l_flags = OCFS2_LOCK_INITIALIZED; ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug); + + ocfs2_init_lock_stats(res); } void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) @@ -1048,6 +1123,7 @@ static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw) { INIT_LIST_HEAD(&mw->mw_item); init_completion(&mw->mw_complete); + ocfs2_init_start_time(mw); } static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) @@ -1254,6 +1330,7 @@ out: goto again; mlog_errno(ret); } + ocfs2_update_lock_stats(lockres, level, &mw, ret); mlog_exit(ret); return ret; @@ -1983,6 +2060,7 @@ static int ocfs2_inode_lock_update(struct inode *inode, le32_to_cpu(fe->i_flags)); ocfs2_refresh_inode(inode, fe); + ocfs2_track_lock_refresh(lockres); } status = 0; @@ -2267,6 +2345,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb, if (status < 0) mlog_errno(status); + ocfs2_track_lock_refresh(lockres); } bail: mlog_exit(status); @@ -2461,7 +2540,7 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) } /* So that debugfs.ocfs2 can determine which format is being used */ -#define OCFS2_DLM_DEBUG_STR_VERSION 1 +#define OCFS2_DLM_DEBUG_STR_VERSION 2 static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) { int i; @@ -2502,6 +2581,47 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) for(i = 0; i < DLM_LVB_LEN; i++) seq_printf(m, "0x%x\t", lvb[i]); +#ifdef CONFIG_OCFS2_FS_STATS +# define lock_num_prmode(_l) (_l)->l_lock_num_prmode +# define lock_num_exmode(_l) (_l)->l_lock_num_exmode +# define lock_num_prmode_failed(_l) (_l)->l_lock_num_prmode_failed +# define lock_num_exmode_failed(_l) (_l)->l_lock_num_exmode_failed +# define lock_total_prmode(_l) (_l)->l_lock_total_prmode +# define lock_total_exmode(_l) (_l)->l_lock_total_exmode +# define lock_max_prmode(_l) (_l)->l_lock_max_prmode +# define lock_max_exmode(_l) (_l)->l_lock_max_exmode +# define lock_refresh(_l) (_l)->l_lock_refresh +#else +# define lock_num_prmode(_l) (0) +# define lock_num_exmode(_l) (0) +# define lock_num_prmode_failed(_l) (0) +# define lock_num_exmode_failed(_l) (0) +# define lock_total_prmode(_l) (0) +# define lock_total_exmode(_l) (0) +# define lock_max_prmode(_l) (0) +# define lock_max_exmode(_l) (0) +# define lock_refresh(_l) (0) +#endif + /* The following seq_print was added in version 2 of this output */ + seq_printf(m, "%llu\t" + "%llu\t" + "%u\t" + "%u\t" + "%llu\t" + "%llu\t" + "%u\t" + "%u\t" + "%u\t", + lock_num_prmode(lockres), + lock_num_exmode(lockres), + lock_num_prmode_failed(lockres), + lock_num_exmode_failed(lockres), + lock_total_prmode(lockres), + lock_total_exmode(lockres), + lock_max_prmode(lockres), + lock_max_exmode(lockres), + lock_refresh(lockres)); + /* End the line */ seq_printf(m, "\n"); return 0; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 31692379c170..1cb814be8ef1 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -132,6 +132,18 @@ struct ocfs2_lock_res { wait_queue_head_t l_event; struct list_head l_debug_list; + +#ifdef CONFIG_OCFS2_FS_STATS + unsigned long long l_lock_num_prmode; /* PR acquires */ + unsigned long long l_lock_num_exmode; /* EX acquires */ + unsigned int l_lock_num_prmode_failed; /* Failed PR gets */ + unsigned int l_lock_num_exmode_failed; /* Failed EX gets */ + unsigned long long l_lock_total_prmode; /* Tot wait for PR */ + unsigned long long l_lock_total_exmode; /* Tot wait for EX */ + unsigned int l_lock_max_prmode; /* Max wait for PR */ + unsigned int l_lock_max_exmode; /* Max wait for EX */ + unsigned int l_lock_refresh; /* Disk refreshes */ +#endif }; struct ocfs2_dlm_debug { -- cgit v1.2.3 From dd25e55ea133b14678cfaa9e205b082b24b26dbc Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 28 May 2008 14:41:00 -0700 Subject: ocfs2: fix printk format warnings with OCFS2_FS_STATS=n Fix printk format warnings when OCFS2_FS_STATS=n: linux-next-20080528/fs/ocfs2/dlmglue.c: In function 'ocfs2_dlm_seq_show': linux-next-20080528/fs/ocfs2/dlmglue.c:2623: warning: format '%llu' expects type 'long long unsigned int', but argument 3 has type 'int' linux-next-20080528/fs/ocfs2/dlmglue.c:2623: warning: format '%llu' expects type 'long long unsigned int', but argument 4 has type 'int' linux-next-20080528/fs/ocfs2/dlmglue.c:2623: warning: format '%llu' expects type 'long long unsigned int', but argument 7 has type 'int' linux-next-20080528/fs/ocfs2/dlmglue.c:2623: warning: format '%llu' expects type 'long long unsigned int', but argument 8 has type 'int' Signed-off-by: Randy Dunlap Signed-off-by: Mark Fasheh --- fs/ocfs2/dlmglue.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 80537b769e41..eae3d643a5e4 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2592,12 +2592,12 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) # define lock_max_exmode(_l) (_l)->l_lock_max_exmode # define lock_refresh(_l) (_l)->l_lock_refresh #else -# define lock_num_prmode(_l) (0) -# define lock_num_exmode(_l) (0) +# define lock_num_prmode(_l) (0ULL) +# define lock_num_exmode(_l) (0ULL) # define lock_num_prmode_failed(_l) (0) # define lock_num_exmode_failed(_l) (0) -# define lock_total_prmode(_l) (0) -# define lock_total_exmode(_l) (0) +# define lock_total_prmode(_l) (0ULL) +# define lock_total_exmode(_l) (0ULL) # define lock_max_prmode(_l) (0) # define lock_max_exmode(_l) (0) # define lock_refresh(_l) (0) -- cgit v1.2.3 From 7600c72b75bab374ad39b2a4799a0728579a8e2f Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 9 Jun 2008 16:34:23 -0700 Subject: ocfs2: use simple_read_from_buffer() Signed-off-by: Akinobu Mita Acked-by: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Mark Fasheh --- fs/ocfs2/stack_user.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index c021280dd462..24e0b19c8b69 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -549,26 +549,17 @@ static ssize_t ocfs2_control_read(struct file *file, size_t count, loff_t *ppos) { - char *proto_string = OCFS2_CONTROL_PROTO; - size_t to_write = 0; - - if (*ppos >= OCFS2_CONTROL_PROTO_LEN) - return 0; - - to_write = OCFS2_CONTROL_PROTO_LEN - *ppos; - if (to_write > count) - to_write = count; - if (copy_to_user(buf, proto_string + *ppos, to_write)) - return -EFAULT; + ssize_t ret; - *ppos += to_write; + ret = simple_read_from_buffer(buf, count, ppos, + OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN); /* Have we read the whole protocol list? */ - if (*ppos >= OCFS2_CONTROL_PROTO_LEN) + if (ret > 0 && *ppos >= OCFS2_CONTROL_PROTO_LEN) ocfs2_control_set_handshake_state(file, OCFS2_CONTROL_HANDSHAKE_READ); - return to_write; + return ret; } static int ocfs2_control_release(struct inode *inode, struct file *file) -- cgit v1.2.3 From 56753bd3b9220f6f2477eb1cf97f40c24e0a4c91 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Mon, 9 Jun 2008 11:24:41 -0700 Subject: ocfs2: Silence an error message in ocfs2_file_aio_read() This patch silences an EINVAL error message in ocfs2_file_aio_read() that is always due to a user error. Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/ocfs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 57e0d30cde98..e8514e8b6ce8 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2202,7 +2202,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); if (ret == -EINVAL) - mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); + mlog(0, "generic_file_aio_read returned -EINVAL\n"); /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); -- cgit v1.2.3 From 01af482037d32c215aab208a0b110ffe6fd782c0 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Tue, 10 Jun 2008 14:24:48 +0800 Subject: ocfs2: Handle error during journal load This patch ensures the mount fails if the fs is unable to load the journal. Signed-off-by: Wengang Wang Acked-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/ocfs2/super.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index df63ba20ae90..ccecfe5094fa 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1703,7 +1703,11 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) local = ocfs2_mount_local(osb); /* will play back anything left in the journal. */ - ocfs2_journal_load(osb->journal, local); + status = ocfs2_journal_load(osb->journal, local); + if (status < 0) { + mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status); + goto finally; + } if (dirty) { /* recover my local alloc if we didn't unmount cleanly. */ -- cgit v1.2.3 From 461c6a30eca6f25add1dadb9fd8a1d8e89a6e627 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Mon, 19 May 2008 16:23:37 -0700 Subject: ocfs2/net: Silence build warnings on sparc64 suseconds_t is type long on most arches except sparc64 where it is type int. This patch silences the following warnings that are generated when building on it. netdebug.c: In function 'nst_seq_show': netdebug.c:152: warning: format '%lu' expects type 'long unsigned int', but argument 13 has type 'suseconds_t' netdebug.c:152: warning: format '%lu' expects type 'long unsigned int', but argument 15 has type 'suseconds_t' netdebug.c:152: warning: format '%lu' expects type 'long unsigned int', but argument 17 has type 'suseconds_t' netdebug.c: In function 'sc_seq_show': netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 19 has type 'suseconds_t' netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 21 has type 'suseconds_t' netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 23 has type 'suseconds_t' netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 25 has type 'suseconds_t' netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 27 has type 'suseconds_t' netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 29 has type 'suseconds_t' Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/netdebug.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 7bf3c0ea7bd9..d8bfa0eb41b2 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -146,8 +146,10 @@ static int nst_seq_show(struct seq_file *seq, void *v) nst->st_task->comm, nst->st_node, nst->st_sc, nst->st_id, nst->st_msg_type, nst->st_msg_key, - nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec, - nst->st_send_time.tv_sec, nst->st_send_time.tv_usec, + nst->st_sock_time.tv_sec, + (unsigned long)nst->st_sock_time.tv_usec, + nst->st_send_time.tv_sec, + (unsigned long)nst->st_send_time.tv_usec, nst->st_status_time.tv_sec, nst->st_status_time.tv_usec); } @@ -274,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) return sc; /* unused, just needs to be null when done */ } -#define TV_SEC_USEC(TV) TV.tv_sec, TV.tv_usec +#define TV_SEC_USEC(TV) TV.tv_sec, (unsigned long)TV.tv_usec static int sc_seq_show(struct seq_file *seq, void *v) { -- cgit v1.2.3 From e407e39783a7206d20b3e9961aedf272de966e31 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Thu, 12 Jun 2008 22:35:39 -0700 Subject: ocfs2: Fix CONFIG_OCFS2_DEBUG_FS #ifdefs A couple places use OCFS2_DEBUG_FS where they really mean CONFIG_OCFS2_DEBUG_FS. Reported-by: Robert P. J. Day Signed-off-by: Joel Becker --- fs/ocfs2/journal.c | 2 +- fs/ocfs2/localalloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 9698338adc39..a8c19cb3cfdd 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -329,7 +329,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); -#ifdef OCFS2_DEBUG_FS +#ifdef CONFIG_OCFS2_DEBUG_FS status = 1; #else status = journal_extend(handle, nblocks); diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index be774bdc8b36..28e492e4ec88 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -498,7 +498,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; -#ifdef OCFS2_DEBUG_FS +#ifdef CONFIG_OCFS2_DEBUG_FS if (le32_to_cpu(alloc->id1.bitmap1.i_used) != ocfs2_local_alloc_count_bits(alloc)) { ocfs2_error(osb->sb, "local alloc inode %llu says it has " -- cgit v1.2.3 From fe9f387740ac7cb3b7c2fffa76807e997e6c6292 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Thu, 12 Jun 2008 22:39:18 -0700 Subject: ocfs2: Don't snprintf() without a format. Some system files are per-slot. Their names include the slot number. ocfs2_sprintf_system_inode_name() uses the system inode definitions to fill in the slot number with snprintf(). For global system files, there is no node number, and the name was printed as a format with no arguments. -Wformat-nonliteral and -Wformat-security don't like this. Instead, use a static "%s" format and the name as the argument. Signed-off-by: Joel Becker --- fs/ocfs2/ocfs2_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 52c426665154..3f1945177629 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -901,7 +901,7 @@ static inline int ocfs2_sprintf_system_inode_name(char *buf, int len, * list has a copy per slot. */ if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE) - chars = snprintf(buf, len, + chars = snprintf(buf, len, "%s", ocfs2_system_inodes[type].si_name); else chars = snprintf(buf, len, -- cgit v1.2.3 From 6f61076406251626be39651d114fac412b1e0c39 Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Mon, 16 Jun 2008 19:00:58 +0200 Subject: configfs: Introduce configfs_dirent_lock This patch introduces configfs_dirent_lock spinlock to protect configfs_dirent traversals against linkage mutations (add/del/move). This will allow configfs_detach_prep() to avoid locking i_mutexes. Locking rules for configfs_dirent linkage mutations are the same plus the requirement of taking configfs_dirent_lock. For configfs_dirent walking, one can either take appropriate i_mutex as before, or take configfs_dirent_lock. The spinlock could actually be a mutex, but the critical sections are either O(1) or should not be too long (default groups walking in last patch). ChangeLog: - Clarify the comment on configfs_dirent_lock usage - Move sd->s_element init before linking the new dirent - In lseek(), do not release configfs_dirent_lock before the dirent is relinked. Signed-off-by: Louis Rilling Signed-off-by: Joel Becker --- fs/configfs/configfs_internal.h | 3 +++ fs/configfs/dir.c | 28 +++++++++++++++++++++++++++- fs/configfs/inode.c | 2 ++ fs/configfs/symlink.c | 2 ++ 4 files changed, 34 insertions(+), 1 deletion(-) diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index cca98609aa7f..5a33b58e66da 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -26,6 +26,7 @@ #include #include +#include struct configfs_dirent { atomic_t s_count; @@ -49,6 +50,8 @@ struct configfs_dirent { #define CONFIGFS_USET_DROPPING 0x0100 #define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) +extern spinlock_t configfs_dirent_lock; + extern struct vfsmount * configfs_mount; extern struct kmem_cache *configfs_dir_cachep; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index a48dc7dd8765..2619f485bc3d 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -35,6 +35,14 @@ #include "configfs_internal.h" DECLARE_RWSEM(configfs_rename_sem); +/* + * Protects mutations of configfs_dirent linkage together with proper i_mutex + * Mutators of configfs_dirent linkage must *both* have the proper inode locked + * and configfs_dirent_lock locked, in that order. + * This allows one to safely traverse configfs_dirent trees without having to + * lock inodes. + */ +DEFINE_SPINLOCK(configfs_dirent_lock); static void configfs_d_iput(struct dentry * dentry, struct inode * inode) @@ -79,8 +87,10 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare atomic_set(&sd->s_count, 1); INIT_LIST_HEAD(&sd->s_links); INIT_LIST_HEAD(&sd->s_children); - list_add(&sd->s_sibling, &parent_sd->s_children); sd->s_element = element; + spin_lock(&configfs_dirent_lock); + list_add(&sd->s_sibling, &parent_sd->s_children); + spin_unlock(&configfs_dirent_lock); return sd; } @@ -173,7 +183,9 @@ static int create_dir(struct config_item * k, struct dentry * p, } else { struct configfs_dirent *sd = d->d_fsdata; if (sd) { + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_put(sd); } } @@ -224,7 +236,9 @@ int configfs_create_link(struct configfs_symlink *sl, else { struct configfs_dirent *sd = dentry->d_fsdata; if (sd) { + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_put(sd); } } @@ -238,7 +252,9 @@ static void remove_dir(struct dentry * d) struct configfs_dirent * sd; sd = d->d_fsdata; + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_put(sd); if (d->d_inode) simple_rmdir(parent->d_inode,d); @@ -410,7 +426,9 @@ static void detach_attrs(struct config_item * item) list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED)) continue; + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_drop_dentry(sd, dentry); configfs_put(sd); } @@ -1268,7 +1286,9 @@ static int configfs_dir_close(struct inode *inode, struct file *file) struct configfs_dirent * cursor = file->private_data; mutex_lock(&dentry->d_inode->i_mutex); + spin_lock(&configfs_dirent_lock); list_del_init(&cursor->s_sibling); + spin_unlock(&configfs_dirent_lock); mutex_unlock(&dentry->d_inode->i_mutex); release_configfs_dirent(cursor); @@ -1308,7 +1328,9 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir /* fallthrough */ default: if (filp->f_pos == 2) { + spin_lock(&configfs_dirent_lock); list_move(q, &parent_sd->s_children); + spin_unlock(&configfs_dirent_lock); } for (p=q->next; p!= &parent_sd->s_children; p=p->next) { struct configfs_dirent *next; @@ -1331,7 +1353,9 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir dt_type(next)) < 0) return 0; + spin_lock(&configfs_dirent_lock); list_move(q, p); + spin_unlock(&configfs_dirent_lock); p = q; filp->f_pos++; } @@ -1362,6 +1386,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) struct list_head *p; loff_t n = file->f_pos - 2; + spin_lock(&configfs_dirent_lock); list_del(&cursor->s_sibling); p = sd->s_children.next; while (n && p != &sd->s_children) { @@ -1373,6 +1398,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) p = p->next; } list_add_tail(&cursor->s_sibling, p); + spin_unlock(&configfs_dirent_lock); } } mutex_unlock(&dentry->d_inode->i_mutex); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index b9a1d810346d..4803ccc94480 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -247,7 +247,9 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) if (!sd->s_element) continue; if (!strcmp(configfs_get_name(sd), name)) { + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_drop_dentry(sd, dir); configfs_put(sd); break; diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 2a731ef5f305..676c84c416da 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -169,7 +169,9 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry) parent_item = configfs_get_config_item(dentry->d_parent); type = parent_item->ci_type; + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_drop_dentry(sd, dentry->d_parent); dput(dentry); configfs_put(sd); -- cgit v1.2.3 From 5301a77da2da1e4c22573e0e8d394a653b8ad9f9 Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Mon, 16 Jun 2008 19:00:59 +0200 Subject: configfs: Protect configfs_dirent s_links list mutations Symlinks to a config_item are listed under its configfs_dirent s_links, but the list mutations are not protected by any common lock. This patch uses the configfs_dirent_lock spinlock to add the necessary protection. Note: we should also protect the list_empty() test in configfs_detach_prep() but 1/ the lock should not be released immediately because nothing would prevent the list from being filled after a successful list_empty() test, making the problem tricky, 2/ this will be solved by the rmdir() vs rename() deadlock bugfix. Signed-off-by: Louis Rilling Signed-off-by: Joel Becker --- fs/configfs/dir.c | 5 +++-- fs/configfs/symlink.c | 8 ++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 2619f485bc3d..a08e5c2f25e8 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -37,10 +37,11 @@ DECLARE_RWSEM(configfs_rename_sem); /* * Protects mutations of configfs_dirent linkage together with proper i_mutex + * Also protects mutations of symlinks linkage to target configfs_dirent * Mutators of configfs_dirent linkage must *both* have the proper inode locked * and configfs_dirent_lock locked, in that order. - * This allows one to safely traverse configfs_dirent trees without having to - * lock inodes. + * This allows one to safely traverse configfs_dirent trees and symlinks without + * having to lock inodes. */ DEFINE_SPINLOCK(configfs_dirent_lock); diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 676c84c416da..faeb4417a10d 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -77,12 +77,15 @@ static int create_link(struct config_item *parent_item, sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL); if (sl) { sl->sl_target = config_item_get(item); - /* FIXME: needs a lock, I'd bet */ + spin_lock(&configfs_dirent_lock); list_add(&sl->sl_list, &target_sd->s_links); + spin_unlock(&configfs_dirent_lock); ret = configfs_create_link(sl, parent_item->ci_dentry, dentry); if (ret) { + spin_lock(&configfs_dirent_lock); list_del_init(&sl->sl_list); + spin_unlock(&configfs_dirent_lock); config_item_put(item); kfree(sl); } @@ -186,8 +189,9 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry) type->ct_item_ops->drop_link(parent_item, sl->sl_target); - /* FIXME: Needs lock */ + spin_lock(&configfs_dirent_lock); list_del_init(&sl->sl_list); + spin_unlock(&configfs_dirent_lock); /* Put reference from create_link() */ config_item_put(sl->sl_target); -- cgit v1.2.3 From 107ed40bd070df5e4a0a012042c45c40963dc574 Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Mon, 16 Jun 2008 19:01:00 +0200 Subject: configfs: Make configfs_new_dirent() return error code instead of NULL This patch makes configfs_new_dirent return negative error code instead of NULL, which will be useful in the next patch to differentiate ENOMEM from ENOENT. Signed-off-by: Louis Rilling Signed-off-by: Joel Becker --- fs/configfs/dir.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index a08e5c2f25e8..918a332babfe 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "configfs_internal.h" @@ -83,7 +84,7 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL); if (!sd) - return NULL; + return ERR_PTR(-ENOMEM); atomic_set(&sd->s_count, 1); INIT_LIST_HEAD(&sd->s_links); @@ -129,8 +130,8 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd, struct configfs_dirent * sd; sd = configfs_new_dirent(parent_sd, element); - if (!sd) - return -ENOMEM; + if (IS_ERR(sd)) + return PTR_ERR(sd); sd->s_mode = mode; sd->s_type = type; @@ -1277,7 +1278,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) file->private_data = configfs_new_dirent(parent_sd, NULL); mutex_unlock(&dentry->d_inode->i_mutex); - return file->private_data ? 0 : -ENOMEM; + return IS_ERR(file->private_data) ? PTR_ERR(file->private_data) : 0; } -- cgit v1.2.3 From b3e76af87441fc36eef3516d73ab2314e7b2d911 Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Mon, 16 Jun 2008 19:01:01 +0200 Subject: configfs: Fix deadlock with racing rmdir() and rename() This patch fixes the deadlock between racing sys_rename() and configfs_rmdir(). The idea is to avoid locking i_mutexes of default groups in configfs_detach_prep(), and rely instead on the new configfs_dirent_lock to protect against configfs_dirent's linkage mutations. To ensure that an mkdir() racing with rmdir() will not create new items in a to-be-removed default group, we make configfs_new_dirent() check for the CONFIGFS_USET_DROPPING flag right before linking the new dirent, and return error if the flag is set. This makes racing mkdir()/symlink()/dir_open() fail in places where errors could already happen, resp. in (attach_item()|attach_group())/create_link()/new_dirent(). configfs_depend() remains safe since it locks all the path from configfs root, and is thus mutually exclusive with rmdir(). An advantage of this is that now detach_groups() unconditionnaly takes the default groups i_mutex, which makes it more consistent with populate_groups(). Signed-off-by: Louis Rilling Signed-off-by: Joel Becker --- fs/configfs/dir.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 918a332babfe..d5b5985716ba 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -43,6 +43,10 @@ DECLARE_RWSEM(configfs_rename_sem); * and configfs_dirent_lock locked, in that order. * This allows one to safely traverse configfs_dirent trees and symlinks without * having to lock inodes. + * + * Protects setting of CONFIGFS_USET_DROPPING: checking the flag + * unlocked is not reliable unless in detach_groups() called from + * rmdir()/unregister() and from configfs_attach_group() */ DEFINE_SPINLOCK(configfs_dirent_lock); @@ -91,6 +95,11 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare INIT_LIST_HEAD(&sd->s_children); sd->s_element = element; spin_lock(&configfs_dirent_lock); + if (parent_sd->s_type & CONFIGFS_USET_DROPPING) { + spin_unlock(&configfs_dirent_lock); + kmem_cache_free(configfs_dir_cachep, sd); + return ERR_PTR(-ENOENT); + } list_add(&sd->s_sibling, &parent_sd->s_children); spin_unlock(&configfs_dirent_lock); @@ -349,11 +358,11 @@ static struct dentry * configfs_lookup(struct inode *dir, /* * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are - * attributes and are removed by rmdir(). We recurse, taking i_mutex - * on all children that are candidates for default detach. If the - * result is clean, then configfs_detach_group() will handle dropping - * i_mutex. If there is an error, the caller will clean up the i_mutex - * holders via configfs_detach_rollback(). + * attributes and are removed by rmdir(). We recurse, setting + * CONFIGFS_USET_DROPPING on all children that are candidates for + * default detach. + * If there is an error, the caller will reset the flags via + * configfs_detach_rollback(). */ static int configfs_detach_prep(struct dentry *dentry) { @@ -370,8 +379,7 @@ static int configfs_detach_prep(struct dentry *dentry) if (sd->s_type & CONFIGFS_NOT_PINNED) continue; if (sd->s_type & CONFIGFS_USET_DEFAULT) { - mutex_lock(&sd->s_dentry->d_inode->i_mutex); - /* Mark that we've taken i_mutex */ + /* Mark that we're trying to drop the group */ sd->s_type |= CONFIGFS_USET_DROPPING; /* @@ -392,7 +400,7 @@ out: } /* - * Walk the tree, dropping i_mutex wherever CONFIGFS_USET_DROPPING is + * Walk the tree, resetting CONFIGFS_USET_DROPPING wherever it was * set. */ static void configfs_detach_rollback(struct dentry *dentry) @@ -403,11 +411,7 @@ static void configfs_detach_rollback(struct dentry *dentry) list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (sd->s_type & CONFIGFS_USET_DEFAULT) { configfs_detach_rollback(sd->s_dentry); - - if (sd->s_type & CONFIGFS_USET_DROPPING) { - sd->s_type &= ~CONFIGFS_USET_DROPPING; - mutex_unlock(&sd->s_dentry->d_inode->i_mutex); - } + sd->s_type &= ~CONFIGFS_USET_DROPPING; } } } @@ -486,16 +490,12 @@ static void detach_groups(struct config_group *group) child = sd->s_dentry; + mutex_lock(&child->d_inode->i_mutex); + configfs_detach_group(sd->s_element); child->d_inode->i_flags |= S_DEAD; - /* - * From rmdir/unregister, a configfs_detach_prep() pass - * has taken our i_mutex for us. Drop it. - * From mkdir/register cleanup, there is no sem held. - */ - if (sd->s_type & CONFIGFS_USET_DROPPING) - mutex_unlock(&child->d_inode->i_mutex); + mutex_unlock(&child->d_inode->i_mutex); d_delete(child); dput(child); @@ -1181,12 +1181,15 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) return -EINVAL; } + spin_lock(&configfs_dirent_lock); ret = configfs_detach_prep(dentry); if (ret) { configfs_detach_rollback(dentry); + spin_unlock(&configfs_dirent_lock); config_item_put(parent_item); return ret; } + spin_unlock(&configfs_dirent_lock); /* Get a working ref for the duration of this function */ item = configfs_get_config_item(dentry); @@ -1476,9 +1479,11 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex, I_MUTEX_PARENT); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + spin_lock(&configfs_dirent_lock); if (configfs_detach_prep(dentry)) { printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); } + spin_unlock(&configfs_dirent_lock); configfs_detach_group(&group->cg_item); dentry->d_inode->i_flags |= S_DEAD; mutex_unlock(&dentry->d_inode->i_mutex); -- cgit v1.2.3 From 6d8344baee99402de58b5fa5dfea197242955c15 Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Mon, 16 Jun 2008 19:01:02 +0200 Subject: configfs: Fix failing mkdir() making racing rmdir() fail When fixing the rename() vs rmdir() deadlock, we stopped locking default groups' inodes in configfs_detach_prep(), letting racing mkdir() in default groups proceed concurrently. This enables races like below happen, which leads to a failing mkdir() making rmdir() fail, despite the group to remove having no user-created directory under it in the end. process A: process B: /* PWD=A/B */ mkdir("C") make_item("C") attach_group("C") rmdir("A") detach_prep("A") detach_prep("B") error because of "C" return -ENOTEMPTY attach_group("C/D") error (eg -ENOMEM) return -ENOMEM This patch prevents such scenarii by making rmdir() wait as long as detach_prep() fails because a racing mkdir() is in the middle of attach_group(). To achieve this, mkdir() sets a flag CONFIGFS_USET_IN_MKDIR in parent's configfs_dirent before calling attach_group(), and clears the flag once attach_group() is done. detach_prep() fails with -EAGAIN whenever the flag is hit and returns the guilty inode's mutex so that rmdir() can wait on it. Signed-off-by: Louis Rilling Signed-off-by: Joel Becker --- fs/configfs/configfs_internal.h | 1 + fs/configfs/dir.c | 53 +++++++++++++++++++++++++++++++++-------- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index 5a33b58e66da..da015c12e3ea 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -48,6 +48,7 @@ struct configfs_dirent { #define CONFIGFS_USET_DIR 0x0040 #define CONFIGFS_USET_DEFAULT 0x0080 #define CONFIGFS_USET_DROPPING 0x0100 +#define CONFIGFS_USET_IN_MKDIR 0x0200 #define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) extern spinlock_t configfs_dirent_lock; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index d5b5985716ba..614e382a6049 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -364,7 +364,7 @@ static struct dentry * configfs_lookup(struct inode *dir, * If there is an error, the caller will reset the flags via * configfs_detach_rollback(). */ -static int configfs_detach_prep(struct dentry *dentry) +static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex) { struct configfs_dirent *parent_sd = dentry->d_fsdata; struct configfs_dirent *sd; @@ -379,6 +379,12 @@ static int configfs_detach_prep(struct dentry *dentry) if (sd->s_type & CONFIGFS_NOT_PINNED) continue; if (sd->s_type & CONFIGFS_USET_DEFAULT) { + /* Abort if racing with mkdir() */ + if (sd->s_type & CONFIGFS_USET_IN_MKDIR) { + if (wait_mutex) + *wait_mutex = &sd->s_dentry->d_inode->i_mutex; + return -EAGAIN; + } /* Mark that we're trying to drop the group */ sd->s_type |= CONFIGFS_USET_DROPPING; @@ -386,7 +392,7 @@ static int configfs_detach_prep(struct dentry *dentry) * Yup, recursive. If there's a problem, blame * deep nesting of default_groups */ - ret = configfs_detach_prep(sd->s_dentry); + ret = configfs_detach_prep(sd->s_dentry, wait_mutex); if (!ret) continue; } else @@ -1113,11 +1119,26 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) */ module_got = 1; + /* + * Make racing rmdir() fail if it did not tag parent with + * CONFIGFS_USET_DROPPING + * Note: if CONFIGFS_USET_DROPPING is already set, attach_group() will + * fail and let rmdir() terminate correctly + */ + spin_lock(&configfs_dirent_lock); + /* This will make configfs_detach_prep() fail */ + sd->s_type |= CONFIGFS_USET_IN_MKDIR; + spin_unlock(&configfs_dirent_lock); + if (group) ret = configfs_attach_group(parent_item, item, dentry); else ret = configfs_attach_item(parent_item, item, dentry); + spin_lock(&configfs_dirent_lock); + sd->s_type &= ~CONFIGFS_USET_IN_MKDIR; + spin_unlock(&configfs_dirent_lock); + out_unlink: if (ret) { /* Tear down everything we built up */ @@ -1182,13 +1203,25 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) } spin_lock(&configfs_dirent_lock); - ret = configfs_detach_prep(dentry); - if (ret) { - configfs_detach_rollback(dentry); - spin_unlock(&configfs_dirent_lock); - config_item_put(parent_item); - return ret; - } + do { + struct mutex *wait_mutex; + + ret = configfs_detach_prep(dentry, &wait_mutex); + if (ret) { + configfs_detach_rollback(dentry); + spin_unlock(&configfs_dirent_lock); + if (ret != -EAGAIN) { + config_item_put(parent_item); + return ret; + } + + /* Wait until the racing operation terminates */ + mutex_lock(wait_mutex); + mutex_unlock(wait_mutex); + + spin_lock(&configfs_dirent_lock); + } + } while (ret == -EAGAIN); spin_unlock(&configfs_dirent_lock); /* Get a working ref for the duration of this function */ @@ -1480,7 +1513,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) I_MUTEX_PARENT); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); spin_lock(&configfs_dirent_lock); - if (configfs_detach_prep(dentry)) { + if (configfs_detach_prep(dentry, NULL)) { printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); } spin_unlock(&configfs_dirent_lock); -- cgit v1.2.3 From 11c3b79218390a139f2d474ee1e983a672d5839a Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Thu, 12 Jun 2008 14:00:18 -0700 Subject: configfs: Allow ->make_item() and ->make_group() to return detailed errors. The configfs operations ->make_item() and ->make_group() currently return a new item/group. A return of NULL signifies an error. Because of this, -ENOMEM is the only return code bubbled up the stack. Multiple folks have requested the ability to return specific error codes when these operations fail. This patch adds that ability by changing the ->make_item/group() ops to return an int. Also updated are the in-kernel users of configfs. Signed-off-by: Joel Becker --- Documentation/filesystems/configfs/configfs.txt | 10 +++-- .../filesystems/configfs/configfs_example.c | 14 ++++--- drivers/net/netconsole.c | 10 +++-- fs/configfs/dir.c | 13 +++---- fs/dlm/config.c | 45 ++++++++++++++-------- fs/ocfs2/cluster/heartbeat.c | 17 ++++---- fs/ocfs2/cluster/nodemanager.c | 45 ++++++++++++++-------- include/linux/configfs.h | 4 +- 8 files changed, 94 insertions(+), 64 deletions(-) diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt index 44c97e6accb2..15838d706ea2 100644 --- a/Documentation/filesystems/configfs/configfs.txt +++ b/Documentation/filesystems/configfs/configfs.txt @@ -233,10 +233,12 @@ accomplished via the group operations specified on the group's config_item_type. struct configfs_group_operations { - struct config_item *(*make_item)(struct config_group *group, - const char *name); - struct config_group *(*make_group)(struct config_group *group, - const char *name); + int (*make_item)(struct config_group *group, + const char *name, + struct config_item **new_item); + int (*make_group)(struct config_group *group, + const char *name, + struct config_group **new_group); int (*commit_item)(struct config_item *item); void (*disconnect_notify)(struct config_group *group, struct config_item *item); diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c index 25151fd5c2c6..0b422acd470c 100644 --- a/Documentation/filesystems/configfs/configfs_example.c +++ b/Documentation/filesystems/configfs/configfs_example.c @@ -273,13 +273,13 @@ static inline struct simple_children *to_simple_children(struct config_item *ite return item ? container_of(to_config_group(item), struct simple_children, group) : NULL; } -static struct config_item *simple_children_make_item(struct config_group *group, const char *name) +static int simple_children_make_item(struct config_group *group, const char *name, struct config_item **new_item) { struct simple_child *simple_child; simple_child = kzalloc(sizeof(struct simple_child), GFP_KERNEL); if (!simple_child) - return NULL; + return -ENOMEM; config_item_init_type_name(&simple_child->item, name, @@ -287,7 +287,8 @@ static struct config_item *simple_children_make_item(struct config_group *group, simple_child->storeme = 0; - return &simple_child->item; + *new_item = &simple_child->item; + return 0; } static struct configfs_attribute simple_children_attr_description = { @@ -359,20 +360,21 @@ static struct configfs_subsystem simple_children_subsys = { * children of its own. */ -static struct config_group *group_children_make_group(struct config_group *group, const char *name) +static int group_children_make_group(struct config_group *group, const char *name, struct config_group **new_group) { struct simple_children *simple_children; simple_children = kzalloc(sizeof(struct simple_children), GFP_KERNEL); if (!simple_children) - return NULL; + return -ENOMEM; config_group_init_type_name(&simple_children->group, name, &simple_children_type); - return &simple_children->group; + *new_group = &simple_children->group; + return 0; } static struct configfs_attribute group_children_attr_description = { diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index 665341e43055..387a13395015 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -585,8 +585,9 @@ static struct config_item_type netconsole_target_type = { * Group operations and type for netconsole_subsys. */ -static struct config_item *make_netconsole_target(struct config_group *group, - const char *name) +static int make_netconsole_target(struct config_group *group, + const char *name, + struct config_item **new_item) { unsigned long flags; struct netconsole_target *nt; @@ -598,7 +599,7 @@ static struct config_item *make_netconsole_target(struct config_group *group, nt = kzalloc(sizeof(*nt), GFP_KERNEL); if (!nt) { printk(KERN_ERR "netconsole: failed to allocate memory\n"); - return NULL; + return -ENOMEM; } nt->np.name = "netconsole"; @@ -615,7 +616,8 @@ static struct config_item *make_netconsole_target(struct config_group *group, list_add(&nt->list, &target_list); spin_unlock_irqrestore(&target_list_lock, flags); - return &nt->item; + *new_item = &nt->item; + return 0; } static void drop_netconsole_target(struct config_group *group, diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 614e382a6049..0e64312a084c 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1073,25 +1073,24 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) group = NULL; item = NULL; if (type->ct_group_ops->make_group) { - group = type->ct_group_ops->make_group(to_config_group(parent_item), name); - if (group) { + ret = type->ct_group_ops->make_group(to_config_group(parent_item), name, &group); + if (!ret) { link_group(to_config_group(parent_item), group); item = &group->cg_item; } } else { - item = type->ct_group_ops->make_item(to_config_group(parent_item), name); - if (item) + ret = type->ct_group_ops->make_item(to_config_group(parent_item), name, &item); + if (!ret) link_obj(parent_item, item); } mutex_unlock(&subsys->su_mutex); kfree(name); - if (!item) { + if (ret) { /* - * If item == NULL, then link_obj() was never called. + * If ret != 0, then link_obj() was never called. * There are no extra references to clean up. */ - ret = -ENOMEM; goto out_put; } diff --git a/fs/dlm/config.c b/fs/dlm/config.c index eac23bd288b2..492d8caaaf25 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -41,16 +41,20 @@ struct comm; struct nodes; struct node; -static struct config_group *make_cluster(struct config_group *, const char *); +static int make_cluster(struct config_group *, const char *, + struct config_group **); static void drop_cluster(struct config_group *, struct config_item *); static void release_cluster(struct config_item *); -static struct config_group *make_space(struct config_group *, const char *); +static int make_space(struct config_group *, const char *, + struct config_group **); static void drop_space(struct config_group *, struct config_item *); static void release_space(struct config_item *); -static struct config_item *make_comm(struct config_group *, const char *); +static int make_comm(struct config_group *, const char *, + struct config_item **); static void drop_comm(struct config_group *, struct config_item *); static void release_comm(struct config_item *); -static struct config_item *make_node(struct config_group *, const char *); +static int make_node(struct config_group *, const char *, + struct config_item **); static void drop_node(struct config_group *, struct config_item *); static void release_node(struct config_item *); @@ -392,8 +396,8 @@ static struct node *to_node(struct config_item *i) return i ? container_of(i, struct node, item) : NULL; } -static struct config_group *make_cluster(struct config_group *g, - const char *name) +static int make_cluster(struct config_group *g, const char *name, + struct config_group **new_g) { struct cluster *cl = NULL; struct spaces *sps = NULL; @@ -431,14 +435,15 @@ static struct config_group *make_cluster(struct config_group *g, space_list = &sps->ss_group; comm_list = &cms->cs_group; - return &cl->group; + *new_g = &cl->group; + return 0; fail: kfree(cl); kfree(gps); kfree(sps); kfree(cms); - return NULL; + return -ENOMEM; } static void drop_cluster(struct config_group *g, struct config_item *i) @@ -466,7 +471,8 @@ static void release_cluster(struct config_item *i) kfree(cl); } -static struct config_group *make_space(struct config_group *g, const char *name) +static int make_space(struct config_group *g, const char *name, + struct config_group **new_g) { struct space *sp = NULL; struct nodes *nds = NULL; @@ -489,13 +495,14 @@ static struct config_group *make_space(struct config_group *g, const char *name) INIT_LIST_HEAD(&sp->members); mutex_init(&sp->members_lock); sp->members_count = 0; - return &sp->group; + *new_g = &sp->group; + return 0; fail: kfree(sp); kfree(gps); kfree(nds); - return NULL; + return -ENOMEM; } static void drop_space(struct config_group *g, struct config_item *i) @@ -522,19 +529,21 @@ static void release_space(struct config_item *i) kfree(sp); } -static struct config_item *make_comm(struct config_group *g, const char *name) +static int make_comm(struct config_group *g, const char *name, + struct config_item **new_i) { struct comm *cm; cm = kzalloc(sizeof(struct comm), GFP_KERNEL); if (!cm) - return NULL; + return -ENOMEM; config_item_init_type_name(&cm->item, name, &comm_type); cm->nodeid = -1; cm->local = 0; cm->addr_count = 0; - return &cm->item; + *new_i = &cm->item; + return 0; } static void drop_comm(struct config_group *g, struct config_item *i) @@ -554,14 +563,15 @@ static void release_comm(struct config_item *i) kfree(cm); } -static struct config_item *make_node(struct config_group *g, const char *name) +static int make_node(struct config_group *g, const char *name, + struct config_item **new_i) { struct space *sp = to_space(g->cg_item.ci_parent); struct node *nd; nd = kzalloc(sizeof(struct node), GFP_KERNEL); if (!nd) - return NULL; + return -ENOMEM; config_item_init_type_name(&nd->item, name, &node_type); nd->nodeid = -1; @@ -573,7 +583,8 @@ static struct config_item *make_node(struct config_group *g, const char *name) sp->members_count++; mutex_unlock(&sp->members_lock); - return &nd->item; + *new_i = &nd->item; + return 0; } static void drop_node(struct config_group *g, struct config_item *i) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f02ccb34604d..443d108211ab 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1489,25 +1489,28 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group : NULL; } -static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, - const char *name) +static int o2hb_heartbeat_group_make_item(struct config_group *group, + const char *name, + struct config_item **new_item) { struct o2hb_region *reg = NULL; - struct config_item *ret = NULL; + int ret = 0; reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); - if (reg == NULL) - goto out; /* ENOMEM */ + if (reg == NULL) { + ret = -ENOMEM; + goto out; + } config_item_init_type_name(®->hr_item, name, &o2hb_region_type); - ret = ®->hr_item; + *new_item = ®->hr_item; spin_lock(&o2hb_live_lock); list_add_tail(®->hr_all_item, &o2hb_all_regions); spin_unlock(&o2hb_live_lock); out: - if (ret == NULL) + if (ret) kfree(reg); return ret; diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index cfdb08b484ed..b364b7052e46 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -644,27 +644,32 @@ out: return ret; } -static struct config_item *o2nm_node_group_make_item(struct config_group *group, - const char *name) +static int o2nm_node_group_make_item(struct config_group *group, + const char *name, + struct config_item **new_item) { struct o2nm_node *node = NULL; - struct config_item *ret = NULL; + int ret = 0; - if (strlen(name) > O2NM_MAX_NAME_LEN) - goto out; /* ENAMETOOLONG */ + if (strlen(name) > O2NM_MAX_NAME_LEN) { + ret = -ENAMETOOLONG; + goto out; + } node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL); - if (node == NULL) - goto out; /* ENOMEM */ + if (node == NULL) { + ret = -ENOMEM; + goto out; + } strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */ config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); spin_lock_init(&node->nd_lock); - ret = &node->nd_item; + *new_item = &node->nd_item; out: - if (ret == NULL) + if (ret) kfree(node); return ret; @@ -751,25 +756,31 @@ static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *gro } #endif -static struct config_group *o2nm_cluster_group_make_group(struct config_group *group, - const char *name) +static int o2nm_cluster_group_make_group(struct config_group *group, + const char *name, + struct config_group **new_group) { struct o2nm_cluster *cluster = NULL; struct o2nm_node_group *ns = NULL; - struct config_group *o2hb_group = NULL, *ret = NULL; + struct config_group *o2hb_group = NULL; void *defs = NULL; + int ret = 0; /* this runs under the parent dir's i_mutex; there can be only * one caller in here at a time */ - if (o2nm_single_cluster) - goto out; /* ENOSPC */ + if (o2nm_single_cluster) { + ret = -ENOSPC; + goto out; + } cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); o2hb_group = o2hb_alloc_hb_set(); - if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) + if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) { + ret = -ENOMEM; goto out; + } config_group_init_type_name(&cluster->cl_group, name, &o2nm_cluster_type); @@ -786,11 +797,11 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT; - ret = &cluster->cl_group; + *new_group = &cluster->cl_group; o2nm_single_cluster = cluster; out: - if (ret == NULL) { + if (ret) { kfree(cluster); kfree(ns); o2hb_free_hb_set(o2hb_group); diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 3ae65b1bf90f..0488f937634a 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -165,8 +165,8 @@ struct configfs_item_operations { }; struct configfs_group_operations { - struct config_item *(*make_item)(struct config_group *group, const char *name); - struct config_group *(*make_group)(struct config_group *group, const char *name); + int (*make_item)(struct config_group *group, const char *name, struct config_item **new_item); + int (*make_group)(struct config_group *group, const char *name, struct config_group **new_group); int (*commit_item)(struct config_item *item); void (*disconnect_notify)(struct config_group *group, struct config_item *item); void (*drop_item)(struct config_group *group, struct config_item *item); -- cgit v1.2.3 From e75206517504461778c283b942440ef312e437d5 Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Thu, 12 Jun 2008 17:26:47 +0200 Subject: configfs: call drop_link() to cleanup after create_link() failure When allow_link() succeeds but create_link() fails, the subsystem is not informed of the failure. This patch fixes this by calling drop_link() on create_link() failures. Signed-off-by: Louis Rilling Signed-off-by: Joel Becker --- fs/configfs/symlink.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index faeb4417a10d..0004d18c40ac 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -140,8 +140,12 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna goto out_put; ret = type->ct_item_ops->allow_link(parent_item, target_item); - if (!ret) + if (!ret) { ret = create_link(parent_item, target_item, dentry); + if (ret && type->ct_item_ops->drop_link) + type->ct_item_ops->drop_link(parent_item, + target_item); + } config_item_put(target_item); path_put(&nd.path); -- cgit v1.2.3 From c0420ad2ca514551ca086510b0e7d17a05c70492 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 30 Jun 2008 18:45:45 +0800 Subject: [PATCH] ocfs2: fix oops in mmap_truncate testing This patch fixes a mmap_truncate bug which was found by ocfs2 test suite. In an ocfs2 cluster more than 1 node, run program mmap_truncate, which races mmap writes and truncates from multiple processes. While the test is running, a stat from another node forces writeout, causing an oops in ocfs2_get_block() because it sees a buffer to write which isn't allocated. This patch fixed the bug by clear dirty and uptodate bits in buffer, leave the buffer unmapped and return. Fix is suggested by Mark Fasheh, and I code up the patch. Signed-off-by: Coly Li Signed-off-by: Mark Fasheh --- fs/ocfs2/aops.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 17964c0505a9..1db080135c6d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -174,10 +174,17 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, * need to use BH_New is when we're extending i_size on a file * system which doesn't support holes, in which case BH_New * allows block_prepare_write() to zero. + * + * If we see this on a sparse file system, then a truncate has + * raced us and removed the cluster. In this case, we clear + * the buffers dirty and uptodate bits and let the buffer code + * ignore it as a hole. */ - mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb), - "ino %lu, iblock %llu\n", inode->i_ino, - (unsigned long long)iblock); + if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) { + clear_buffer_dirty(bh_result); + clear_buffer_uptodate(bh_result); + goto bail; + } /* Treat the unwritten extent as a hole for zeroing purposes. */ if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) -- cgit v1.2.3