From bc09d141ebb26c0c1ab713c8597ca833be9afee4 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 10 Dec 2014 15:41:15 -0800 Subject: fs/cifs: remove obsolete __constant Replace all __constant_foo to foo() except in smb2status.h (1700 lines to update). Signed-off-by: Fabian Frederick Cc: Steve French Cc: Jeff Layton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/cifs/cifsacl.c | 2 +- fs/cifs/cifssmb.c | 20 ++++++++++---------- fs/cifs/sess.c | 2 +- fs/cifs/smb2misc.c | 38 +++++++++++++++++++------------------- fs/cifs/smb2ops.c | 2 +- fs/cifs/smb2pdu.c | 2 +- fs/cifs/smb2pdu.h | 28 ++++++++++++++-------------- 7 files changed, 47 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 6d00c419cbae..1ea780bc6376 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -38,7 +38,7 @@ static const struct cifs_sid sid_everyone = { 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; /* security id for Authenticated Users system group */ static const struct cifs_sid sid_authusers = { - 1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11)} }; + 1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} }; /* group users */ static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 61d00a6e398f..fa13d5e79f64 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2477,14 +2477,14 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, } parm_data = (struct cifs_posix_lock *) ((char *)&pSMBr->hdr.Protocol + data_offset); - if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK)) + if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) pLockData->fl_type = F_UNLCK; else { if (parm_data->lock_type == - __constant_cpu_to_le16(CIFS_RDLCK)) + cpu_to_le16(CIFS_RDLCK)) pLockData->fl_type = F_RDLCK; else if (parm_data->lock_type == - __constant_cpu_to_le16(CIFS_WRLCK)) + cpu_to_le16(CIFS_WRLCK)) pLockData->fl_type = F_WRLCK; pLockData->fl_start = le64_to_cpu(parm_data->start); @@ -3276,25 +3276,25 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); pSMB->TotalParameterCount = 0; - pSMB->TotalDataCount = __constant_cpu_to_le32(2); + pSMB->TotalDataCount = cpu_to_le32(2); pSMB->MaxParameterCount = 0; pSMB->MaxDataCount = 0; pSMB->MaxSetupCount = 4; pSMB->Reserved = 0; pSMB->ParameterOffset = 0; - pSMB->DataCount = __constant_cpu_to_le32(2); + pSMB->DataCount = cpu_to_le32(2); pSMB->DataOffset = cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req, compression_state) - 4); /* 84 */ pSMB->SetupCount = 4; - pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL); + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); pSMB->ParameterCount = 0; - pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION); + pSMB->FunctionCode = cpu_to_le32(FSCTL_SET_COMPRESSION); pSMB->IsFsctl = 1; /* FSCTL */ pSMB->IsRootFlag = 0; pSMB->Fid = fid; /* file handle always le */ /* 3 byte pad, followed by 2 byte compress state */ - pSMB->ByteCount = __constant_cpu_to_le16(5); + pSMB->ByteCount = cpu_to_le16(5); inc_rfc1001_len(pSMB, 5); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -3430,10 +3430,10 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, cifs_acl->version = cpu_to_le16(1); if (acl_type == ACL_TYPE_ACCESS) { cifs_acl->access_entry_count = cpu_to_le16(count); - cifs_acl->default_entry_count = __constant_cpu_to_le16(0xFFFF); + cifs_acl->default_entry_count = cpu_to_le16(0xFFFF); } else if (acl_type == ACL_TYPE_DEFAULT) { cifs_acl->default_entry_count = cpu_to_le16(count); - cifs_acl->access_entry_count = __constant_cpu_to_le16(0xFFFF); + cifs_acl->access_entry_count = cpu_to_le16(0xFFFF); } else { cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); return 0; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 57db63ff88da..ab0fda6e8e58 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -46,7 +46,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, USHRT_MAX)); pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); - pSMB->req.VcNumber = __constant_cpu_to_le16(1); + pSMB->req.VcNumber = cpu_to_le16(1); /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 1a08a34838fc..f1cefc9763ed 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -67,27 +67,27 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) * indexed by command in host byte order */ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { - /* SMB2_NEGOTIATE */ __constant_cpu_to_le16(65), - /* SMB2_SESSION_SETUP */ __constant_cpu_to_le16(9), - /* SMB2_LOGOFF */ __constant_cpu_to_le16(4), - /* SMB2_TREE_CONNECT */ __constant_cpu_to_le16(16), - /* SMB2_TREE_DISCONNECT */ __constant_cpu_to_le16(4), - /* SMB2_CREATE */ __constant_cpu_to_le16(89), - /* SMB2_CLOSE */ __constant_cpu_to_le16(60), - /* SMB2_FLUSH */ __constant_cpu_to_le16(4), - /* SMB2_READ */ __constant_cpu_to_le16(17), - /* SMB2_WRITE */ __constant_cpu_to_le16(17), - /* SMB2_LOCK */ __constant_cpu_to_le16(4), - /* SMB2_IOCTL */ __constant_cpu_to_le16(49), + /* SMB2_NEGOTIATE */ cpu_to_le16(65), + /* SMB2_SESSION_SETUP */ cpu_to_le16(9), + /* SMB2_LOGOFF */ cpu_to_le16(4), + /* SMB2_TREE_CONNECT */ cpu_to_le16(16), + /* SMB2_TREE_DISCONNECT */ cpu_to_le16(4), + /* SMB2_CREATE */ cpu_to_le16(89), + /* SMB2_CLOSE */ cpu_to_le16(60), + /* SMB2_FLUSH */ cpu_to_le16(4), + /* SMB2_READ */ cpu_to_le16(17), + /* SMB2_WRITE */ cpu_to_le16(17), + /* SMB2_LOCK */ cpu_to_le16(4), + /* SMB2_IOCTL */ cpu_to_le16(49), /* BB CHECK this ... not listed in documentation */ - /* SMB2_CANCEL */ __constant_cpu_to_le16(0), - /* SMB2_ECHO */ __constant_cpu_to_le16(4), - /* SMB2_QUERY_DIRECTORY */ __constant_cpu_to_le16(9), - /* SMB2_CHANGE_NOTIFY */ __constant_cpu_to_le16(9), - /* SMB2_QUERY_INFO */ __constant_cpu_to_le16(9), - /* SMB2_SET_INFO */ __constant_cpu_to_le16(2), + /* SMB2_CANCEL */ cpu_to_le16(0), + /* SMB2_ECHO */ cpu_to_le16(4), + /* SMB2_QUERY_DIRECTORY */ cpu_to_le16(9), + /* SMB2_CHANGE_NOTIFY */ cpu_to_le16(9), + /* SMB2_QUERY_INFO */ cpu_to_le16(9), + /* SMB2_SET_INFO */ cpu_to_le16(2), /* BB FIXME can also be 44 for lease break */ - /* SMB2_OPLOCK_BREAK */ __constant_cpu_to_le16(24) + /* SMB2_OPLOCK_BREAK */ cpu_to_le16(24) }; int diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index c5f521bcdee2..7ecac610b44b 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -600,7 +600,7 @@ smb2_clone_range(const unsigned int xid, goto cchunk_out; /* For now array only one chunk long, will make more flexible later */ - pcchunk->ChunkCount = __constant_cpu_to_le32(1); + pcchunk->ChunkCount = cpu_to_le32(1); pcchunk->Reserved = 0; pcchunk->Reserved2 = 0; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 8f1672bb82d5..08f73e168b0b 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1359,7 +1359,7 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, char *ret_data = NULL; fsctl_input.CompressionState = - __constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); + cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SET_COMPRESSION, true /* is_fsctl */, diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index e3188abdafd0..fbd4df1e1683 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -85,7 +85,7 @@ /* BB FIXME - analyze following length BB */ #define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ -#define SMB2_PROTO_NUMBER __constant_cpu_to_le32(0x424d53fe) +#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) /* * SMB2 Header Definition @@ -96,7 +96,7 @@ * */ -#define SMB2_HEADER_STRUCTURE_SIZE __constant_cpu_to_le16(64) +#define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64) struct smb2_hdr { __be32 smb2_buf_length; /* big endian on wire */ @@ -137,16 +137,16 @@ struct smb2_transform_hdr { } __packed; /* Encryption Algorithms */ -#define SMB2_ENCRYPTION_AES128_CCM __constant_cpu_to_le16(0x0001) +#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) /* * SMB2 flag definitions */ -#define SMB2_FLAGS_SERVER_TO_REDIR __constant_cpu_to_le32(0x00000001) -#define SMB2_FLAGS_ASYNC_COMMAND __constant_cpu_to_le32(0x00000002) -#define SMB2_FLAGS_RELATED_OPERATIONS __constant_cpu_to_le32(0x00000004) -#define SMB2_FLAGS_SIGNED __constant_cpu_to_le32(0x00000008) -#define SMB2_FLAGS_DFS_OPERATIONS __constant_cpu_to_le32(0x10000000) +#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001) +#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002) +#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004) +#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008) +#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000) /* * Definitions for SMB2 Protocol Data Units (network frames) @@ -157,7 +157,7 @@ struct smb2_transform_hdr { * */ -#define SMB2_ERROR_STRUCTURE_SIZE2 __constant_cpu_to_le16(9) +#define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9) struct smb2_err_rsp { struct smb2_hdr hdr; @@ -502,12 +502,12 @@ struct create_context { #define SMB2_LEASE_HANDLE_CACHING_HE 0x02 #define SMB2_LEASE_WRITE_CACHING_HE 0x04 -#define SMB2_LEASE_NONE __constant_cpu_to_le32(0x00) -#define SMB2_LEASE_READ_CACHING __constant_cpu_to_le32(0x01) -#define SMB2_LEASE_HANDLE_CACHING __constant_cpu_to_le32(0x02) -#define SMB2_LEASE_WRITE_CACHING __constant_cpu_to_le32(0x04) +#define SMB2_LEASE_NONE cpu_to_le32(0x00) +#define SMB2_LEASE_READ_CACHING cpu_to_le32(0x01) +#define SMB2_LEASE_HANDLE_CACHING cpu_to_le32(0x02) +#define SMB2_LEASE_WRITE_CACHING cpu_to_le32(0x04) -#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS __constant_cpu_to_le32(0x02) +#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x02) #define SMB2_LEASE_KEY_SIZE 16 -- cgit v1.2.3 From 4b99d39b1b2890a178d840f7a068305c7890faee Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 10 Dec 2014 15:41:17 -0800 Subject: fs/cifs/file.c: replace count*size kzalloc by kcalloc kcalloc manages count*sizeof overflow. Signed-off-by: Fabian Frederick Cc: Steve French Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/cifs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 3e4d00a06c44..b123a64d7be4 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1066,7 +1066,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); - buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) { free_xid(xid); return -ENOMEM; @@ -1401,7 +1401,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); - buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) return -ENOMEM; -- cgit v1.2.3 From 662e9b2b98a0b8e172c392f3d3437d354a6c4067 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 10 Dec 2014 15:41:20 -0800 Subject: fs/cifs/smb2file.c: replace count*size kzalloc by kcalloc kcalloc manages count*sizeof overflow. Signed-off-by: Fabian Frederick Cc: Steve French Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/cifs/smb2file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 45992944e238..7198eac5dddd 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -111,7 +111,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, return -EINVAL; max_num = max_buf / sizeof(struct smb2_lock_element); - buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); if (!buf) return -ENOMEM; @@ -247,7 +247,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) } max_num = max_buf / sizeof(struct smb2_lock_element); - buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); if (!buf) { free_xid(xid); return -ENOMEM; -- cgit v1.2.3 From f08736bd6c54c33b91d4324eee5713ca6f13319a Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 10 Dec 2014 15:41:34 -0800 Subject: ocfs2/dlm: let sender retry if dlm_dispatch_assert_master failed with -ENOMEM Do not BUG() if GFP_ATOMIC allocation fails in dlm_dispatch_assert_master. Instead, return -ENOMEM to the sender and then retry. Signed-off-by: Joseph Qi Reviewed-by: Alex Chen Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 3365839d2971..79b5af5e6a7b 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1656,14 +1656,18 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, req.namelen = res->lockname.len; memcpy(req.name, res->lockname.name, res->lockname.len); +resend: ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key, &req, sizeof(req), nodenum, &status); - /* XXX: negative status not handled properly here. */ if (ret < 0) mlog(ML_ERROR, "Error %d when sending message %u (key " "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG, dlm->key, nodenum); - else { + else if (status == -ENOMEM) { + mlog_errno(status); + msleep(50); + goto resend; + } else { BUG_ON(status < 0); BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); *real_master = (u8) (status & 0xff); @@ -1705,9 +1709,13 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, int ret = dlm_dispatch_assert_master(dlm, res, 0, 0, flags); if (ret < 0) { - mlog_errno(-ENOMEM); - /* retry!? */ - BUG(); + mlog_errno(ret); + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + spin_unlock(&dlm->spinlock); + dlm_put(dlm); + /* sender will take care of this and retry */ + return ret; } else __dlm_lockres_grab_inflight_worker(dlm, res); spin_unlock(&res->spinlock); -- cgit v1.2.3 From 519a286175b6422e13694ebb73aefee0b5749443 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 10 Dec 2014 15:41:37 -0800 Subject: ocfs2: fix an off-by-one BUG_ON() statement The ->si_slots[] array is allocated in ocfs2_init_slot_info() it has "->max_slots" number of elements so this test should be >= instead of >. Static checker work. Compile tested only. Signed-off-by: Dan Carpenter Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/slot_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index a88b2a4fcc85..d5493e361a38 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -306,7 +306,7 @@ int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, assert_spin_locked(&osb->osb_lock); BUG_ON(slot_num < 0); - BUG_ON(slot_num > osb->max_slots); + BUG_ON(slot_num >= osb->max_slots); if (!si->si_slots[slot_num].sl_valid) return -ENOENT; -- cgit v1.2.3 From 2b693005b833ba309c7cb8426cd023d8e80da31e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 10 Dec 2014 15:41:40 -0800 Subject: ocfs2: Fix xattr check in ocfs2_get_xattr_nolock() ocfs2_get_xattr_nolock() checks whether inode has any extended attributes (OCFS2_HAS_XATTR_FL). If not, it just sets 'ret' to -ENODATA but continues with checking inline and external attributes anyway (which is pointless although it does not harm). Just return immediately when we know there are no extended attributes in the inode. Coverity id: 1226906. Signed-off-by: Jan Kara Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 016f01df3825..662f8dee149f 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1284,7 +1284,7 @@ int ocfs2_xattr_get_nolock(struct inode *inode, return -EOPNOTSUPP; if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) - ret = -ENODATA; + return -ENODATA; xis.inode_bh = xbs.inode_bh = di_bh; di = (struct ocfs2_dinode *)di_bh->b_data; -- cgit v1.2.3 From 4a635a113bfe1eaa184410d6b6065cd8eee13607 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 10 Dec 2014 15:41:42 -0800 Subject: ocfs2: remove bogus test from ocfs2_read_locked_inode() 'args' are always set for ocfs2_read_locked_inode() and brelse() checks whether bh is NULL. So the test (args && bh) is unnecessary (plus the args part is really confusing anyway). Remove it. Coverity id: 1128856. Signed-off-by: Jan Kara Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 437de7f768c6..c8b25de9efbb 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -540,8 +540,7 @@ bail: if (status < 0) make_bad_inode(inode); - if (args && bh) - brelse(bh); + brelse(bh); return status; } -- cgit v1.2.3 From f5425fcea72eec084c50f36db4d25f697227c602 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 10 Dec 2014 15:41:45 -0800 Subject: ocfs2: report error from o2hb_do_disk_heartbeat() to user Report return value of o2hb_do_disk_heartbeat() as a part of ML_HEARTBEAT message so that we know whether a heartbeat actually happened or not. This also makes assigned but otherwise unused 'ret' variable used. Coverity id: 1227053. Signed-off-by: Jan Kara Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index eb9d48746ab4..16eff45727ee 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1127,10 +1127,10 @@ static int o2hb_thread(void *data) elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); mlog(ML_HEARTBEAT, - "start = %lu.%lu, end = %lu.%lu, msec = %u\n", + "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n", before_hb.tv_sec, (unsigned long) before_hb.tv_usec, after_hb.tv_sec, (unsigned long) after_hb.tv_usec, - elapsed_msec); + elapsed_msec, ret); if (!kthread_should_stop() && elapsed_msec < reg->hr_timeout_ms) { -- cgit v1.2.3 From cb79662bc2f83f7b3b60970ad88df43085f96514 Mon Sep 17 00:00:00 2001 From: Srinivas Eeda Date: Wed, 10 Dec 2014 15:41:48 -0800 Subject: ocfs2: o2dlm: fix a race between purge and master query Node A sends master query request to node B which is the master. At this time lockres happens to be on purgelist. dlm_master_request_handler gets the dlm spinlock, finds the resource and releases the dlm spin lock. Right at this dlm_thread on this node could purge the lockres. dlm_master_request_handler can then acquire lockres spinlock and reply to Node A that node B is the master even though lockres on node B is purged. The above scenario will now make node A falsely think node B is the master which is inconsistent. Further if another node C tries to master the same resource, every node will respond they are not the master. Node C then masters the resource and sends assert master to all nodes. This will now make node A crash with the following message. dlm_assert_master_handler:1831 ERROR: DIE! Mastery assert from 9, but current owner is 10! Signed-off-by: Srinivas Eeda Cc: Mark Fasheh Cc: Joel Becker Reviewed-by: Wengang Wang Tested-by: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmmaster.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 215e41abf101..3689b3592042 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1460,6 +1460,18 @@ way_up_top: /* take care of the easy cases up front */ spin_lock(&res->spinlock); + + /* + * Right after dlm spinlock was released, dlm_thread could have + * purged the lockres. Check if lockres got unhashed. If so + * start over. + */ + if (hlist_unhashed(&res->hash_node)) { + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + goto way_up_top; + } + if (res->state & (DLM_LOCK_RES_RECOVERING| DLM_LOCK_RES_MIGRATING)) { spin_unlock(&res->spinlock); -- cgit v1.2.3 From 196fe71d643be8285749726e88c1508536fe03b8 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Wed, 10 Dec 2014 15:41:51 -0800 Subject: ocfs2: o2net: fix connect expired Set nn_persistent_error to -ENOTCONN will stop reconnect since the "stop" condition in o2net_start_connect() will be true. stop = (nn->nn_sc || (nn->nn_persistent_error && (nn->nn_persistent_error != -ENOTCONN || timeout == 0))); This will make connection never be established if the first connection request is lost. Set nn_persistent_error to 0 when connect expired to fix this. With this changes, dlm will not be waken up when connect expired, this is OK since dlm depends on network, dlm can do nothing in this case if waken up. Let it wait there for network recover and connect built again to continue. Signed-off-by: Junxiao Bi Reviewed-by: Srinivas Eeda Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index a96044004064..2e355e0f8335 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1736,7 +1736,7 @@ static void o2net_connect_expired(struct work_struct *work) o2net_idle_timeout() / 1000, o2net_idle_timeout() % 1000); - o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); + o2net_set_nn_state(nn, NULL, 0, 0); } spin_unlock(&nn->nn_lock); } -- cgit v1.2.3 From 86b9c6f3f891019b26f8e5bb11a6faa96bba54a8 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 10 Dec 2014 15:41:53 -0800 Subject: ocfs2: remove filesize checks for sync I/O journal commit Filesize is not a good indication that the file needs to be synced. An example where this breaks is: 1. Open the file in O_SYNC|O_RDWR 2. Read a small portion of the file (say 64 bytes) 3. Lseek to starting of the file 4. Write 64 bytes If the node crashes, it is not written out to disk because this was not committed in the journal and the other node which reads the file after recovery reads stale data (even if the write on the other node was successful) Signed-off-by: Goldwyn Rodrigues Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 324dc93ac896..69fb9f75b082 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2381,9 +2381,7 @@ out_dio: if (ret < 0) written = ret; - if (!ret && ((old_size != i_size_read(inode)) || - (old_clusters != OCFS2_I(inode)->ip_clusters) || - has_refcount)) { + if (!ret) { ret = jbd2_journal_force_commit(osb->journal->j_journal); if (ret < 0) written = ret; -- cgit v1.2.3 From dc17158060fb57f49c310a7def12fdaeddf35b19 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 10 Dec 2014 15:41:56 -0800 Subject: ocfs2: fix error handling when creating debugfs root in ocfs2_init() Error handling if creation of root of debugfs in ocfs2_init() fails is broken. Although error code is set we fail to exit ocfs2_init() with error and thus initialization ends with success. Later when mounting a filesystem, ocfs2 debugfs entries end up being created in the root of debugfs filesystem which is confusing. Fix the error handling to bail out. Coverity id: 1227009. Signed-off-by: Jan Kara Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 93c85bc745e1..8e3ac25efb9f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1622,8 +1622,9 @@ static int __init ocfs2_init(void) ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); if (!ocfs2_debugfs_root) { - status = -EFAULT; + status = -ENOMEM; mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); + goto out4; } ocfs2_set_locking_protocol(); -- cgit v1.2.3 From d1e78238741491d24177d99453091439c90ac70e Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Wed, 10 Dec 2014 15:41:59 -0800 Subject: ocfs2: do not set OCFS2_LOCK_UPCONVERT_FINISHING if nonblocking lock can not be granted at once ocfs2_readpages() use nonblocking flag to avoid page lock inversion. It will trigger cluster hang because that flag OCFS2_LOCK_UPCONVERT_FINISHING is not cleared if nonblocking lock cannot be granted at once. The flag would prevent dc thread from downconverting. So other nodes cannot acheive this lockres for ever. So we should not set OCFS2_LOCK_UPCONVERT_FINISHING when receiving ast if nonblocking lock had already returned. Signed-off-by: joyce.xue Reviewed-by: Junxiao Bi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 37 +++++++++++++++++++++++++++++++------ fs/ocfs2/ocfs2.h | 6 ++++++ 2 files changed, 37 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 21262f2b1654..953f4eb75fa7 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -861,8 +861,13 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing * the OCFS2_LOCK_BUSY flag to prevent the dc thread from * downconverting the lock before the upconvert has fully completed. + * Do not prevent the dc thread from downconverting if NONBLOCK lock + * had already returned. */ - lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + if (!(lockres->l_flags & OCFS2_LOCK_NONBLOCK_FINISHED)) + lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + else + lockres_clear_flags(lockres, OCFS2_LOCK_NONBLOCK_FINISHED); lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); } @@ -1324,13 +1329,12 @@ static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres, /* returns 0 if the mw that was removed was already satisfied, -EBUSY * if the mask still hadn't reached its goal */ -static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, +static int __lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, struct ocfs2_mask_waiter *mw) { - unsigned long flags; int ret = 0; - spin_lock_irqsave(&lockres->l_lock, flags); + assert_spin_locked(&lockres->l_lock); if (!list_empty(&mw->mw_item)) { if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) ret = -EBUSY; @@ -1338,6 +1342,18 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, list_del_init(&mw->mw_item); init_completion(&mw->mw_complete); } + + return ret; +} + +static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, + struct ocfs2_mask_waiter *mw) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&lockres->l_lock, flags); + ret = __lockres_remove_mask_waiter(lockres, mw); spin_unlock_irqrestore(&lockres->l_lock, flags); return ret; @@ -1373,6 +1389,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb, unsigned long flags; unsigned int gen; int noqueue_attempted = 0; + int dlm_locked = 0; ocfs2_init_mask_waiter(&mw); @@ -1481,6 +1498,7 @@ again: ocfs2_recover_from_dlm_error(lockres, 1); goto out; } + dlm_locked = 1; mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n", lockres->l_name); @@ -1514,10 +1532,17 @@ out: if (wait && arg_flags & OCFS2_LOCK_NONBLOCK && mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) { wait = 0; - if (lockres_remove_mask_waiter(lockres, &mw)) + spin_lock_irqsave(&lockres->l_lock, flags); + if (__lockres_remove_mask_waiter(lockres, &mw)) { + if (dlm_locked) + lockres_or_flags(lockres, + OCFS2_LOCK_NONBLOCK_FINISHED); + spin_unlock_irqrestore(&lockres->l_lock, flags); ret = -EAGAIN; - else + } else { + spin_unlock_irqrestore(&lockres->l_lock, flags); goto again; + } } if (wait) { ret = ocfs2_wait_for_mask(&mw); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index bbec539230fd..7d6b7d090452 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -144,6 +144,12 @@ enum ocfs2_unlock_action { * before the upconvert * has completed */ +#define OCFS2_LOCK_NONBLOCK_FINISHED (0x00001000) /* NONBLOCK cluster + * lock has already + * returned, do not block + * dc thread from + * downconverting */ + struct ocfs2_lock_res_ops; typedef void (*ocfs2_lock_callback)(int status, unsigned long data); -- cgit v1.2.3 From 61fb9ea4b3e1f22341f7d817db18884062b08601 Mon Sep 17 00:00:00 2001 From: jiangyiwen Date: Wed, 10 Dec 2014 15:42:02 -0800 Subject: ocfs2: do not set filesystem readonly if link down Do not set the filesystem readonly if the storage link is down. In this case, metadata is not corrupted and only -EIO is returned. And if it is indeed corrupted metadata, it has already called ocfs2_error() in ocfs2_validate_inode_block(). Signed-off-by: Yiwen Jiang Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 2 +- fs/ocfs2/dir.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 1ef547e49373..d9f222987f24 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1251,7 +1251,7 @@ static int ocfs2_write_cluster(struct address_space *mapping, ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, NULL); if (ret < 0) { - ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, " + mlog(ML_ERROR, "Get physical blkno failed for inode %llu, " "at logical block %llu", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)v_blkno); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 0717662b4aef..e6dc0288dbb9 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -744,7 +744,7 @@ restart: if (ocfs2_read_dir_block(dir, block, &bh, 0)) { /* read error, skip block & hope for the best. * ocfs2_read_dir_block() has released the bh. */ - ocfs2_error(dir->i_sb, "reading directory %llu, " + mlog(ML_ERROR, "reading directory %llu, " "offset %lu\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, block); -- cgit v1.2.3 From 88d69b92fc12084af43cba428bda9727c0ca084f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 10 Dec 2014 15:42:04 -0800 Subject: ocfs2: remove bogus NULL check in ocfs2_move_extents() "inode" isn't NULL here, and also we dereference it on the previous line so static checkers get annoyed. Signed-off-by: Dan Carpenter Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/move_extents.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 74caffeeee1d..56a768d06aa6 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -904,9 +904,6 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) struct buffer_head *di_bh = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if (!inode) - return -ENOENT; - if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; -- cgit v1.2.3 From b3e3e5af60e0fda182e978f568cc512568c024cf Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 10 Dec 2014 15:42:07 -0800 Subject: ocfs2: remove unneeded NULL check In commit 1faf289454b9 ("ocfs2_dlm: disallow a domain join if node maps mismatch") we introduced a new earlier NULL check so this one is not needed. Also static checkers complain because we dereference it first and then check for NULL. Signed-off-by: Dan Carpenter Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 02d315fef432..50a59d2337b2 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -877,7 +877,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, * to be put in someone's domain map. * Also, explicitly disallow joining at certain troublesome * times (ie. during recovery). */ - if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { + if (dlm->dlm_state != DLM_CTXT_LEAVING) { int bit = query->node_idx; spin_lock(&dlm->spinlock); -- cgit v1.2.3 From e2ab879e96b5e65bf8ce1123f3b7f01ebba27204 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 10 Dec 2014 15:42:10 -0800 Subject: fs/char_dev.c: remove pointless assignment from __register_chrdev_region() At one place we assign major number we found to ret. That assignment is then never used and actually doesn't make any sense given how the code is currently structured (the assignment comes from pre-git times). Just remove it. Coverity id: 1226852. Signed-off-by: Jan Kara Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/char_dev.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/char_dev.c b/fs/char_dev.c index f77f7702fabe..67b2007f10fe 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -117,7 +117,6 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, goto out; } major = i; - ret = major; } cd->major = major; -- cgit v1.2.3 From c164e038eee805147e95789dddb88ae3b3aca11c Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 10 Dec 2014 15:44:36 -0800 Subject: mm: fix huge zero page accounting in smaps report As a small zero page, huge zero page should not be accounted in smaps report as normal page. For small pages we rely on vm_normal_page() to filter out zero page, but vm_normal_page() is not designed to handle pmds. We only get here due hackish cast pmd to pte in smaps_pte_range() -- pte and pmd format is not necessary compatible on each and every architecture. Let's add separate codepath to handle pmds. follow_trans_huge_pmd() will detect huge zero page for us. We would need pmd_dirty() helper to do this properly. The patch adds it to THP-enabled architectures which don't yet have one. [akpm@linux-foundation.org: use do_div to fix 32-bit build] Signed-off-by: "Kirill A. Shutemov" Reported-by: Fengguang Wu Tested-by: Fengwei Yin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/pgtable.h | 1 + arch/powerpc/include/asm/pgtable-ppc64.h | 1 + arch/sparc/include/asm/pgtable_64.h | 7 +++ arch/x86/include/asm/pgtable.h | 5 ++ fs/proc/task_mmu.c | 104 ++++++++++++++++++++----------- 5 files changed, 82 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 41a43bf26492..df22314f57cf 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -279,6 +279,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) #define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd))) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index ae153c40ab7c..9b4b1904efae 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -467,6 +467,7 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd) } #define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd)) +#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) #define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index bfeb626085ac..1ff9e7864168 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -667,6 +667,13 @@ static inline unsigned long pmd_pfn(pmd_t pmd) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline unsigned long pmd_dirty(pmd_t pmd) +{ + pte_t pte = __pte(pmd_val(pmd)); + + return pte_dirty(pte); +} + static inline unsigned long pmd_young(pmd_t pmd) { pte_t pte = __pte(pmd_val(pmd)); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index aa97a070f09f..081d6f45e006 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -99,6 +99,11 @@ static inline int pte_young(pte_t pte) return pte_flags(pte) & _PAGE_ACCESSED; } +static inline int pmd_dirty(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_DIRTY; +} + static inline int pmd_young(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_ACCESSED; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f6734c6b66a6..246eae84b13b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -447,58 +447,91 @@ struct mem_size_stats { u64 pss; }; +static void smaps_account(struct mem_size_stats *mss, struct page *page, + unsigned long size, bool young, bool dirty) +{ + int mapcount; + + if (PageAnon(page)) + mss->anonymous += size; -static void smaps_pte_entry(pte_t ptent, unsigned long addr, - unsigned long ptent_size, struct mm_walk *walk) + mss->resident += size; + /* Accumulate the size in pages that have been accessed. */ + if (young || PageReferenced(page)) + mss->referenced += size; + mapcount = page_mapcount(page); + if (mapcount >= 2) { + u64 pss_delta; + + if (dirty || PageDirty(page)) + mss->shared_dirty += size; + else + mss->shared_clean += size; + pss_delta = (u64)size << PSS_SHIFT; + do_div(pss_delta, mapcount); + mss->pss += pss_delta; + } else { + if (dirty || PageDirty(page)) + mss->private_dirty += size; + else + mss->private_clean += size; + mss->pss += (u64)size << PSS_SHIFT; + } +} + +static void smaps_pte_entry(pte_t *pte, unsigned long addr, + struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = mss->vma; pgoff_t pgoff = linear_page_index(vma, addr); struct page *page = NULL; - int mapcount; - if (pte_present(ptent)) { - page = vm_normal_page(vma, addr, ptent); - } else if (is_swap_pte(ptent)) { - swp_entry_t swpent = pte_to_swp_entry(ptent); + if (pte_present(*pte)) { + page = vm_normal_page(vma, addr, *pte); + } else if (is_swap_pte(*pte)) { + swp_entry_t swpent = pte_to_swp_entry(*pte); if (!non_swap_entry(swpent)) - mss->swap += ptent_size; + mss->swap += PAGE_SIZE; else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); - } else if (pte_file(ptent)) { - if (pte_to_pgoff(ptent) != pgoff) - mss->nonlinear += ptent_size; + } else if (pte_file(*pte)) { + if (pte_to_pgoff(*pte) != pgoff) + mss->nonlinear += PAGE_SIZE; } if (!page) return; - if (PageAnon(page)) - mss->anonymous += ptent_size; - if (page->index != pgoff) - mss->nonlinear += ptent_size; + mss->nonlinear += PAGE_SIZE; - mss->resident += ptent_size; - /* Accumulate the size in pages that have been accessed. */ - if (pte_young(ptent) || PageReferenced(page)) - mss->referenced += ptent_size; - mapcount = page_mapcount(page); - if (mapcount >= 2) { - if (pte_dirty(ptent) || PageDirty(page)) - mss->shared_dirty += ptent_size; - else - mss->shared_clean += ptent_size; - mss->pss += (ptent_size << PSS_SHIFT) / mapcount; - } else { - if (pte_dirty(ptent) || PageDirty(page)) - mss->private_dirty += ptent_size; - else - mss->private_clean += ptent_size; - mss->pss += (ptent_size << PSS_SHIFT); - } + smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte)); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = mss->vma; + struct page *page; + + /* FOLL_DUMP will return -EFAULT on huge zero page */ + page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + if (IS_ERR_OR_NULL(page)) + return; + mss->anonymous_thp += HPAGE_PMD_SIZE; + smaps_account(mss, page, HPAGE_PMD_SIZE, + pmd_young(*pmd), pmd_dirty(*pmd)); } +#else +static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, + struct mm_walk *walk) +{ +} +#endif static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -509,9 +542,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, spinlock_t *ptl; if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { - smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); + smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); - mss->anonymous_thp += HPAGE_PMD_SIZE; return 0; } @@ -524,7 +556,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, */ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) - smaps_pte_entry(*pte, addr, PAGE_SIZE, walk); + smaps_pte_entry(pte, addr, walk); pte_unmap_unlock(pte - 1, ptl); cond_resched(); return 0; -- cgit v1.2.3 From 710585d4922fd315f2cada8fbe550ae8ed23e994 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 10 Dec 2014 15:45:01 -0800 Subject: fs/proc: use a rb tree for the directory entries When a lot of netdevices are created, one of the bottleneck is the creation of proc entries. This serie aims to accelerate this part. The current implementation for the directories in /proc is using a single linked list. This is slow when handling directories with large numbers of entries (eg netdevice-related entries when lots of tunnels are opened). This patch replaces this linked list by a red-black tree. Here are some numbers: dummy30000.batch contains 30 000 times 'link add type dummy'. Before the patch: $ time ip -b dummy30000.batch real 2m31.950s user 0m0.440s sys 2m21.440s $ time rmmod dummy real 1m35.764s user 0m0.000s sys 1m24.088s After the patch: $ time ip -b dummy30000.batch real 2m0.874s user 0m0.448s sys 1m49.720s $ time rmmod dummy real 1m13.988s user 0m0.000s sys 1m1.008s The idea of improving this part was suggested by Thierry Herbelot. [akpm@linux-foundation.org: initialise proc_root.subdir at compile time] Signed-off-by: Nicolas Dichtel Acked-by: David S. Miller Cc: Thierry Herbelot . Acked-by: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 164 ++++++++++++++++++++++++++++++++++------------------- fs/proc/internal.h | 11 ++-- fs/proc/proc_net.c | 1 + fs/proc/root.c | 1 + 4 files changed, 113 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 317b72641ebf..9f8fa1e5e8aa 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -31,9 +31,81 @@ static DEFINE_SPINLOCK(proc_subdir_lock); static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) { - if (de->namelen != len) - return 0; - return !memcmp(name, de->name, len); + if (len < de->namelen) + return -1; + if (len > de->namelen) + return 1; + + return memcmp(name, de->name, len); +} + +static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir) +{ + struct rb_node *node = rb_first(&dir->subdir); + + if (node == NULL) + return NULL; + + return rb_entry(node, struct proc_dir_entry, subdir_node); +} + +static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir) +{ + struct rb_node *node = rb_next(&dir->subdir_node); + + if (node == NULL) + return NULL; + + return rb_entry(node, struct proc_dir_entry, subdir_node); +} + +static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, + const char *name, + unsigned int len) +{ + struct rb_node *node = dir->subdir.rb_node; + + while (node) { + struct proc_dir_entry *de = container_of(node, + struct proc_dir_entry, + subdir_node); + int result = proc_match(len, name, de); + + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return de; + } + return NULL; +} + +static bool pde_subdir_insert(struct proc_dir_entry *dir, + struct proc_dir_entry *de) +{ + struct rb_root *root = &dir->subdir; + struct rb_node **new = &root->rb_node, *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct proc_dir_entry *this = + container_of(*new, struct proc_dir_entry, subdir_node); + int result = proc_match(de->namelen, de->name, this); + + parent = *new; + if (result < 0) + new = &(*new)->rb_left; + else if (result > 0) + new = &(*new)->rb_right; + else + return false; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&de->subdir_node, parent, new); + rb_insert_color(&de->subdir_node, root); + return true; } static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) @@ -92,10 +164,7 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, break; len = next - cp; - for (de = de->subdir; de ; de = de->next) { - if (proc_match(len, cp, de)) - break; - } + de = pde_subdir_find(de, cp, len); if (!de) { WARN(1, "name '%s'\n", name); return -ENOENT; @@ -183,19 +252,16 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, struct inode *inode; spin_lock(&proc_subdir_lock); - for (de = de->subdir; de ; de = de->next) { - if (de->namelen != dentry->d_name.len) - continue; - if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { - pde_get(de); - spin_unlock(&proc_subdir_lock); - inode = proc_get_inode(dir->i_sb, de); - if (!inode) - return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, &simple_dentry_operations); - d_add(dentry, inode); - return NULL; - } + de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len); + if (de) { + pde_get(de); + spin_unlock(&proc_subdir_lock); + inode = proc_get_inode(dir->i_sb, de); + if (!inode) + return ERR_PTR(-ENOMEM); + d_set_d_op(dentry, &simple_dentry_operations); + d_add(dentry, inode); + return NULL; } spin_unlock(&proc_subdir_lock); return ERR_PTR(-ENOENT); @@ -225,7 +291,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, return 0; spin_lock(&proc_subdir_lock); - de = de->subdir; + de = pde_subdir_first(de); i = ctx->pos - 2; for (;;) { if (!de) { @@ -234,7 +300,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, } if (!i) break; - de = de->next; + de = pde_subdir_next(de); i--; } @@ -249,7 +315,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, } spin_lock(&proc_subdir_lock); ctx->pos++; - next = de->next; + next = pde_subdir_next(de); pde_put(de); de = next; } while (de); @@ -286,9 +352,8 @@ static const struct inode_operations proc_dir_inode_operations = { static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) { - struct proc_dir_entry *tmp; int ret; - + ret = proc_alloc_inum(&dp->low_ino); if (ret) return ret; @@ -308,17 +373,10 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp } spin_lock(&proc_subdir_lock); - - for (tmp = dir->subdir; tmp; tmp = tmp->next) - if (strcmp(tmp->name, dp->name) == 0) { - WARN(1, "proc_dir_entry '%s/%s' already registered\n", - dir->name, dp->name); - break; - } - - dp->next = dir->subdir; dp->parent = dir; - dir->subdir = dp; + if (pde_subdir_insert(dir, dp) == false) + WARN(1, "proc_dir_entry '%s/%s' already registered\n", + dir->name, dp->name); spin_unlock(&proc_subdir_lock); return 0; @@ -354,6 +412,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, ent->namelen = qstr.len; ent->mode = mode; ent->nlink = nlink; + ent->subdir = RB_ROOT; atomic_set(&ent->count, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); @@ -485,7 +544,6 @@ void pde_put(struct proc_dir_entry *pde) */ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) { - struct proc_dir_entry **p; struct proc_dir_entry *de = NULL; const char *fn = name; unsigned int len; @@ -497,14 +555,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) } len = strlen(fn); - for (p = &parent->subdir; *p; p=&(*p)->next ) { - if (proc_match(len, fn, *p)) { - de = *p; - *p = de->next; - de->next = NULL; - break; - } - } + de = pde_subdir_find(parent, fn, len); + if (de) + rb_erase(&de->subdir_node, &parent->subdir); spin_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); @@ -516,16 +569,15 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) if (S_ISDIR(de->mode)) parent->nlink--; de->nlink = 0; - WARN(de->subdir, "%s: removing non-empty directory " - "'%s/%s', leaking at least '%s'\n", __func__, - de->parent->name, de->name, de->subdir->name); + WARN(pde_subdir_first(de), + "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n", + __func__, de->parent->name, de->name, pde_subdir_first(de)->name); pde_put(de); } EXPORT_SYMBOL(remove_proc_entry); int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) { - struct proc_dir_entry **p; struct proc_dir_entry *root = NULL, *de, *next; const char *fn = name; unsigned int len; @@ -537,24 +589,18 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) } len = strlen(fn); - for (p = &parent->subdir; *p; p=&(*p)->next ) { - if (proc_match(len, fn, *p)) { - root = *p; - *p = root->next; - root->next = NULL; - break; - } - } + root = pde_subdir_find(parent, fn, len); if (!root) { spin_unlock(&proc_subdir_lock); return -ENOENT; } + rb_erase(&root->subdir_node, &parent->subdir); + de = root; while (1) { - next = de->subdir; + next = pde_subdir_first(de); if (next) { - de->subdir = next->next; - next->next = NULL; + rb_erase(&next->subdir_node, &de->subdir); de = next; continue; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index aa7a0ee182e1..7fb1a4869fd0 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -24,10 +24,9 @@ struct mempolicy; * tree) of these proc_dir_entries, so that we can dynamically * add new files to /proc. * - * The "next" pointer creates a linked list of one /proc directory, - * while parent/subdir create the directory structure (every - * /proc file has a parent, but "subdir" is NULL for all - * non-directory entries). + * parent/subdir are used for the directory structure (every /proc file has a + * parent, but "subdir" is empty for all non-directory entries). + * subdir_node is used to build the rb tree "subdir" of the parent. */ struct proc_dir_entry { unsigned int low_ino; @@ -38,7 +37,9 @@ struct proc_dir_entry { loff_t size; const struct inode_operations *proc_iops; const struct file_operations *proc_fops; - struct proc_dir_entry *next, *parent, *subdir; + struct proc_dir_entry *parent; + struct rb_root subdir; + struct rb_node subdir_node; void *data; atomic_t count; /* use count */ atomic_t in_use; /* number of callers into module in progress; */ diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index a63af3e0a612..1bde894bc624 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -192,6 +192,7 @@ static __net_init int proc_net_ns_init(struct net *net) if (!netd) goto out; + netd->subdir = RB_ROOT; netd->data = net; netd->nlink = 2; netd->namelen = 3; diff --git a/fs/proc/root.c b/fs/proc/root.c index 094e44d4a6be..e74ac9f1a2c0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -251,6 +251,7 @@ struct proc_dir_entry proc_root = { .proc_iops = &proc_root_inode_operations, .proc_fops = &proc_root_operations, .parent = &proc_root, + .subdir = RB_ROOT, .name = "/proc", }; -- cgit v1.2.3 From b208d54b75399b276b05f9e70cce8d3a59a42547 Mon Sep 17 00:00:00 2001 From: Debabrata Banerjee Date: Wed, 10 Dec 2014 15:45:04 -0800 Subject: procfs: fix error handling of proc_register() proc_register() error paths are leaking inodes and directory refcounts. Signed-off-by: Debabrata Banerjee Cc: Alexander Viro Acked-by: Nicolas Dichtel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 9f8fa1e5e8aa..be39c6feb3e5 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -369,14 +369,21 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp dp->proc_iops = &proc_file_inode_operations; } else { WARN_ON(1); + proc_free_inum(dp->low_ino); return -EINVAL; } spin_lock(&proc_subdir_lock); dp->parent = dir; - if (pde_subdir_insert(dir, dp) == false) + if (pde_subdir_insert(dir, dp) == false) { WARN(1, "proc_dir_entry '%s/%s' already registered\n", dir->name, dp->name); + spin_unlock(&proc_subdir_lock); + if (S_ISDIR(dp->mode)) + dir->nlink--; + proc_free_inum(dp->low_ino); + return -EEXIST; + } spin_unlock(&proc_subdir_lock); return 0; -- cgit v1.2.3 From 2fc1e948e820bddf8a686c6e2989219b471d7982 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 10 Dec 2014 15:45:07 -0800 Subject: fs/proc.c: use rb_entry_safe() instead of rb_entry() Better to use existing macro that rewriting them. Signed-off-by: Nicolas Dichtel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index be39c6feb3e5..7fea13229f33 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -41,22 +41,14 @@ static int proc_match(unsigned int len, const char *name, struct proc_dir_entry static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir) { - struct rb_node *node = rb_first(&dir->subdir); - - if (node == NULL) - return NULL; - - return rb_entry(node, struct proc_dir_entry, subdir_node); + return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry, + subdir_node); } static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir) { - struct rb_node *node = rb_next(&dir->subdir_node); - - if (node == NULL) - return NULL; - - return rb_entry(node, struct proc_dir_entry, subdir_node); + return rb_entry_safe(rb_next(&dir->subdir_node), struct proc_dir_entry, + subdir_node); } static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, -- cgit v1.2.3 From 4af1036df4dd4f0d59fad9d82ed456bfa2e73fa6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:45:10 -0800 Subject: proc: task_state: read cred->group_info outside of task_lock() task_state() reads cred->group_info under task_lock() because a long ago it was task_struct->group_info and it was actually protected by task->alloc_lock. Today this task_unlock() after rcu_read_unlock() just adds the confusion, move task_unlock() up. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: Alexey Dobriyan Cc: "Eric W. Biederman" , Cc: Sterling Alexander Cc: Peter Zijlstra Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index cd3653e4f35c..b5810c228c10 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -201,11 +201,10 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, "FDSize:\t%d\n" "Groups:\t", fdt ? fdt->max_fds : 0); + task_unlock(p); rcu_read_unlock(); group_info = cred->group_info; - task_unlock(p); - for (g = 0; g < group_info->ngroups; g++) seq_printf(m, "%d ", from_kgid_munged(user_ns, GROUP_AT(group_info, g))); -- cgit v1.2.3 From 0f4a0d53f20c76a70cb5b69b6aa237de2107e171 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:45:12 -0800 Subject: proc: task_state: deuglify the max_fds calculation 1. The usage of fdt looks very ugly, it can't be NULL if ->files is not NULL. We can use "unsigned int max_fds" instead. 2. This also allows to move seq_printf(max_fds) outside of task_lock() and join it with the previous seq_printf(). See also the next patch. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: Alexey Dobriyan Cc: "Eric W. Biederman" , Cc: Sterling Alexander Cc: Peter Zijlstra Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index b5810c228c10..7c8d9aecb070 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -157,9 +157,9 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; int g; - struct fdtable *fdt = NULL; const struct cred *cred; pid_t ppid, tpid; + unsigned int max_fds = 0; rcu_read_lock(); ppid = pid_alive(p) ? @@ -171,6 +171,12 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, tpid = task_pid_nr_ns(tracer, ns); } cred = get_task_cred(p); + + task_lock(p); + if (p->files) + max_fds = files_fdtable(p->files)->max_fds; + task_unlock(p); + seq_printf(m, "State:\t%s\n" "Tgid:\t%d\n" @@ -179,7 +185,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, "PPid:\t%d\n" "TracerPid:\t%d\n" "Uid:\t%d\t%d\t%d\t%d\n" - "Gid:\t%d\t%d\t%d\t%d\n", + "Gid:\t%d\t%d\t%d\t%d\n" + "FDSize:\t%d\nGroups:\t", get_task_state(p), task_tgid_nr_ns(p, ns), task_numa_group_id(p), @@ -192,16 +199,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, from_kgid_munged(user_ns, cred->gid), from_kgid_munged(user_ns, cred->egid), from_kgid_munged(user_ns, cred->sgid), - from_kgid_munged(user_ns, cred->fsgid)); - - task_lock(p); - if (p->files) - fdt = files_fdtable(p->files); - seq_printf(m, - "FDSize:\t%d\n" - "Groups:\t", - fdt ? fdt->max_fds : 0); - task_unlock(p); + from_kgid_munged(user_ns, cred->fsgid), + max_fds); rcu_read_unlock(); group_info = cred->group_info; -- cgit v1.2.3 From b0fafc11115938a3215723f37e8e9cc6a94b05b0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:45:15 -0800 Subject: proc: task_state: move the main seq_printf() outside of rcu_read_lock() task_state() does seq_printf() under rcu_read_lock(), but this is only needed for task_tgid_nr_ns() and task_numa_group_id(). We can calculate tgid/ngid and drop rcu lock. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: Alexey Dobriyan Cc: "Eric W. Biederman" , Cc: Sterling Alexander Cc: Peter Zijlstra Cc: Roland McGrath Reviewed-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 7c8d9aecb070..800e30f8f284 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -158,7 +158,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct group_info *group_info; int g; const struct cred *cred; - pid_t ppid, tpid; + pid_t ppid, tpid, tgid, ngid; unsigned int max_fds = 0; rcu_read_lock(); @@ -170,12 +170,16 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, if (tracer) tpid = task_pid_nr_ns(tracer, ns); } + + tgid = task_tgid_nr_ns(p, ns); + ngid = task_numa_group_id(p); cred = get_task_cred(p); task_lock(p); if (p->files) max_fds = files_fdtable(p->files)->max_fds; task_unlock(p); + rcu_read_unlock(); seq_printf(m, "State:\t%s\n" @@ -188,10 +192,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, "Gid:\t%d\t%d\t%d\t%d\n" "FDSize:\t%d\nGroups:\t", get_task_state(p), - task_tgid_nr_ns(p, ns), - task_numa_group_id(p), - pid_nr_ns(pid, ns), - ppid, tpid, + tgid, ngid, pid_nr_ns(pid, ns), ppid, tpid, from_kuid_munged(user_ns, cred->uid), from_kuid_munged(user_ns, cred->euid), from_kuid_munged(user_ns, cred->suid), @@ -201,7 +202,6 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, from_kgid_munged(user_ns, cred->sgid), from_kgid_munged(user_ns, cred->fsgid), max_fds); - rcu_read_unlock(); group_info = cred->group_info; for (g = 0; g < group_info->ngroups; g++) -- cgit v1.2.3 From abdba6e9ea6d3903c2b0618db720e17b3c1c705c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:45:18 -0800 Subject: proc: task_state: ptrace_parent() doesn't need pid_alive() check p->ptrace != 0 means that release_task(p) was not called, so pid_alive() buys nothing and we can remove this check. Other callers already use it directly without additional checks. Note: with or without this patch ptrace_parent() can return the pointer to the freed task, this will be explained/fixed later. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: Alexey Dobriyan Cc: "Eric W. Biederman" , Cc: Sterling Alexander Cc: Peter Zijlstra Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 800e30f8f284..bd117d065b82 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -157,19 +157,18 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; int g; + struct task_struct *tracer; const struct cred *cred; - pid_t ppid, tpid, tgid, ngid; + pid_t ppid, tpid = 0, tgid, ngid; unsigned int max_fds = 0; rcu_read_lock(); ppid = pid_alive(p) ? task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; - tpid = 0; - if (pid_alive(p)) { - struct task_struct *tracer = ptrace_parent(p); - if (tracer) - tpid = task_pid_nr_ns(tracer, ns); - } + + tracer = ptrace_parent(p); + if (tracer) + tpid = task_pid_nr_ns(tracer, ns); tgid = task_tgid_nr_ns(p, ns); ngid = task_numa_group_id(p); -- cgit v1.2.3 From c6cb898b54ce1705f9edfa4a13be57ad3cced4a1 Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Wed, 10 Dec 2014 15:45:42 -0800 Subject: binfmt_misc: replace get_unused_fd() with get_unused_fd_flags(0) This patch replaces calls to get_unused_fd() with equivalent call to get_unused_fd_flags(0) to preserve current behavor for existing code. In a further patch, get_unused_fd() will be removed so that new code start using get_unused_fd_flags(), with the hope O_CLOEXEC could be used, either by default or choosen by userspace. Signed-off-by: Yann Droneaud Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index fd8beb9657a2..1cc5377ba955 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -153,7 +153,7 @@ static int load_misc_binary(struct linux_binprm *bprm) /* if the binary should be opened on behalf of the * interpreter than keep it open and assign descriptor * to it */ - fd_binary = get_unused_fd(); + fd_binary = get_unused_fd_flags(0); if (fd_binary < 0) { retval = fd_binary; goto _ret; -- cgit v1.2.3 From 8d10a035829cacbd84bf35610870c35e566d1242 Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Wed, 10 Dec 2014 15:45:44 -0800 Subject: fs/file.c: replace get_unused_fd() with get_unused_fd_flags(0) This patch replaces calls to get_unused_fd() with equivalent call to get_unused_fd_flags(0) to preserve current behavor for existing code. In a further patch, get_unused_fd() will be removed so that new code start using get_unused_fd_flags(), with the hope O_CLOEXEC could be used, either by default or choosen by userspace. Signed-off-by: Yann Droneaud Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index ab3eb6a88239..ee738ea028fa 100644 --- a/fs/file.c +++ b/fs/file.c @@ -869,7 +869,7 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes) struct file *file = fget_raw(fildes); if (file) { - ret = get_unused_fd(); + ret = get_unused_fd_flags(0); if (ret >= 0) fd_install(ret, file); else -- cgit v1.2.3 From 6b899c4e9a049dfca759d990bd53b14f81c3626c Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 10 Dec 2014 15:52:08 -0800 Subject: binfmt_misc: add comments & debug logs When trying to develop a custom format handler, the errors returned all effectively get bucketed as EINVAL with no kernel messages. The other errors (ENOMEM/EFAULT) are internal/obvious and basic. Thus any time a bad handler is rejected, the developer has to walk the dense code and try to guess where it went wrong. Needing to dive into kernel code is itself a fairly high barrier for a lot of people. To improve this situation, let's deploy extensive pr_debug markers at logical parse points, and add comments to the dense parsing logic. It let's you see exactly where the parsing aborts, the string the kernel received (useful when dealing with shell code), how it translated the buffers to binary data, and how it will apply the mask at runtime. Some example output: $ echo ':qemu-foo:M::\x7fELF\xAD\xAD\x01\x00:\xff\xff\xff\xff\xff\x00\xff\x00:/usr/bin/qemu-foo:POC' > register $ dmesg binfmt_misc: register: received 92 bytes binfmt_misc: register: delim: 0x3a {:} binfmt_misc: register: name: {qemu-foo} binfmt_misc: register: type: M (magic) binfmt_misc: register: offset: 0x0 binfmt_misc: register: magic[raw]: 5c 78 37 66 45 4c 46 5c 78 41 44 5c 78 41 44 5c \x7fELF\xAD\xAD\ binfmt_misc: register: magic[raw]: 78 30 31 5c 78 30 30 00 x01\x00. binfmt_misc: register: mask[raw]: 5c 78 66 66 5c 78 66 66 5c 78 66 66 5c 78 66 66 \xff\xff\xff\xff binfmt_misc: register: mask[raw]: 5c 78 66 66 5c 78 30 30 5c 78 66 66 5c 78 30 30 \xff\x00\xff\x00 binfmt_misc: register: mask[raw]: 00 . binfmt_misc: register: magic/mask length: 8 binfmt_misc: register: magic[decoded]: 7f 45 4c 46 ad ad 01 00 .ELF.... binfmt_misc: register: mask[decoded]: ff ff ff ff ff 00 ff 00 ........ binfmt_misc: register: magic[masked]: 7f 45 4c 46 ad 00 01 00 .ELF.... binfmt_misc: register: interpreter: {/usr/bin/qemu-foo} binfmt_misc: register: flag: P (preserve argv0) binfmt_misc: register: flag: O (open binary) binfmt_misc: register: flag: C (preserve creds) The [raw] lines show us exactly what was received from userspace. The lines after that show us how the kernel has decoded things. Signed-off-by: Mike Frysinger Cc: Al Viro Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 121 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 1cc5377ba955..d87ddc7c4b14 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -16,6 +16,8 @@ * 2001-02-28 AV: rewritten into something that resembles C. Original didn't. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -30,8 +32,13 @@ #include #include #include +#include -#include +#ifdef DEBUG +# define USE_DEBUG 1 +#else +# define USE_DEBUG 0 +#endif enum { VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */ @@ -87,20 +94,24 @@ static Node *check_file(struct linux_binprm *bprm) char *p = strrchr(bprm->interp, '.'); struct list_head *l; + /* Walk all the registered handlers. */ list_for_each(l, &entries) { Node *e = list_entry(l, Node, list); char *s; int j; + /* Make sure this one is currently enabled. */ if (!test_bit(Enabled, &e->flags)) continue; + /* Do matching based on extension if applicable. */ if (!test_bit(Magic, &e->flags)) { if (p && !strcmp(e->magic, p + 1)) return e; continue; } + /* Do matching based on magic & mask. */ s = bprm->buf + e->offset; if (e->mask) { for (j = 0; j < e->size; j++) @@ -259,14 +270,17 @@ static char * check_special_flags (char * sfs, Node * e) while (cont) { switch (*p) { case 'P': + pr_debug("register: flag: P (preserve argv0)\n"); p++; e->flags |= MISC_FMT_PRESERVE_ARGV0; break; case 'O': + pr_debug("register: flag: O (open binary)\n"); p++; e->flags |= MISC_FMT_OPEN_BINARY; break; case 'C': + pr_debug("register: flag: C (preserve creds)\n"); p++; /* this flags also implies the open-binary flag */ @@ -292,6 +306,8 @@ static Node *create_entry(const char __user *buffer, size_t count) char *buf, *p; char del; + pr_debug("register: received %zu bytes\n", count); + /* some sanity checks */ err = -EINVAL; if ((count < 11) || (count > MAX_REGISTER_LENGTH)) @@ -311,8 +327,12 @@ static Node *create_entry(const char __user *buffer, size_t count) del = *p++; /* delimeter */ + pr_debug("register: delim: %#x {%c}\n", del, del); + + /* Pad the buffer with the delim to simplify parsing below. */ memset(buf+count, del, 8); + /* Parse the 'name' field. */ e->name = p; p = strchr(p, del); if (!p) @@ -323,46 +343,113 @@ static Node *create_entry(const char __user *buffer, size_t count) !strcmp(e->name, "..") || strchr(e->name, '/')) goto Einval; + + pr_debug("register: name: {%s}\n", e->name); + + /* Parse the 'type' field. */ switch (*p++) { - case 'E': e->flags = 1<flags = (1<flags = 1 << Enabled; + break; + case 'M': + pr_debug("register: type: M (magic)\n"); + e->flags = (1 << Enabled) | (1 << Magic); + break; + default: + goto Einval; } if (*p++ != del) goto Einval; + if (test_bit(Magic, &e->flags)) { - char *s = strchr(p, del); + /* Handle the 'M' (magic) format. */ + char *s; + + /* Parse the 'offset' field. */ + s = strchr(p, del); if (!s) goto Einval; *s++ = '\0'; e->offset = simple_strtoul(p, &p, 10); if (*p++) goto Einval; + pr_debug("register: offset: %#x\n", e->offset); + + /* Parse the 'magic' field. */ e->magic = p; p = scanarg(p, del); if (!p) goto Einval; p[-1] = '\0'; - if (!e->magic[0]) + if (p == e->magic) goto Einval; + if (USE_DEBUG) + print_hex_dump_bytes( + KBUILD_MODNAME ": register: magic[raw]: ", + DUMP_PREFIX_NONE, e->magic, p - e->magic); + + /* Parse the 'mask' field. */ e->mask = p; p = scanarg(p, del); if (!p) goto Einval; p[-1] = '\0'; - if (!e->mask[0]) + if (p == e->mask) { e->mask = NULL; + pr_debug("register: mask[raw]: none\n"); + } else if (USE_DEBUG) + print_hex_dump_bytes( + KBUILD_MODNAME ": register: mask[raw]: ", + DUMP_PREFIX_NONE, e->mask, p - e->mask); + + /* + * Decode the magic & mask fields. + * Note: while we might have accepted embedded NUL bytes from + * above, the unescape helpers here will stop at the first one + * it encounters. + */ e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX); if (e->mask && string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size) goto Einval; if (e->size + e->offset > BINPRM_BUF_SIZE) goto Einval; + pr_debug("register: magic/mask length: %i\n", e->size); + if (USE_DEBUG) { + print_hex_dump_bytes( + KBUILD_MODNAME ": register: magic[decoded]: ", + DUMP_PREFIX_NONE, e->magic, e->size); + + if (e->mask) { + int i; + char *masked = kmalloc(e->size, GFP_USER); + + print_hex_dump_bytes( + KBUILD_MODNAME ": register: mask[decoded]: ", + DUMP_PREFIX_NONE, e->mask, e->size); + + if (masked) { + for (i = 0; i < e->size; ++i) + masked[i] = e->magic[i] & e->mask[i]; + print_hex_dump_bytes( + KBUILD_MODNAME ": register: magic[masked]: ", + DUMP_PREFIX_NONE, masked, e->size); + + kfree(masked); + } + } + } } else { + /* Handle the 'E' (extension) format. */ + + /* Skip the 'offset' field. */ p = strchr(p, del); if (!p) goto Einval; *p++ = '\0'; + + /* Parse the 'magic' field. */ e->magic = p; p = strchr(p, del); if (!p) @@ -370,11 +457,16 @@ static Node *create_entry(const char __user *buffer, size_t count) *p++ = '\0'; if (!e->magic[0] || strchr(e->magic, '/')) goto Einval; + pr_debug("register: extension: {%s}\n", e->magic); + + /* Skip the 'mask' field. */ p = strchr(p, del); if (!p) goto Einval; *p++ = '\0'; } + + /* Parse the 'interpreter' field. */ e->interpreter = p; p = strchr(p, del); if (!p) @@ -382,10 +474,10 @@ static Node *create_entry(const char __user *buffer, size_t count) *p++ = '\0'; if (!e->interpreter[0]) goto Einval; + pr_debug("register: interpreter: {%s}\n", e->interpreter); - + /* Parse the 'flags' field. */ p = check_special_flags (p, e); - if (*p == '\n') p++; if (p != buf + count) @@ -553,11 +645,17 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, int res = parse_command(buffer, count); switch (res) { - case 1: clear_bit(Enabled, &e->flags); + case 1: + /* Disable this handler. */ + clear_bit(Enabled, &e->flags); break; - case 2: set_bit(Enabled, &e->flags); + case 2: + /* Enable this handler. */ + set_bit(Enabled, &e->flags); break; - case 3: root = dget(file->f_path.dentry->d_sb->s_root); + case 3: + /* Delete this handler. */ + root = dget(file->f_path.dentry->d_sb->s_root); mutex_lock(&root->d_inode->i_mutex); kill_node(e); @@ -661,9 +759,17 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer, struct dentry *root; switch (res) { - case 1: enabled = 0; break; - case 2: enabled = 1; break; - case 3: root = dget(file->f_path.dentry->d_sb->s_root); + case 1: + /* Disable all handlers. */ + enabled = 0; + break; + case 2: + /* Enable all handlers. */ + enabled = 1; + break; + case 3: + /* Delete all handlers. */ + root = dget(file->f_path.dentry->d_sb->s_root); mutex_lock(&root->d_inode->i_mutex); while (!list_empty(&entries)) -- cgit v1.2.3 From e6084d4a086b626acb5cf97795a9243062c2f4cd Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 10 Dec 2014 15:52:10 -0800 Subject: binfmt_misc: clean up code style a bit Clean up various coding style issues that checkpatch complains about. No functional changes here. Signed-off-by: Mike Frysinger Cc: Al Viro Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 293 +++++++++++++++++++++++++++---------------------------- 1 file changed, 145 insertions(+), 148 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index d87ddc7c4b14..01a2cd9ab2bf 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -1,19 +1,10 @@ /* - * binfmt_misc.c + * binfmt_misc.c * - * Copyright (C) 1997 Richard Günther + * Copyright (C) 1997 Richard Günther * - * binfmt_misc detects binaries via a magic or filename extension and invokes - * a specified wrapper. This should obsolete binfmt_java, binfmt_em86 and - * binfmt_mz. - * - * 1997-04-25 first version - * [...] - * 1997-05-19 cleanup - * 1997-06-26 hpa: pass the real filename rather than argv[0] - * 1997-06-30 minor cleanup - * 1997-08-09 removed extension stripping, locking cleanup - * 2001-02-28 AV: rewritten into something that resembles C. Original didn't. + * binfmt_misc detects binaries via a magic or filename extension and invokes + * a specified wrapper. See Documentation/binfmt_misc.txt for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -48,9 +39,9 @@ static LIST_HEAD(entries); static int enabled = 1; enum {Enabled, Magic}; -#define MISC_FMT_PRESERVE_ARGV0 (1<<31) -#define MISC_FMT_OPEN_BINARY (1<<30) -#define MISC_FMT_CREDENTIALS (1<<29) +#define MISC_FMT_PRESERVE_ARGV0 (1 << 31) +#define MISC_FMT_OPEN_BINARY (1 << 30) +#define MISC_FMT_CREDENTIALS (1 << 29) typedef struct { struct list_head list; @@ -134,7 +125,7 @@ static Node *check_file(struct linux_binprm *bprm) static int load_misc_binary(struct linux_binprm *bprm) { Node *fmt; - struct file * interp_file = NULL; + struct file *interp_file = NULL; char iname[BINPRM_BUF_SIZE]; const char *iname_addr = iname; int retval; @@ -142,7 +133,7 @@ static int load_misc_binary(struct linux_binprm *bprm) retval = -ENOEXEC; if (!enabled) - goto _ret; + goto ret; /* to keep locking time low, we copy the interpreter string */ read_lock(&entries_lock); @@ -151,25 +142,26 @@ static int load_misc_binary(struct linux_binprm *bprm) strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE); read_unlock(&entries_lock); if (!fmt) - goto _ret; + goto ret; if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { retval = remove_arg_zero(bprm); if (retval) - goto _ret; + goto ret; } if (fmt->flags & MISC_FMT_OPEN_BINARY) { /* if the binary should be opened on behalf of the * interpreter than keep it open and assign descriptor - * to it */ + * to it + */ fd_binary = get_unused_fd_flags(0); - if (fd_binary < 0) { - retval = fd_binary; - goto _ret; - } - fd_install(fd_binary, bprm->file); + if (fd_binary < 0) { + retval = fd_binary; + goto ret; + } + fd_install(fd_binary, bprm->file); /* if the binary is not readable than enforce mm->dumpable=0 regardless of the interpreter's permissions */ @@ -182,32 +174,32 @@ static int load_misc_binary(struct linux_binprm *bprm) bprm->interp_flags |= BINPRM_FLAGS_EXECFD; bprm->interp_data = fd_binary; - } else { - allow_write_access(bprm->file); - fput(bprm->file); - bprm->file = NULL; - } + } else { + allow_write_access(bprm->file); + fput(bprm->file); + bprm->file = NULL; + } /* make argv[1] be the path to the binary */ - retval = copy_strings_kernel (1, &bprm->interp, bprm); + retval = copy_strings_kernel(1, &bprm->interp, bprm); if (retval < 0) - goto _error; + goto error; bprm->argc++; /* add the interp as argv[0] */ - retval = copy_strings_kernel (1, &iname_addr, bprm); + retval = copy_strings_kernel(1, &iname_addr, bprm); if (retval < 0) - goto _error; - bprm->argc ++; + goto error; + bprm->argc++; /* Update interp in case binfmt_script needs it. */ retval = bprm_change_interp(iname, bprm); if (retval < 0) - goto _error; + goto error; - interp_file = open_exec (iname); - retval = PTR_ERR (interp_file); - if (IS_ERR (interp_file)) - goto _error; + interp_file = open_exec(iname); + retval = PTR_ERR(interp_file); + if (IS_ERR(interp_file)) + goto error; bprm->file = interp_file; if (fmt->flags & MISC_FMT_CREDENTIALS) { @@ -218,23 +210,23 @@ static int load_misc_binary(struct linux_binprm *bprm) memset(bprm->buf, 0, BINPRM_BUF_SIZE); retval = kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); } else - retval = prepare_binprm (bprm); + retval = prepare_binprm(bprm); if (retval < 0) - goto _error; + goto error; retval = search_binary_handler(bprm); if (retval < 0) - goto _error; + goto error; -_ret: +ret: return retval; -_error: +error: if (fd_binary > 0) sys_close(fd_binary); bprm->interp_flags = 0; bprm->interp_data = 0; - goto _ret; + goto ret; } /* Command parsers */ @@ -261,39 +253,40 @@ static char *scanarg(char *s, char del) return s; } -static char * check_special_flags (char * sfs, Node * e) +static char *check_special_flags(char *sfs, Node *e) { - char * p = sfs; + char *p = sfs; int cont = 1; /* special flags */ while (cont) { switch (*p) { - case 'P': - pr_debug("register: flag: P (preserve argv0)\n"); - p++; - e->flags |= MISC_FMT_PRESERVE_ARGV0; - break; - case 'O': - pr_debug("register: flag: O (open binary)\n"); - p++; - e->flags |= MISC_FMT_OPEN_BINARY; - break; - case 'C': - pr_debug("register: flag: C (preserve creds)\n"); - p++; - /* this flags also implies the - open-binary flag */ - e->flags |= (MISC_FMT_CREDENTIALS | - MISC_FMT_OPEN_BINARY); - break; - default: - cont = 0; + case 'P': + pr_debug("register: flag: P (preserve argv0)\n"); + p++; + e->flags |= MISC_FMT_PRESERVE_ARGV0; + break; + case 'O': + pr_debug("register: flag: O (open binary)\n"); + p++; + e->flags |= MISC_FMT_OPEN_BINARY; + break; + case 'C': + pr_debug("register: flag: C (preserve creds)\n"); + p++; + /* this flags also implies the + open-binary flag */ + e->flags |= (MISC_FMT_CREDENTIALS | + MISC_FMT_OPEN_BINARY); + break; + default: + cont = 0; } } return p; } + /* * This registers a new binary format, it recognises the syntax * ':name:type:offset:magic:mask:interpreter:flags' @@ -323,26 +316,26 @@ static Node *create_entry(const char __user *buffer, size_t count) memset(e, 0, sizeof(Node)); if (copy_from_user(buf, buffer, count)) - goto Efault; + goto efault; del = *p++; /* delimeter */ pr_debug("register: delim: %#x {%c}\n", del, del); /* Pad the buffer with the delim to simplify parsing below. */ - memset(buf+count, del, 8); + memset(buf + count, del, 8); /* Parse the 'name' field. */ e->name = p; p = strchr(p, del); if (!p) - goto Einval; + goto einval; *p++ = '\0'; if (!e->name[0] || !strcmp(e->name, ".") || !strcmp(e->name, "..") || strchr(e->name, '/')) - goto Einval; + goto einval; pr_debug("register: name: {%s}\n", e->name); @@ -357,10 +350,10 @@ static Node *create_entry(const char __user *buffer, size_t count) e->flags = (1 << Enabled) | (1 << Magic); break; default: - goto Einval; + goto einval; } if (*p++ != del) - goto Einval; + goto einval; if (test_bit(Magic, &e->flags)) { /* Handle the 'M' (magic) format. */ @@ -369,21 +362,21 @@ static Node *create_entry(const char __user *buffer, size_t count) /* Parse the 'offset' field. */ s = strchr(p, del); if (!s) - goto Einval; + goto einval; *s++ = '\0'; e->offset = simple_strtoul(p, &p, 10); if (*p++) - goto Einval; + goto einval; pr_debug("register: offset: %#x\n", e->offset); /* Parse the 'magic' field. */ e->magic = p; p = scanarg(p, del); if (!p) - goto Einval; + goto einval; p[-1] = '\0'; if (p == e->magic) - goto Einval; + goto einval; if (USE_DEBUG) print_hex_dump_bytes( KBUILD_MODNAME ": register: magic[raw]: ", @@ -393,7 +386,7 @@ static Node *create_entry(const char __user *buffer, size_t count) e->mask = p; p = scanarg(p, del); if (!p) - goto Einval; + goto einval; p[-1] = '\0'; if (p == e->mask) { e->mask = NULL; @@ -412,9 +405,9 @@ static Node *create_entry(const char __user *buffer, size_t count) e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX); if (e->mask && string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size) - goto Einval; + goto einval; if (e->size + e->offset > BINPRM_BUF_SIZE) - goto Einval; + goto einval; pr_debug("register: magic/mask length: %i\n", e->size); if (USE_DEBUG) { print_hex_dump_bytes( @@ -446,23 +439,23 @@ static Node *create_entry(const char __user *buffer, size_t count) /* Skip the 'offset' field. */ p = strchr(p, del); if (!p) - goto Einval; + goto einval; *p++ = '\0'; /* Parse the 'magic' field. */ e->magic = p; p = strchr(p, del); if (!p) - goto Einval; + goto einval; *p++ = '\0'; if (!e->magic[0] || strchr(e->magic, '/')) - goto Einval; + goto einval; pr_debug("register: extension: {%s}\n", e->magic); /* Skip the 'mask' field. */ p = strchr(p, del); if (!p) - goto Einval; + goto einval; *p++ = '\0'; } @@ -470,27 +463,28 @@ static Node *create_entry(const char __user *buffer, size_t count) e->interpreter = p; p = strchr(p, del); if (!p) - goto Einval; + goto einval; *p++ = '\0'; if (!e->interpreter[0]) - goto Einval; + goto einval; pr_debug("register: interpreter: {%s}\n", e->interpreter); /* Parse the 'flags' field. */ - p = check_special_flags (p, e); + p = check_special_flags(p, e); if (*p == '\n') p++; if (p != buf + count) - goto Einval; + goto einval; + return e; out: return ERR_PTR(err); -Efault: +efault: kfree(e); return ERR_PTR(-EFAULT); -Einval: +einval: kfree(e); return ERR_PTR(-EINVAL); } @@ -509,7 +503,7 @@ static int parse_command(const char __user *buffer, size_t count) return -EFAULT; if (!count) return 0; - if (s[count-1] == '\n') + if (s[count - 1] == '\n') count--; if (count == 1 && s[0] == '0') return 1; @@ -526,7 +520,7 @@ static void entry_status(Node *e, char *page) { char *dp; char *status = "disabled"; - const char * flags = "flags: "; + const char *flags = "flags: "; if (test_bit(Enabled, &e->flags)) status = "enabled"; @@ -540,19 +534,15 @@ static void entry_status(Node *e, char *page) dp = page + strlen(page); /* print the special flags */ - sprintf (dp, "%s", flags); - dp += strlen (flags); - if (e->flags & MISC_FMT_PRESERVE_ARGV0) { - *dp ++ = 'P'; - } - if (e->flags & MISC_FMT_OPEN_BINARY) { - *dp ++ = 'O'; - } - if (e->flags & MISC_FMT_CREDENTIALS) { - *dp ++ = 'C'; - } - *dp ++ = '\n'; - + sprintf(dp, "%s", flags); + dp += strlen(flags); + if (e->flags & MISC_FMT_PRESERVE_ARGV0) + *dp++ = 'P'; + if (e->flags & MISC_FMT_OPEN_BINARY) + *dp++ = 'O'; + if (e->flags & MISC_FMT_CREDENTIALS) + *dp++ = 'C'; + *dp++ = '\n'; if (!test_bit(Magic, &e->flags)) { sprintf(dp, "extension .%s\n", e->magic); @@ -580,7 +570,7 @@ static void entry_status(Node *e, char *page) static struct inode *bm_get_inode(struct super_block *sb, int mode) { - struct inode * inode = new_inode(sb); + struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); @@ -620,13 +610,14 @@ static void kill_node(Node *e) /* / */ static ssize_t -bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos) +bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { Node *e = file_inode(file)->i_private; ssize_t res; char *page; - if (!(page = (char*) __get_free_page(GFP_KERNEL))) + page = (char *) __get_free_page(GFP_KERNEL); + if (!page) return -ENOMEM; entry_status(e, page); @@ -645,26 +636,28 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, int res = parse_command(buffer, count); switch (res) { - case 1: - /* Disable this handler. */ - clear_bit(Enabled, &e->flags); - break; - case 2: - /* Enable this handler. */ - set_bit(Enabled, &e->flags); - break; - case 3: - /* Delete this handler. */ - root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&root->d_inode->i_mutex); + case 1: + /* Disable this handler. */ + clear_bit(Enabled, &e->flags); + break; + case 2: + /* Enable this handler. */ + set_bit(Enabled, &e->flags); + break; + case 3: + /* Delete this handler. */ + root = dget(file->f_path.dentry->d_sb->s_root); + mutex_lock(&root->d_inode->i_mutex); - kill_node(e); + kill_node(e); - mutex_unlock(&root->d_inode->i_mutex); - dput(root); - break; - default: return res; + mutex_unlock(&root->d_inode->i_mutex); + dput(root); + break; + default: + return res; } + return count; } @@ -752,34 +745,36 @@ bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); } -static ssize_t bm_status_write(struct file * file, const char __user * buffer, +static ssize_t bm_status_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { int res = parse_command(buffer, count); struct dentry *root; switch (res) { - case 1: - /* Disable all handlers. */ - enabled = 0; - break; - case 2: - /* Enable all handlers. */ - enabled = 1; - break; - case 3: - /* Delete all handlers. */ - root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&root->d_inode->i_mutex); + case 1: + /* Disable all handlers. */ + enabled = 0; + break; + case 2: + /* Enable all handlers. */ + enabled = 1; + break; + case 3: + /* Delete all handlers. */ + root = dget(file->f_path.dentry->d_sb->s_root); + mutex_lock(&root->d_inode->i_mutex); - while (!list_empty(&entries)) - kill_node(list_entry(entries.next, Node, list)); + while (!list_empty(&entries)) + kill_node(list_entry(entries.next, Node, list)); - mutex_unlock(&root->d_inode->i_mutex); - dput(root); - break; - default: return res; + mutex_unlock(&root->d_inode->i_mutex); + dput(root); + break; + default: + return res; } + return count; } @@ -796,14 +791,16 @@ static const struct super_operations s_ops = { .evict_inode = bm_evict_inode, }; -static int bm_fill_super(struct super_block * sb, void * data, int silent) +static int bm_fill_super(struct super_block *sb, void *data, int silent) { + int err; static struct tree_descr bm_files[] = { [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, [3] = {"register", &bm_register_operations, S_IWUSR}, /* last one */ {""} }; - int err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files); + + err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files); if (!err) sb->s_op = &s_ops; return err; -- cgit v1.2.3 From f7e1ad1a1e23af97419cd8d5adff67fedf7cf169 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 10 Dec 2014 15:52:13 -0800 Subject: fs/binfmt_misc.c: use GFP_KERNEL instead of GFP_USER GFP_USER means "honour cpuset nodes-allowed beancounting". These are regular old kernel objects and there seems no reason to give them this treatment. Acked-by: Mike Frysinger Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 01a2cd9ab2bf..70789e198dea 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -308,7 +308,7 @@ static Node *create_entry(const char __user *buffer, size_t count) err = -ENOMEM; memsize = sizeof(Node) + count + 8; - e = kmalloc(memsize, GFP_USER); + e = kmalloc(memsize, GFP_KERNEL); if (!e) goto out; @@ -416,7 +416,7 @@ static Node *create_entry(const char __user *buffer, size_t count) if (e->mask) { int i; - char *masked = kmalloc(e->size, GFP_USER); + char *masked = kmalloc(e->size, GFP_KERNEL); print_hex_dump_bytes( KBUILD_MODNAME ": register: mask[decoded]: ", -- cgit v1.2.3 From 52f5592e549c013feb9bb71cab3e6fd624633577 Mon Sep 17 00:00:00 2001 From: Jungseung Lee Date: Wed, 10 Dec 2014 15:52:16 -0800 Subject: fs/binfmt_elf.c: fix internal inconsistency relating to vma dump size vma_dump_size() has been used several times on actual dumper and it is supposed to return the same value for the same vma. But vma_dump_size() could return different values for same vma. The known problem case is concurrent shared memory removal. If a vma is used for a shared memory and that shared memory is removed between writing program header and dumping vma memory, this will result in a dump file which is internally consistent. To fix the problem, we set baseline to get dump size and store the size into vma_filesz and always use the same vma dump size which is stored in vma_filsz. The consistnecy with reality is not actually guranteed, but it's tolerable since that is fully consistent with base line. Signed-off-by: Jungseung Lee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d8fc0605b9d2..3a6175fe10c0 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1994,18 +1994,6 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, shdr4extnum->sh_info = segs; } -static size_t elf_core_vma_data_size(struct vm_area_struct *gate_vma, - unsigned long mm_flags) -{ - struct vm_area_struct *vma; - size_t size = 0; - - for (vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma)) - size += vma_dump_size(vma, mm_flags); - return size; -} - /* * Actual dumper * @@ -2017,7 +2005,8 @@ static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; mm_segment_t fs; - int segs; + int segs, i; + size_t vma_data_size = 0; struct vm_area_struct *vma, *gate_vma; struct elfhdr *elf = NULL; loff_t offset = 0, dataoff; @@ -2026,6 +2015,7 @@ static int elf_core_dump(struct coredump_params *cprm) struct elf_shdr *shdr4extnum = NULL; Elf_Half e_phnum; elf_addr_t e_shoff; + elf_addr_t *vma_filesz = NULL; /* * We no longer stop all VM operations. @@ -2093,7 +2083,20 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += elf_core_vma_data_size(gate_vma, cprm->mm_flags); + vma_filesz = kmalloc_array(segs - 1, sizeof(*vma_filesz), GFP_KERNEL); + if (!vma_filesz) + goto end_coredump; + + for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma)) { + unsigned long dump_size; + + dump_size = vma_dump_size(vma, cprm->mm_flags); + vma_filesz[i++] = dump_size; + vma_data_size += dump_size; + } + + offset += vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -2113,7 +2116,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Write program headers for segments dump */ - for (vma = first_vma(current, gate_vma); vma != NULL; + for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; vma = next_vma(vma, gate_vma)) { struct elf_phdr phdr; @@ -2121,7 +2124,7 @@ static int elf_core_dump(struct coredump_params *cprm) phdr.p_offset = offset; phdr.p_vaddr = vma->vm_start; phdr.p_paddr = 0; - phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags); + phdr.p_filesz = vma_filesz[i++]; phdr.p_memsz = vma->vm_end - vma->vm_start; offset += phdr.p_filesz; phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; @@ -2149,12 +2152,12 @@ static int elf_core_dump(struct coredump_params *cprm) if (!dump_skip(cprm, dataoff - cprm->written)) goto end_coredump; - for (vma = first_vma(current, gate_vma); vma != NULL; + for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; vma = next_vma(vma, gate_vma)) { unsigned long addr; unsigned long end; - end = vma->vm_start + vma_dump_size(vma, cprm->mm_flags); + end = vma->vm_start + vma_filesz[i++]; for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { struct page *page; @@ -2187,6 +2190,7 @@ end_coredump: cleanup: free_note_info(&info); kfree(shdr4extnum); + kfree(vma_filesz); kfree(phdr4note); kfree(elf); out: -- cgit v1.2.3 From a682e9c28cac152e6e54c39efcf046e0c8cfcf63 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 10 Dec 2014 15:52:22 -0800 Subject: ncpfs: return proper error from NCP_IOC_SETROOT ioctl If some error happens in NCP_IOC_SETROOT ioctl, the appropriate error return value is then (in most cases) just overwritten before we return. This can result in reporting success to userspace although error happened. This bug was introduced by commit 2e54eb96e2c8 ("BKL: Remove BKL from ncpfs"). Propagate the errors correctly. Coverity id: 1226925. Fixes: 2e54eb96e2c80 ("BKL: Remove BKL from ncpfs") Signed-off-by: Jan Kara Cc: Petr Vandrovec Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ncpfs/ioctl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index d5659d96ee7f..cf7e043a9447 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -447,7 +447,6 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg result = -EIO; } } - result = 0; } mutex_unlock(&server->root_setup_lock); -- cgit v1.2.3 From 75dc857c46996e336cb2f9cf0ed1e5508fb241a7 Mon Sep 17 00:00:00 2001 From: Andreas Rohner Date: Wed, 10 Dec 2014 15:54:29 -0800 Subject: nilfs2: avoid duplicate segment construction for fsync() This patch removes filemap_write_and_wait_range() from nilfs_sync_file(), because it triggers a data segment construction by calling nilfs_writepages() with WB_SYNC_ALL. A data segment construction does not remove the inode from the i_dirty list and it does not clear the NILFS_I_DIRTY flag. Therefore nilfs_inode_dirty() still returns true, which leads to an unnecessary duplicate segment construction in nilfs_sync_file(). A call to filemap_write_and_wait_range() is not needed, because NILFS2 does not rely on the generic writeback mechanisms. Instead it implements its own mechanism to collect all dirty pages and write them into segments. It is more efficient to initiate the segment construction directly in nilfs_sync_file() without the detour over filemap_write_and_wait_range(). Additionally the lock of i_mutex is not needed, because all code blocks that are protected by i_mutex are also protected by a NILFS transaction: Function i_mutex nilfs_transaction ------------------------------------------------------ nilfs_ioctl_setflags: yes yes nilfs_fiemap: yes no nilfs_write_begin: yes yes nilfs_write_end: yes yes nilfs_lookup: yes no nilfs_create: yes yes nilfs_link: yes yes nilfs_mknod: yes yes nilfs_symlink: yes yes nilfs_mkdir: yes yes nilfs_unlink: yes yes nilfs_rmdir: yes yes nilfs_rename: yes yes nilfs_setattr: yes yes For nilfs_lookup() i_mutex is held for the parent directory, to protect it from modification. The segment construction does not modify directory inodes, so no lock is needed. nilfs_fiemap() reads the block layout on the disk, by using nilfs_bmap_lookup_contig(). This is already protected by bmap->b_sem. Signed-off-by: Andreas Rohner Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/file.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index e9e3325f29f3..3a03e0aea1fb 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -39,21 +39,15 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ struct the_nilfs *nilfs; struct inode *inode = file->f_mapping->host; - int err; - - err = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (err) - return err; - mutex_lock(&inode->i_mutex); + int err = 0; if (nilfs_inode_dirty(inode)) { if (datasync) err = nilfs_construct_dsync_segment(inode->i_sb, inode, - 0, LLONG_MAX); + start, end); else err = nilfs_construct_segment(inode->i_sb); } - mutex_unlock(&inode->i_mutex); nilfs = inode->i_sb->s_fs_info; if (!err) -- cgit v1.2.3 From 72b9918ea4d7f2d8362a2defdb93e5fd25a86308 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 10 Dec 2014 15:54:31 -0800 Subject: nilfs2: deletion of an unnecessary check before the function call "iput" The iput() function tests whether its argument is NULL and then returns immediately. Thus the test around the call is not needed. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/the_nilfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 9da25fe9ea61..69bd801afb53 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -808,8 +808,7 @@ void nilfs_put_root(struct nilfs_root *root) spin_lock(&nilfs->ns_cptree_lock); rb_erase(&root->rb_node, &nilfs->ns_cptree); spin_unlock(&nilfs->ns_cptree_lock); - if (root->ifile) - iput(root->ifile); + iput(root->ifile); kfree(root); } -- cgit v1.2.3 From 705304a863cc41585508c0f476f6d3ec28cf7e00 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 10 Dec 2014 15:54:34 -0800 Subject: nilfs2: fix the nilfs_iget() vs. nilfs_new_inode() races Same story as in commit 41080b5a2401 ("nfsd race fixes: ext2") (similar ext2 fix) except that nilfs2 needs to use insert_inode_locked4() instead of insert_inode_locked() and a bug of a check for dead inodes needs to be fixed. If nilfs_iget() is called from nfsd after nilfs_new_inode() calls insert_inode_locked4(), nilfs_iget() will wait for unlock_new_inode() at the end of nilfs_mkdir()/nilfs_create()/etc to unlock the inode. If nilfs_iget() is called before nilfs_new_inode() calls insert_inode_locked4(), it will create an in-core inode and read its data from the on-disk inode. But, nilfs_iget() will find i_nlink equals zero and fail at nilfs_read_inode_common(), which will lead it to call iget_failed() and cleanly fail. However, this sanity check doesn't work as expected for reused on-disk inodes because they leave a non-zero value in i_mode field and it hinders the test of i_nlink. This patch also fixes the issue by removing the test on i_mode that nilfs2 doesn't need. Signed-off-by: Ryusuke Konishi Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/inode.c | 32 ++++++++++++++++++++++++-------- fs/nilfs2/namei.c | 15 ++++++++++++--- 2 files changed, 36 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index e1fa69b341b9..8b5969538f39 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -49,6 +49,8 @@ struct nilfs_iget_args { int for_gc; }; +static int nilfs_iget_test(struct inode *inode, void *opaque); + void nilfs_inode_add_blocks(struct inode *inode, int n) { struct nilfs_root *root = NILFS_I(inode)->i_root; @@ -348,6 +350,17 @@ const struct address_space_operations nilfs_aops = { .is_partially_uptodate = block_is_partially_uptodate, }; +static int nilfs_insert_inode_locked(struct inode *inode, + struct nilfs_root *root, + unsigned long ino) +{ + struct nilfs_iget_args args = { + .ino = ino, .root = root, .cno = 0, .for_gc = 0 + }; + + return insert_inode_locked4(inode, ino, nilfs_iget_test, &args); +} + struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) { struct super_block *sb = dir->i_sb; @@ -383,7 +396,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { err = nilfs_bmap_read(ii->i_bmap, NULL); if (err < 0) - goto failed_bmap; + goto failed_after_creation; set_bit(NILFS_I_BMAP, &ii->i_state); /* No lock is needed; iget() ensures it. */ @@ -399,21 +412,24 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) spin_lock(&nilfs->ns_next_gen_lock); inode->i_generation = nilfs->ns_next_generation++; spin_unlock(&nilfs->ns_next_gen_lock); - insert_inode_hash(inode); + if (nilfs_insert_inode_locked(inode, root, ino) < 0) { + err = -EIO; + goto failed_after_creation; + } err = nilfs_init_acl(inode, dir); if (unlikely(err)) - goto failed_acl; /* never occur. When supporting + goto failed_after_creation; /* never occur. When supporting nilfs_init_acl(), proper cancellation of above jobs should be considered */ return inode; - failed_acl: - failed_bmap: + failed_after_creation: clear_nlink(inode); + unlock_new_inode(inode); iput(inode); /* raw_inode will be deleted through - generic_delete_inode() */ + nilfs_evict_inode() */ goto failed; failed_ifile_create_inode: @@ -461,8 +477,8 @@ int nilfs_read_inode_common(struct inode *inode, inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); - if (inode->i_nlink == 0 && inode->i_mode == 0) - return -EINVAL; /* this inode is deleted */ + if (inode->i_nlink == 0) + return -ESTALE; /* this inode is deleted */ inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); ii->i_flags = le32_to_cpu(raw_inode->i_flags); diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 9de78f08989e..0f84b257932c 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -51,9 +51,11 @@ static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) int err = nilfs_add_link(dentry, inode); if (!err) { d_instantiate(dentry, inode); + unlock_new_inode(inode); return 0; } inode_dec_link_count(inode); + unlock_new_inode(inode); iput(inode); return err; } @@ -182,6 +184,7 @@ out: out_fail: drop_nlink(inode); nilfs_mark_inode_dirty(inode); + unlock_new_inode(inode); iput(inode); goto out; } @@ -201,11 +204,15 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, inode_inc_link_count(inode); ihold(inode); - err = nilfs_add_nondir(dentry, inode); - if (!err) + err = nilfs_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); err = nilfs_transaction_commit(dir->i_sb); - else + } else { + inode_dec_link_count(inode); + iput(inode); nilfs_transaction_abort(dir->i_sb); + } return err; } @@ -243,6 +250,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) nilfs_mark_inode_dirty(inode); d_instantiate(dentry, inode); + unlock_new_inode(inode); out: if (!err) err = nilfs_transaction_commit(dir->i_sb); @@ -255,6 +263,7 @@ out_fail: drop_nlink(inode); drop_nlink(inode); nilfs_mark_inode_dirty(inode); + unlock_new_inode(inode); iput(inode); out_dir: drop_nlink(dir); -- cgit v1.2.3 From ddbc22e27e672b6b180757ea1d7f8481dbb88128 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 10 Dec 2014 15:54:37 -0800 Subject: fs/hfs/catalog.c: fix comparison bug in hfs_cat_keycmp Relying on the sign (after casting to int) of the difference of two quantities for comparison is usually wrong. For example, should a-b turn out to be 2^31, the return value of cmp(a,b) is -2^31; but that would also be the return value from cmp(b, a). So a compares less than b and b compares less than a. One can also easily find three values a,b,c such that a compares less than b, b compares less than c, but a does not compare less than c. Signed-off-by: Rasmus Villemoes Reviewed-by: Vyacheslav Dubeyko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfs/catalog.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index ff0316b925a5..db458ee3a546 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -162,14 +162,16 @@ err2: */ int hfs_cat_keycmp(const btree_key *key1, const btree_key *key2) { - int retval; + __be32 k1p, k2p; - retval = be32_to_cpu(key1->cat.ParID) - be32_to_cpu(key2->cat.ParID); - if (!retval) - retval = hfs_strcmp(key1->cat.CName.name, key1->cat.CName.len, - key2->cat.CName.name, key2->cat.CName.len); + k1p = key1->cat.ParID; + k2p = key2->cat.ParID; - return retval; + if (k1p != k2p) + return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? -1 : 1; + + return hfs_strcmp(key1->cat.CName.name, key1->cat.CName.len, + key2->cat.CName.name, key2->cat.CName.len); } /* Try to get a catalog entry for given catalog id */ -- cgit v1.2.3 From c35a7f18a0b237261dad57c9abd3adfa73f315e1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:54:56 -0800 Subject: exit: proc: don't try to flush /proc/tgid/task/tgid proc_flush_task_mnt() always tries to flush task/pid, but this is pointless if we reap the leader. d_invalidate() is recursive, and if nothing else the next d_hash_and_lookup(tgid) should fail anyway. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Rik van Riel Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 772efa45a452..e7b04a321cc1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2618,6 +2618,9 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) dput(dentry); } + if (pid == tgid) + return; + name.name = buf; name.len = snprintf(buf, sizeof(buf), "%d", tgid); leader = d_hash_and_lookup(mnt->mnt_root, &name); -- cgit v1.2.3