about | summary | refs | log | tree | commit | diff | stats
path: root/fs
diff options
author Danilo Krummrich <dakr@kernel.org> 2026-05-25 02:40:57 +0200
committer Danilo Krummrich <dakr@kernel.org> 2026-05-25 02:40:57 +0200
commit 56785dcb2ef6d3cff82ac33f2e34db94377416a3 (patch)
tree d591052093e8caaa46e93f17a602924a8bfb9211 /fs
parent 024480bf8d75bd16894c5b0eb6082b6e6dae4970 (diff)
parent e7ae89a0c97ce2b68b0983cd01eda67cf373517d (diff)
download linux-next-history-56785dcb2ef6d3cff82ac33f2e34db94377416a3.tar.gz
Merge tag 'v7.1-rc5' into driver-core-next
We need the driver-core fixes in here as well to build on top of.

Signed-off-by: Danilo Krummrich <dakr@kernel.org>
Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/v9fs_vfs.h 13
-rw-r--r-- fs/9p/vfs_inode.c 6
-rw-r--r-- fs/9p/vfs_inode_dotl.c 12
-rw-r--r-- fs/afs/Makefile 1
-rw-r--r-- fs/afs/dir.c 79
-rw-r--r-- fs/afs/file.c 24
-rw-r--r-- fs/afs/fsclient.c 4
-rw-r--r-- fs/afs/inode.c 127
-rw-r--r-- fs/afs/internal.h 45
-rw-r--r-- fs/afs/symlink.c 278
-rw-r--r-- fs/afs/validation.c 14
-rw-r--r-- fs/afs/write.c 2
-rw-r--r-- fs/afs/yfsclient.c 4
-rw-r--r-- fs/btrfs/block-group.c 23
-rw-r--r-- fs/btrfs/compression.c 26
-rw-r--r-- fs/btrfs/disk-io.c 1
-rw-r--r-- fs/btrfs/extent_io.c 14
-rw-r--r-- fs/btrfs/extent_io.h 3
-rw-r--r-- fs/btrfs/extent_map.c 4
-rw-r--r-- fs/btrfs/file.c 11
-rw-r--r-- fs/btrfs/free-space-tree.c 18
-rw-r--r-- fs/btrfs/fs.h 1
-rw-r--r-- fs/btrfs/inode.c 36
-rw-r--r-- fs/btrfs/ioctl.c 7
-rw-r--r-- fs/btrfs/qgroup.c 101
-rw-r--r-- fs/btrfs/raid-stripe-tree.c 57
-rw-r--r-- fs/btrfs/relocation.c 30
-rw-r--r-- fs/btrfs/transaction.c 9
-rw-r--r-- fs/cachefiles/namei.c 2
-rw-r--r-- fs/ceph/addr.c 2
-rw-r--r-- fs/ceph/quota.c 37
-rw-r--r-- fs/ceph/xattr.c 17
-rw-r--r-- fs/efivarfs/super.c 5
-rw-r--r-- fs/erofs/xattr.c 4
-rw-r--r-- fs/erofs/zdata.c 15
-rw-r--r-- fs/fuse/file.c 5
-rw-r--r-- fs/inode.c 8
-rw-r--r-- fs/iomap/direct-io.c 2
-rw-r--r-- fs/isofs/export.c 2
-rw-r--r-- fs/isofs/namei.c 11
-rw-r--r-- fs/isofs/rock.c 9
-rw-r--r-- fs/jfs/namei.c 2
-rw-r--r-- fs/mnt_idmapping.c 2
-rw-r--r-- fs/netfs/buffered_read.c 73
-rw-r--r-- fs/netfs/buffered_write.c 174
-rw-r--r-- fs/netfs/direct_read.c 42
-rw-r--r-- fs/netfs/direct_write.c 6
-rw-r--r-- fs/netfs/internal.h 3
-rw-r--r-- fs/netfs/iterator.c 41
-rw-r--r-- fs/netfs/misc.c 41
-rw-r--r-- fs/netfs/read_collect.c 19
-rw-r--r-- fs/netfs/read_retry.c 17
-rw-r--r-- fs/netfs/read_single.c 23
-rw-r--r-- fs/netfs/write_collect.c 15
-rw-r--r-- fs/netfs/write_issue.c 49
-rw-r--r-- fs/netfs/write_retry.c 6
-rw-r--r-- fs/nfsd/nfs4proc.c 18
-rw-r--r-- fs/nfsd/nfs4state.c 64
-rw-r--r-- fs/nfsd/state.h 1
-rw-r--r-- fs/nfsd/xdr4.h 1
-rw-r--r-- fs/notify/fanotify/fanotify.c 2
-rw-r--r-- fs/notify/fsnotify.c 2
-rw-r--r-- fs/notify/mark.c 57
-rw-r--r-- fs/nsfs.c 2
-rw-r--r-- fs/ntfs/attrib.c 46
-rw-r--r-- fs/ntfs/attrlist.c 7
-rw-r--r-- fs/ntfs/bdev-io.c 7
-rw-r--r-- fs/ntfs/bitmap.c 19
-rw-r--r-- fs/ntfs/dir.c 13
-rw-r--r-- fs/ntfs/index.c 34
-rw-r--r-- fs/ntfs/inode.c 6
-rw-r--r-- fs/ntfs/iomap.c 6
-rw-r--r-- fs/ntfs/logfile.c 5
-rw-r--r-- fs/ntfs/mft.c 93
-rw-r--r-- fs/ntfs/namei.c 17
-rw-r--r-- fs/ntfs/reparse.c 5
-rw-r--r-- fs/ntfs/runlist.c 33
-rw-r--r-- fs/ntfs/super.c 35
-rw-r--r-- fs/orangefs/namei.c 2
-rw-r--r-- fs/overlayfs/util.c 2
-rw-r--r-- fs/select.c 11
-rw-r--r-- fs/smb/client/cached_dir.c 8
-rw-r--r-- fs/smb/client/cifs_spnego.c 16
-rw-r--r-- fs/smb/client/cifsacl.c 37
-rw-r--r-- fs/smb/client/cifsfs.c 42
-rw-r--r-- fs/smb/client/cifsproto.h 1
-rw-r--r-- fs/smb/client/cifssmb.c 3
-rw-r--r-- fs/smb/client/file.c 13
-rw-r--r-- fs/smb/client/fs_context.c 194
-rw-r--r-- fs/smb/client/inode.c 14
-rw-r--r-- fs/smb/client/ioctl.c 2
-rw-r--r-- fs/smb/client/netlink.c 6
-rw-r--r-- fs/smb/client/readdir.c 3
-rw-r--r-- fs/smb/client/smb2file.c 3
-rw-r--r-- fs/smb/client/smb2inode.c 14
-rw-r--r-- fs/smb/client/smb2misc.c 3
-rw-r--r-- fs/smb/client/smb2ops.c 61
-rw-r--r-- fs/smb/client/smb2pdu.c 26
-rw-r--r-- fs/smb/client/smb2transport.c 37
-rw-r--r-- fs/smb/client/smbdirect.c 3
-rw-r--r-- fs/smb/client/smbdirect.h 2
-rw-r--r-- fs/smb/client/transport.c 15
-rw-r--r-- fs/smb/common/fscc.h 4
-rw-r--r-- fs/smb/common/smb2pdu.h 4
-rw-r--r-- fs/smb/server/connection.c 151
-rw-r--r-- fs/smb/server/connection.h 7
-rw-r--r-- fs/smb/server/mgmt/share_config.c 12
-rw-r--r-- fs/smb/server/oplock.c 13
-rw-r--r-- fs/smb/server/server.c 12
-rw-r--r-- fs/smb/server/smb2pdu.c 29
-rw-r--r-- fs/smb/server/smbacl.c 144
-rw-r--r-- fs/smb/server/transport_rdma.c 3
-rw-r--r-- fs/smb/server/transport_rdma.h 2
-rw-r--r-- fs/smb/server/vfs_cache.c 359
-rw-r--r-- fs/smb/server/vfs_cache.h 4
-rw-r--r-- fs/smb/smbdirect/accept.c 2
-rw-r--r-- fs/smb/smbdirect/connect.c 4
-rw-r--r-- fs/smb/smbdirect/connection.c 18
-rw-r--r-- fs/smb/smbdirect/debug.c 4
-rw-r--r-- fs/smb/smbdirect/devices.c 2
-rw-r--r-- fs/smb/smbdirect/internal.h 4
-rw-r--r-- fs/smb/smbdirect/listen.c 2
-rw-r--r-- fs/smb/smbdirect/mr.c 27
-rw-r--r-- fs/smb/smbdirect/public.h 148
-rw-r--r-- fs/smb/smbdirect/rw.c 2
-rw-r--r-- fs/smb/smbdirect/smbdirect.h 52
-rw-r--r-- fs/smb/smbdirect/socket.c 20
-rw-r--r-- fs/sysfs/group.c 2
-rw-r--r-- fs/udf/misc.c 8
-rw-r--r-- fs/xfs/libxfs/xfs_dir2_data.c 18
-rw-r--r-- fs/xfs/libxfs/xfs_refcount.c 3
-rw-r--r-- fs/xfs/scrub/common.c 11
-rw-r--r-- fs/xfs/scrub/common.h 2
-rw-r--r-- fs/xfs/scrub/dabtree.c 7
-rw-r--r-- fs/xfs/scrub/dir.c 7
-rw-r--r-- fs/xfs/xfs_buf.c 34
-rw-r--r-- fs/xfs/xfs_inode.c 6
-rw-r--r-- fs/xfs/xfs_notify_failure.c 2
-rw-r--r-- fs/xfs/xfs_trans.c 11
-rw-r--r-- fs/xfs/xfs_zone_alloc.c 4
-rw-r--r-- fs/xfs/xfs_zone_gc.c 2
-rw-r--r-- fs/zonefs/super.c 6
142 files changed, 2482 insertions, 1243 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index d3aefbec4de6e..34c115d7c2502 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -75,17 +75,4 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode)
int v9fs_open_to_dotl_flags(int flags);
-static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size)
-{
- /*
- * 32-bit need the lock, concurrent updates could break the
- * sequences and make i_size_read() loop forever.
- * 64-bit updates are atomic and can skip the locking.
- */
- if (sizeof(i_size) > sizeof(long))
- spin_lock(&inode->i_lock);
- i_size_write(inode, i_size);
- if (sizeof(i_size) > sizeof(long))
- spin_unlock(&inode->i_lock);
-}
#endif
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index d1508b1fe1092..f468acb8ee7df 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1141,11 +1141,13 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
- v9inode->netfs.remote_i_size = stat->length;
+ spin_lock(&inode->i_lock);
+ netfs_write_remote_i_size(inode, stat->length);
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
- v9fs_i_size_write(inode, stat->length);
+ i_size_write(inode, stat->length);
/* not real number of blocks, but 512 byte ones ... */
inode->i_blocks = (stat->length + 512 - 1) >> 9;
+ spin_unlock(&inode->i_lock);
v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
}
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 71796a89bcf47..141fb54db65d2 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -634,10 +634,12 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
- v9inode->netfs.remote_i_size = stat->st_size;
+ spin_lock(&inode->i_lock);
+ netfs_write_remote_i_size(inode, stat->st_size);
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
- v9fs_i_size_write(inode, stat->st_size);
+ i_size_write(inode, stat->st_size);
inode->i_blocks = stat->st_blocks;
+ spin_unlock(&inode->i_lock);
} else {
if (stat->st_result_mask & P9_STATS_ATIME) {
inode_set_atime(inode, stat->st_atime_sec,
@@ -662,13 +664,15 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
}
+ spin_lock(&inode->i_lock);
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) &&
stat->st_result_mask & P9_STATS_SIZE) {
- v9inode->netfs.remote_i_size = stat->st_size;
- v9fs_i_size_write(inode, stat->st_size);
+ netfs_write_remote_i_size(inode, stat->st_size);
+ i_size_write(inode, stat->st_size);
}
if (stat->st_result_mask & P9_STATS_BLOCKS)
inode->i_blocks = stat->st_blocks;
+ spin_unlock(&inode->i_lock);
}
if (stat->st_result_mask & P9_STATS_GEN)
inode->i_generation = stat->st_gen;
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index b49b8fe682f39..0d8f1982d596c 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -30,6 +30,7 @@ kafs-y := \
server.o \
server_list.o \
super.o \
+ symlink.o \
validation.o \
vlclient.o \
vl_alias.o \
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index aaaa55878ffd2..498b99ccdf0e2 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -44,6 +44,8 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir,
static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags);
+static int afs_dir_writepages(struct address_space *mapping,
+ struct writeback_control *wbc);
const struct file_operations afs_dir_file_operations = {
.open = afs_dir_open,
@@ -68,7 +70,7 @@ const struct inode_operations afs_dir_inode_operations = {
};
const struct address_space_operations afs_dir_aops = {
- .writepages = afs_single_writepages,
+ .writepages = afs_dir_writepages,
};
const struct dentry_operations afs_fs_dentry_operations = {
@@ -233,22 +235,13 @@ static ssize_t afs_do_read_single(struct afs_vnode *dvnode, struct file *file)
struct iov_iter iter;
ssize_t ret;
loff_t i_size;
- bool is_dir = (S_ISDIR(dvnode->netfs.inode.i_mode) &&
- !test_bit(AFS_VNODE_MOUNTPOINT, &dvnode->flags));
i_size = i_size_read(&dvnode->netfs.inode);
- if (is_dir) {
- if (i_size < AFS_DIR_BLOCK_SIZE)
- return afs_bad(dvnode, afs_file_error_dir_small);
- if (i_size > AFS_DIR_BLOCK_SIZE * 1024) {
- trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big);
- return -EFBIG;
- }
- } else {
- if (i_size > AFSPATHMAX) {
- trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big);
- return -EFBIG;
- }
+ if (i_size < AFS_DIR_BLOCK_SIZE)
+ return afs_bad(dvnode, afs_file_error_dir_small);
+ if (i_size > AFS_DIR_BLOCK_SIZE * 1024) {
+ trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big);
+ return -EFBIG;
}
/* Expand the storage. TODO: Shrink the storage too. */
@@ -277,24 +270,18 @@ static ssize_t afs_do_read_single(struct afs_vnode *dvnode, struct file *file)
* buffer.
*/
ret = -ESTALE;
- } else if (is_dir) {
+ } else {
int ret2 = afs_dir_check(dvnode);
if (ret2 < 0)
ret = ret2;
- } else if (i_size < folioq_folio_size(dvnode->directory, 0)) {
- /* NUL-terminate a symlink. */
- char *symlink = kmap_local_folio(folioq_folio(dvnode->directory, 0), 0);
-
- symlink[i_size] = 0;
- kunmap_local(symlink);
}
}
return ret;
}
-ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file)
+static ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file)
{
ssize_t ret;
@@ -1763,13 +1750,20 @@ error:
return ret;
}
+static void afs_symlink_put(struct afs_operation *op)
+{
+ kfree(op->create.symlink);
+ op->create.symlink = NULL;
+ afs_create_put(op);
+}
+
static const struct afs_operation_ops afs_symlink_operation = {
.issue_afs_rpc = afs_fs_symlink,
.issue_yfs_rpc = yfs_fs_symlink,
.success = afs_create_success,
.aborted = afs_check_for_remote_deletion,
.edit_dir = afs_create_edit_dir,
- .put = afs_create_put,
+ .put = afs_symlink_put,
};
/*
@@ -1779,7 +1773,9 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *content)
{
struct afs_operation *op;
+ struct afs_symlink *symlink;
struct afs_vnode *dvnode = AFS_FS_I(dir);
+ size_t clen = strlen(content);
int ret;
_enter("{%llx:%llu},{%pd},%s",
@@ -1791,12 +1787,20 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir,
goto error;
ret = -EINVAL;
- if (strlen(content) >= AFSPATHMAX)
+ if (clen >= AFSPATHMAX)
+ goto error;
+
+ ret = -ENOMEM;
+ symlink = kmalloc_flex(struct afs_symlink, content, clen + 1, GFP_KERNEL);
+ if (!symlink)
goto error;
+ refcount_set(&symlink->ref, 1);
+ memcpy(symlink->content, content, clen + 1);
op = afs_alloc_operation(NULL, dvnode->volume);
if (IS_ERR(op)) {
ret = PTR_ERR(op);
+ kfree(symlink);
goto error;
}
@@ -1808,7 +1812,7 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir,
op->dentry = dentry;
op->ops = &afs_symlink_operation;
op->create.reason = afs_edit_dir_for_symlink;
- op->create.symlink = content;
+ op->create.symlink = symlink;
op->mtime = current_time(dir);
ret = afs_do_sync_operation(op);
afs_dir_unuse_cookie(dvnode, ret);
@@ -2192,28 +2196,33 @@ error:
}
/*
- * Write the file contents to the cache as a single blob.
+ * Write the directory contents to the cache as a single blob.
*/
-int afs_single_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static int afs_dir_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
struct afs_vnode *dvnode = AFS_FS_I(mapping->host);
struct iov_iter iter;
- bool is_dir = (S_ISDIR(dvnode->netfs.inode.i_mode) &&
- !test_bit(AFS_VNODE_MOUNTPOINT, &dvnode->flags));
int ret = 0;
/* Need to lock to prevent the folio queue and folios from being thrown
* away.
*/
- down_read(&dvnode->validate_lock);
+ if (!down_read_trylock(&dvnode->validate_lock)) {
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ /* The VFS will have undirtied the inode. */
+ netfs_single_mark_inode_dirty(&dvnode->netfs.inode);
+ return 0;
+ }
+ down_read(&dvnode->validate_lock);
+ }
- if (is_dir ?
- test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) :
- atomic64_read(&dvnode->cb_expires_at) != AFS_NO_CB_PROMISE) {
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0,
i_size_read(&dvnode->netfs.inode));
ret = netfs_writeback_single(mapping, wbc, &iter);
+ if (ret == 1)
+ ret = 0; /* Skipped write due to lock conflict. */
}
up_read(&dvnode->validate_lock);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 85696ac984cc3..0467742bfeee3 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -427,21 +427,35 @@ static void afs_free_request(struct netfs_io_request *rreq)
afs_put_wb_key(rreq->netfs_priv2);
}
-static void afs_update_i_size(struct inode *inode, loff_t new_i_size)
+/*
+ * Set the file size and block count, taking ->cb_lock and ->i_lock to maintain
+ * coherency and prevent 64-bit tearing on 32-bit arches.
+ *
+ * Also, estimate the number of 512 bytes blocks used, rounded up to nearest 1K
+ * for consistency with other AFS clients.
+ */
+void afs_set_i_size(struct afs_vnode *vnode, loff_t new_i_size)
{
- struct afs_vnode *vnode = AFS_FS_I(inode);
+ struct inode *inode = &vnode->netfs.inode;
loff_t i_size;
write_seqlock(&vnode->cb_lock);
- i_size = i_size_read(&vnode->netfs.inode);
+ spin_lock(&inode->i_lock);
+ i_size = i_size_read(inode);
if (new_i_size > i_size) {
- i_size_write(&vnode->netfs.inode, new_i_size);
- inode_set_bytes(&vnode->netfs.inode, new_i_size);
+ i_size_write(inode, new_i_size);
+ inode_set_bytes(inode, round_up(new_i_size, 1024));
}
+ spin_unlock(&inode->i_lock);
write_sequnlock(&vnode->cb_lock);
fscache_update_cookie(afs_vnode_cache(vnode), NULL, &new_i_size);
}
+static void afs_update_i_size(struct inode *inode, loff_t new_i_size)
+{
+ afs_set_i_size(AFS_FS_I(inode), new_i_size);
+}
+
static void afs_netfs_invalidate_cache(struct netfs_io_request *wreq)
{
struct afs_vnode *vnode = AFS_FS_I(wreq->inode);
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 95494d5f2b8a9..a2ffd60889f89 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -886,7 +886,7 @@ void afs_fs_symlink(struct afs_operation *op)
namesz = name->len;
padsz = (4 - (namesz & 3)) & 3;
- c_namesz = strlen(op->create.symlink);
+ c_namesz = strlen(op->create.symlink->content);
c_padsz = (4 - (c_namesz & 3)) & 3;
reqsz = (6 * 4) + namesz + padsz + c_namesz + c_padsz + (6 * 4);
@@ -910,7 +910,7 @@ void afs_fs_symlink(struct afs_operation *op)
bp = (void *) bp + padsz;
}
*bp++ = htonl(c_namesz);
- memcpy(bp, op->create.symlink, c_namesz);
+ memcpy(bp, op->create.symlink->content, c_namesz);
bp = (void *) bp + c_namesz;
if (c_padsz > 0) {
memset(bp, 0, c_padsz);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index a5173434f7862..3f48458694baa 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -25,96 +25,6 @@
#include "internal.h"
#include "afs_fs.h"
-void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op)
-{
- size_t size = strlen(op->create.symlink) + 1;
- size_t dsize = 0;
- char *p;
-
- if (netfs_alloc_folioq_buffer(NULL, &vnode->directory, &dsize, size,
- mapping_gfp_mask(vnode->netfs.inode.i_mapping)) < 0)
- return;
-
- vnode->directory_size = dsize;
- p = kmap_local_folio(folioq_folio(vnode->directory, 0), 0);
- memcpy(p, op->create.symlink, size);
- kunmap_local(p);
- set_bit(AFS_VNODE_DIR_READ, &vnode->flags);
- netfs_single_mark_inode_dirty(&vnode->netfs.inode);
-}
-
-static void afs_put_link(void *arg)
-{
- struct folio *folio = virt_to_folio(arg);
-
- kunmap_local(arg);
- folio_put(folio);
-}
-
-const char *afs_get_link(struct dentry *dentry, struct inode *inode,
- struct delayed_call *callback)
-{
- struct afs_vnode *vnode = AFS_FS_I(inode);
- struct folio *folio;
- char *content;
- ssize_t ret;
-
- if (!dentry) {
- /* RCU pathwalk. */
- if (!test_bit(AFS_VNODE_DIR_READ, &vnode->flags) || !afs_check_validity(vnode))
- return ERR_PTR(-ECHILD);
- goto good;
- }
-
- if (test_bit(AFS_VNODE_DIR_READ, &vnode->flags))
- goto fetch;
-
- ret = afs_validate(vnode, NULL);
- if (ret < 0)
- return ERR_PTR(ret);
-
- if (!test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) &&
- test_bit(AFS_VNODE_DIR_READ, &vnode->flags))
- goto good;
-
-fetch:
- ret = afs_read_single(vnode, NULL);
- if (ret < 0)
- return ERR_PTR(ret);
- set_bit(AFS_VNODE_DIR_READ, &vnode->flags);
-
-good:
- folio = folioq_folio(vnode->directory, 0);
- folio_get(folio);
- content = kmap_local_folio(folio, 0);
- set_delayed_call(callback, afs_put_link, content);
- return content;
-}
-
-int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
-{
- DEFINE_DELAYED_CALL(done);
- const char *content;
- int len;
-
- content = afs_get_link(dentry, d_inode(dentry), &done);
- if (IS_ERR(content)) {
- do_delayed_call(&done);
- return PTR_ERR(content);
- }
-
- len = umin(strlen(content), buflen);
- if (copy_to_user(buffer, content, len))
- len = -EFAULT;
- do_delayed_call(&done);
- return len;
-}
-
-static const struct inode_operations afs_symlink_inode_operations = {
- .get_link = afs_get_link,
- .readlink = afs_readlink,
-};
-
static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *parent_vnode)
{
static unsigned long once_only;
@@ -214,7 +124,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_mode = S_IFLNK | status->mode;
inode->i_op = &afs_symlink_inode_operations;
}
- inode->i_mapping->a_ops = &afs_dir_aops;
+ inode->i_mapping->a_ops = &afs_symlink_aops;
inode_nohighmem(inode);
mapping_set_release_always(inode->i_mapping);
break;
@@ -224,7 +134,8 @@ static int afs_inode_init_from_status(struct afs_operation *op,
return afs_protocol_error(NULL, afs_eproto_file_type);
}
- afs_set_i_size(vnode, status->size);
+ i_size_write(inode, status->size);
+ inode_set_bytes(inode, status->size);
afs_set_netfs_context(vnode);
vnode->invalid_before = status->data_version;
@@ -253,7 +164,8 @@ static void afs_apply_status(struct afs_operation *op,
{
struct afs_file_status *status = &vp->scb.status;
struct afs_vnode *vnode = vp->vnode;
- struct inode *inode = &vnode->netfs.inode;
+ struct netfs_inode *ictx = &vnode->netfs;
+ struct inode *inode = &ictx->inode;
struct timespec64 t;
umode_t mode;
bool unexpected_jump = false;
@@ -336,6 +248,8 @@ static void afs_apply_status(struct afs_operation *op,
}
if (data_changed) {
+ unsigned long long zero_point, size = status->size;
+
inode_set_iversion_raw(inode, status->data_version);
/* Only update the size if the data version jumped. If the
@@ -343,16 +257,25 @@ static void afs_apply_status(struct afs_operation *op,
* idea of what the size should be that's not the same as
* what's on the server.
*/
- vnode->netfs.remote_i_size = status->size;
- if (change_size || status->size > i_size_read(inode)) {
- afs_set_i_size(vnode, status->size);
+ spin_lock(&inode->i_lock);
+
+ if (change_size || size > i_size_read(inode)) {
+ /* We can read the sizes directly as we hold i_lock. */
+ zero_point = ictx->_zero_point;
+
if (unexpected_jump)
- vnode->netfs.zero_point = status->size;
+ zero_point = size;
+ netfs_write_sizes(inode, size, size, zero_point);
+ inode_set_bytes(inode, size);
inode_set_ctime_to_ts(inode, t);
inode_set_atime_to_ts(inode, t);
+ } else {
+ netfs_write_remote_i_size(inode, size);
}
+ spin_unlock(&inode->i_lock);
+
if (op->ops == &afs_fetch_data_operation)
- op->fetch.subreq->rreq->i_size = status->size;
+ op->fetch.subreq->rreq->i_size = size;
}
}
@@ -709,7 +632,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
* it, but we need to give userspace the server's size.
*/
if (S_ISDIR(inode->i_mode))
- stat->size = vnode->netfs.remote_i_size;
+ stat->size = netfs_read_remote_i_size(inode);
} while (read_seqretry(&vnode->cb_lock, seq));
return 0;
@@ -756,12 +679,14 @@ void afs_evict_inode(struct inode *inode)
.range_end = LLONG_MAX,
};
- afs_single_writepages(inode->i_mapping, &wbc);
+ inode->i_mapping->a_ops->writepages(inode->i_mapping, &wbc);
}
netfs_wait_for_outstanding_io(inode);
truncate_inode_pages_final(&inode->i_data);
netfs_free_folioq_buffer(vnode->directory);
+ if (vnode->symlink)
+ afs_evict_symlink(vnode);
afs_set_cache_aux(vnode, &aux);
netfs_clear_inode_writeback(inode, &aux);
@@ -889,7 +814,7 @@ int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
*/
if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) &&
attr->ia_size < i_size &&
- attr->ia_size > vnode->netfs.remote_i_size) {
+ attr->ia_size > netfs_read_remote_i_size(inode)) {
truncate_setsize(inode, attr->ia_size);
netfs_resize_file(&vnode->netfs, size, false);
fscache_resize_cookie(afs_vnode_cache(vnode),
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 599353c333372..0b72a85662999 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -710,6 +710,7 @@ struct afs_vnode {
#define AFS_VNODE_DIR_READ 11 /* Set if we've read a dir's contents */
struct folio_queue *directory; /* Directory contents */
+ struct afs_symlink __rcu *symlink; /* Symlink content */
struct list_head wb_keys; /* List of keys available for writeback */
struct list_head pending_locks; /* locks waiting to be granted */
struct list_head granted_locks; /* locks granted on this file */
@@ -777,6 +778,15 @@ struct afs_permits {
};
/*
+ * Copy of symlink content for normal use.
+ */
+struct afs_symlink {
+ struct rcu_head rcu;
+ refcount_t ref;
+ char content[];
+};
+
+/*
* Error prioritisation and accumulation.
*/
struct afs_error {
@@ -887,7 +897,7 @@ struct afs_operation {
struct {
int reason; /* enum afs_edit_dir_reason */
mode_t mode;
- const char *symlink;
+ struct afs_symlink *symlink;
} create;
struct {
bool need_rehash;
@@ -1098,13 +1108,10 @@ extern const struct inode_operations afs_dir_inode_operations;
extern const struct address_space_operations afs_dir_aops;
extern const struct dentry_operations afs_fs_dentry_operations;
-ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file);
ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file)
__acquires(&dvnode->validate_lock);
extern void afs_d_release(struct dentry *);
extern void afs_check_for_remote_deletion(struct afs_operation *);
-int afs_single_writepages(struct address_space *mapping,
- struct writeback_control *wbc);
/*
* dir_edit.c
@@ -1157,6 +1164,7 @@ extern int afs_open(struct inode *, struct file *);
extern int afs_release(struct inode *, struct file *);
void afs_fetch_data_async_rx(struct work_struct *work);
void afs_fetch_data_immediate_cancel(struct afs_call *call);
+void afs_set_i_size(struct afs_vnode *vnode, loff_t new_i_size);
/*
* flock.c
@@ -1246,10 +1254,6 @@ extern void afs_fs_probe_cleanup(struct afs_net *);
*/
extern const struct afs_operation_ops afs_fetch_status_operation;
-void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op);
-const char *afs_get_link(struct dentry *dentry, struct inode *inode,
- struct delayed_call *callback);
-int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen);
extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *);
extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *);
extern int afs_ilookup5_test_by_fid(struct inode *, void *);
@@ -1600,6 +1604,21 @@ extern int __init afs_fs_init(void);
extern void afs_fs_exit(void);
/*
+ * symlink.c
+ */
+extern const struct inode_operations afs_symlink_inode_operations;
+extern const struct address_space_operations afs_symlink_aops;
+
+void afs_invalidate_symlink(struct afs_vnode *vnode);
+void afs_evict_symlink(struct afs_vnode *vnode);
+void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op);
+const char *afs_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback);
+int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen);
+int afs_symlink_writepages(struct address_space *mapping,
+ struct writeback_control *wbc);
+
+/*
* validation.c
*/
bool afs_check_validity(const struct afs_vnode *vnode);
@@ -1759,16 +1778,6 @@ static inline void afs_update_dentry_version(struct afs_operation *op,
}
/*
- * Set the file size and block count. Estimate the number of 512 bytes blocks
- * used, rounded up to nearest 1K for consistency with other AFS clients.
- */
-static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size)
-{
- i_size_write(&vnode->netfs.inode, size);
- vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1;
-}
-
-/*
* Check for a conflicting operation on a directory that we just unlinked from.
* If someone managed to sneak a link or an unlink in on the file we just
* unlinked, we won't be able to trust nlink on an AFS file (but not YFS).
diff --git a/fs/afs/symlink.c b/fs/afs/symlink.c
new file mode 100644
index 0000000000000..ed5868369f372
--- /dev/null
+++ b/fs/afs/symlink.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* AFS filesystem symbolic link handling
+ *
+ * Copyright (C) 2026 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/iov_iter.h>
+#include "internal.h"
+
+static void afs_put_symlink(struct afs_symlink *symlink)
+{
+ if (refcount_dec_and_test(&symlink->ref))
+ kfree_rcu(symlink, rcu);
+}
+
+static void afs_replace_symlink(struct afs_vnode *vnode, struct afs_symlink *symlink)
+{
+ struct afs_symlink *old;
+
+ old = rcu_replace_pointer(vnode->symlink, symlink,
+ lockdep_is_held(&vnode->validate_lock));
+ if (old)
+ afs_put_symlink(old);
+}
+
+/*
+ * In the event that a third-party update of a symlink occurs, dispose of the
+ * copy of the old contents. Called under ->validate_lock.
+ */
+void afs_invalidate_symlink(struct afs_vnode *vnode)
+{
+ afs_replace_symlink(vnode, NULL);
+}
+
+/*
+ * Dispose of a symlink copy during inode deletion.
+ */
+void afs_evict_symlink(struct afs_vnode *vnode)
+{
+ struct afs_symlink *old;
+
+ old = rcu_replace_pointer(vnode->symlink, NULL, true);
+ if (old)
+ afs_put_symlink(old);
+
+}
+
+/*
+ * Set up a locally created symlink inode for immediate write to the cache.
+ */
+void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op)
+{
+ struct afs_symlink *symlink = op->create.symlink;
+ size_t dsize = 0;
+ size_t size = strlen(symlink->content) + 1;
+ char *p;
+
+ rcu_assign_pointer(vnode->symlink, symlink);
+ op->create.symlink = NULL;
+
+ if (!fscache_cookie_enabled(netfs_i_cookie(&vnode->netfs)))
+ return;
+
+ if (netfs_alloc_folioq_buffer(NULL, &vnode->directory, &dsize, size,
+ mapping_gfp_mask(vnode->netfs.inode.i_mapping)) < 0)
+ return;
+
+ vnode->directory_size = dsize;
+ p = kmap_local_folio(folioq_folio(vnode->directory, 0), 0);
+ memcpy(p, symlink->content, size);
+ kunmap_local(p);
+ netfs_single_mark_inode_dirty(&vnode->netfs.inode);
+}
+
+/*
+ * Read a symlink in a single download.
+ */
+static ssize_t afs_do_read_symlink(struct afs_vnode *vnode)
+{
+ struct afs_symlink *symlink;
+ struct iov_iter iter;
+ ssize_t ret;
+ loff_t i_size;
+
+ i_size = i_size_read(&vnode->netfs.inode);
+ if (i_size > PAGE_SIZE - 1) {
+ trace_afs_file_error(vnode, -EFBIG, afs_file_error_dir_big);
+ return -EFBIG;
+ }
+
+ if (!vnode->directory) {
+ size_t cur_size = 0;
+
+ ret = netfs_alloc_folioq_buffer(NULL,
+ &vnode->directory, &cur_size, PAGE_SIZE,
+ mapping_gfp_mask(vnode->netfs.inode.i_mapping));
+ vnode->directory_size = PAGE_SIZE - 1;
+ if (ret < 0)
+ return ret;
+ }
+
+ iov_iter_folio_queue(&iter, ITER_DEST, vnode->directory, 0, 0, PAGE_SIZE);
+
+ /* AFS requires us to perform the read of a symlink as a single unit to
+ * avoid issues with the content being changed between reads.
+ */
+ ret = netfs_read_single(&vnode->netfs.inode, NULL, &iter);
+ if (ret >= 0) {
+ i_size = ret;
+ if (i_size > PAGE_SIZE - 1) {
+ trace_afs_file_error(vnode, -EFBIG, afs_file_error_dir_big);
+ return -EFBIG;
+ }
+ vnode->directory_size = i_size;
+
+ /* Copy the symlink. */
+ symlink = kmalloc_flex(struct afs_symlink, content, i_size + 1,
+ GFP_KERNEL);
+ if (!symlink)
+ return -ENOMEM;
+
+ refcount_set(&symlink->ref, 1);
+ symlink->content[i_size] = 0;
+
+ const char *s = kmap_local_folio(folioq_folio(vnode->directory, 0), 0);
+
+ memcpy(symlink->content, s, i_size);
+ kunmap_local(s);
+
+ afs_replace_symlink(vnode, symlink);
+ }
+
+ if (!fscache_cookie_enabled(netfs_i_cookie(&vnode->netfs))) {
+ netfs_free_folioq_buffer(vnode->directory);
+ vnode->directory = NULL;
+ vnode->directory_size = 0;
+ }
+
+ return ret;
+}
+
+static ssize_t afs_read_symlink(struct afs_vnode *vnode)
+{
+ ssize_t ret;
+
+ fscache_use_cookie(afs_vnode_cache(vnode), false);
+ ret = afs_do_read_symlink(vnode);
+ fscache_unuse_cookie(afs_vnode_cache(vnode), NULL, NULL);
+ return ret;
+}
+
+static void afs_put_link(void *arg)
+{
+ afs_put_symlink(arg);
+}
+
+const char *afs_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
+{
+ struct afs_symlink *symlink;
+ struct afs_vnode *vnode = AFS_FS_I(inode);
+ ssize_t ret;
+
+ if (!dentry) {
+ /* RCU pathwalk. */
+ symlink = rcu_dereference(vnode->symlink);
+ if (!symlink || !afs_check_validity(vnode))
+ return ERR_PTR(-ECHILD);
+ set_delayed_call(callback, NULL, NULL);
+ return symlink->content;
+ }
+
+ if (vnode->symlink) {
+ ret = afs_validate(vnode, NULL);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ down_read(&vnode->validate_lock);
+ if (vnode->symlink)
+ goto good;
+ up_read(&vnode->validate_lock);
+ }
+
+ if (down_write_killable(&vnode->validate_lock) < 0)
+ return ERR_PTR(-ERESTARTSYS);
+ if (!vnode->symlink) {
+ ret = afs_read_symlink(vnode);
+ if (ret < 0) {
+ up_write(&vnode->validate_lock);
+ return ERR_PTR(ret);
+ }
+ }
+
+ downgrade_write(&vnode->validate_lock);
+
+good:
+ symlink = rcu_dereference_protected(vnode->symlink,
+ lockdep_is_held(&vnode->validate_lock));
+ refcount_inc(&symlink->ref);
+ up_read(&vnode->validate_lock);
+
+ set_delayed_call(callback, afs_put_link, symlink);
+ return symlink->content;
+}
+
+int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
+{
+ DEFINE_DELAYED_CALL(done);
+ const char *content;
+ int len;
+
+ content = afs_get_link(dentry, d_inode(dentry), &done);
+ if (IS_ERR(content)) {
+ do_delayed_call(&done);
+ return PTR_ERR(content);
+ }
+
+ len = umin(strlen(content), buflen);
+ if (copy_to_user(buffer, content, len))
+ len = -EFAULT;
+ do_delayed_call(&done);
+ return len;
+}
+
+/*
+ * Write the symlink contents to the cache as a single blob. We then throw
+ * away the page we used to receive it.
+ */
+int afs_symlink_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+ struct iov_iter iter;
+ int ret = 0;
+
+ if (!down_read_trylock(&vnode->validate_lock)) {
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ /* The VFS will have undirtied the inode. */
+ netfs_single_mark_inode_dirty(&vnode->netfs.inode);
+ return 0;
+ }
+ down_read(&vnode->validate_lock);
+ }
+
+ if (vnode->directory &&
+ atomic64_read(&vnode->cb_expires_at) != AFS_NO_CB_PROMISE) {
+ iov_iter_folio_queue(&iter, ITER_SOURCE, vnode->directory, 0, 0,
+ i_size_read(&vnode->netfs.inode));
+ ret = netfs_writeback_single(mapping, wbc, &iter);
+ }
+
+ if (ret == 0) {
+ mutex_lock(&vnode->netfs.wb_lock);
+ netfs_free_folioq_buffer(vnode->directory);
+ vnode->directory = NULL;
+ vnode->directory_size = 0;
+ mutex_unlock(&vnode->netfs.wb_lock);
+ } else if (ret == 1) {
+ ret = 0; /* Skipped write due to lock conflict. */
+ }
+
+ up_read(&vnode->validate_lock);
+ return ret;
+}
+
+const struct inode_operations afs_symlink_inode_operations = {
+ .get_link = afs_get_link,
+ .readlink = afs_readlink,
+};
+
+const struct address_space_operations afs_symlink_aops = {
+ .writepages = afs_symlink_writepages,
+};
diff --git a/fs/afs/validation.c b/fs/afs/validation.c
index 0ba8336c90250..e997563af658b 100644
--- a/fs/afs/validation.c
+++ b/fs/afs/validation.c
@@ -465,11 +465,17 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
vnode->cb_ro_snapshot = cb_ro_snapshot;
vnode->cb_scrub = cb_scrub;
- /* if the vnode's data version number changed then its contents are
- * different */
+ /* If the vnode's data version number changed then its contents are
+ * different. Note that afs_apply_status() doesn't set ZAP_DATA on
+ * directories.
+ */
zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
- if (zap)
- afs_zap_data(vnode);
+ if (zap) {
+ if (S_ISREG(vnode->netfs.inode.i_mode))
+ afs_zap_data(vnode);
+ else if (S_ISLNK(vnode->netfs.inode.i_mode))
+ afs_invalidate_symlink(vnode);
+ }
up_write(&vnode->validate_lock);
_leave(" = 0");
return 0;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index fcfed9d24e0a3..7f34b939706a0 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -142,7 +142,7 @@ static void afs_issue_write_worker(struct work_struct *work)
afs_begin_vnode_operation(op);
op->store.write_iter = &subreq->io_iter;
- op->store.i_size = umax(pos + len, vnode->netfs.remote_i_size);
+ op->store.i_size = umax(pos + len, netfs_read_remote_i_size(&vnode->netfs.inode));
op->mtime = inode_get_mtime(&vnode->netfs.inode);
afs_wait_for_operation(op);
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 24fb562ebd33a..d941179730a98 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -960,7 +960,7 @@ void yfs_fs_symlink(struct afs_operation *op)
_enter("");
- contents_sz = strlen(op->create.symlink);
+ contents_sz = strlen(op->create.symlink->content);
call = afs_alloc_flat_call(op->net, &yfs_RXYFSSymlink,
sizeof(__be32) +
sizeof(struct yfs_xdr_RPCFlags) +
@@ -981,7 +981,7 @@ void yfs_fs_symlink(struct afs_operation *op)
bp = xdr_encode_u32(bp, 0); /* RPC flags */
bp = xdr_encode_YFSFid(bp, &dvp->fid);
bp = xdr_encode_name(bp, name);
- bp = xdr_encode_string(bp, op->create.symlink, contents_sz);
+ bp = xdr_encode_string(bp, op->create.symlink->content, contents_sz);
bp = xdr_encode_YFSStoreStatus(bp, &mode, &op->mtime);
yfs_check_req(call, bp);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index e6f5a17a13e36..b611c64119dbc 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2412,29 +2412,25 @@ static struct btrfs_block_group *btrfs_create_block_group(
*/
static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
{
- u64 start = 0;
+ struct rb_node *node;
int ret = 0;
- while (1) {
+ /*
+ * This is called during mount from btrfs_read_block_groups(), before
+ * any background threads are started, so no concurrent writers can
+ * modify the mapping_tree. No lock is needed here.
+ */
+ for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
struct btrfs_chunk_map *map;
struct btrfs_block_group *bg;
- /*
- * btrfs_find_chunk_map() will return the first chunk map
- * intersecting the range, so setting @length to 1 is enough to
- * get the first chunk.
- */
- map = btrfs_find_chunk_map(fs_info, start, 1);
- if (!map)
- break;
-
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
bg = btrfs_lookup_block_group(fs_info, map->start);
if (unlikely(!bg)) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu doesn't have corresponding block group",
map->start, map->chunk_len);
ret = -EUCLEAN;
- btrfs_free_chunk_map(map);
break;
}
if (unlikely(bg->start != map->start || bg->length != map->chunk_len ||
@@ -2447,12 +2443,9 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
bg->start, bg->length,
bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
ret = -EUCLEAN;
- btrfs_free_chunk_map(map);
btrfs_put_block_group(bg);
break;
}
- start = map->start + map->chunk_len;
- btrfs_free_chunk_map(map);
btrfs_put_block_group(bg);
}
return ret;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b2393a48a8fe9..a02b62e0a8f33 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -407,22 +407,18 @@ static noinline int add_ra_bio_pages(struct inode *inode,
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
- /*
- * Avoid direct reclaim when the caller does not allow it. Since
- * add_ra_bio_pages() is always speculative, suppress allocation warnings
- * in either case.
- */
+ /* Avoid direct reclaim when the caller does not allow it. */
+ constraint_gfp = ~__GFP_FS;
+ cache_gfp = GFP_NOFS | __GFP_NOWARN;
if (!direct_reclaim) {
- constraint_gfp = ~(__GFP_FS | __GFP_DIRECT_RECLAIM) | __GFP_NOWARN;
- cache_gfp = (GFP_NOFS & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN;
- } else {
- constraint_gfp = (~__GFP_FS) | __GFP_NOWARN;
- cache_gfp = GFP_NOFS | __GFP_NOWARN;
+ constraint_gfp &= ~__GFP_DIRECT_RECLAIM;
+ cache_gfp &= ~__GFP_DIRECT_RECLAIM;
}
while (cur < compressed_end) {
pgoff_t page_end;
pgoff_t pg_index = cur >> PAGE_SHIFT;
+ gfp_t masked_constraint_gfp;
u32 add_size;
if (pg_index > end_index)
@@ -449,8 +445,14 @@ static noinline int add_ra_bio_pages(struct inode *inode,
continue;
}
- folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, constraint_gfp),
- 0, NULL);
+ /*
+ * Since add_ra_bio_pages() is always speculative, suppress
+ * allocation warnings.
+ */
+ masked_constraint_gfp = mapping_gfp_constraint(mapping, constraint_gfp);
+ masked_constraint_gfp |= __GFP_NOWARN;
+
+ folio = filemap_alloc_folio(masked_constraint_gfp, 0, NULL);
if (!folio)
break;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8a11be02eeb9b..c0a30bb213d7a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4686,6 +4686,7 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
free_extent_buffer_stale(eb);
}
}
+ btrfs_extent_io_tree_release(dirty_pages);
}
static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ca3e4b99aec2b..2275189b78605 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4641,7 +4641,8 @@ int try_release_extent_buffer(struct folio *folio)
* to read the block we will not block on anything.
*/
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
- u64 bytenr, u64 owner_root, u64 gen, int level)
+ u64 bytenr, u64 owner_root, u64 gen, int level,
+ const struct btrfs_key *first_key)
{
struct btrfs_tree_parent_check check = {
.level = level,
@@ -4650,6 +4651,11 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb;
int ret;
+ if (first_key) {
+ memcpy(&check.first_key, first_key, sizeof(struct btrfs_key));
+ check.has_first_key = true;
+ }
+
eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
if (IS_ERR(eb))
return;
@@ -4677,9 +4683,13 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
*/
void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
{
+ struct btrfs_key node_key;
+
+ btrfs_node_key_to_cpu(node, &node_key, slot);
btrfs_readahead_tree_block(node->fs_info,
btrfs_node_blockptr(node, slot),
btrfs_header_owner(node),
btrfs_node_ptr_generation(node, slot),
- btrfs_header_level(node) - 1);
+ btrfs_header_level(node) - 1,
+ &node_key);
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index fd209233317f4..b310a5145cf69 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -287,7 +287,8 @@ static inline void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
}
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
- u64 bytenr, u64 owner_root, u64 gen, int level);
+ u64 bytenr, u64 owner_root, u64 gen, int level,
+ const struct btrfs_key *first_key);
void btrfs_readahead_node_child(struct extent_buffer *node, int slot);
/* Note: this can be used in for loops without caching the value in a variable. */
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 9284c0a81befb..6b79bff241f21 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1246,7 +1246,9 @@ static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root,
write_unlock(&tree->lock);
next:
from = btrfs_ino(inode) + 1;
- cond_resched_lock(&root->inodes.xa_lock);
+ xa_unlock(&root->inodes);
+ cond_resched();
+ xa_lock(&root->inodes);
}
xa_unlock(&root->inodes);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index cf1cb5c4db757..8c171ed07008b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -633,7 +633,7 @@ again:
trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
end - other_start);
- return 0;
+ goto mark_dirty;
}
}
@@ -661,7 +661,7 @@ again:
other_end - start);
btrfs_set_file_extent_offset(leaf, fi,
start - orig_offset);
- return 0;
+ goto mark_dirty;
}
}
@@ -788,7 +788,12 @@ again:
}
}
- return 0;
+mark_dirty:
+ ret = btrfs_inode_set_file_extent_range(inode, start, end - start);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+
+ return ret;
}
/*
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 9efd1ec90f031..472b3060e5ac3 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -259,7 +259,11 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
nr++;
path->slots[0]--;
} else {
- ASSERT(0);
+ btrfs_err(fs_info, "unexpected free space tree key type %u",
+ found_key.type);
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
}
}
@@ -405,7 +409,11 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
nr++;
} else {
- ASSERT(0);
+ btrfs_err(fs_info, "unexpected free space tree key type %u",
+ found_key.type);
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
}
}
@@ -1518,7 +1526,11 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
nr++;
path->slots[0]--;
} else {
- ASSERT(0);
+ btrfs_err(trans->fs_info, "unexpected free space tree key type %u",
+ found_key.type);
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
}
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index a4758d94b32e9..a8aa086a4df86 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -155,6 +155,7 @@ enum {
BTRFS_FS_LOG_RECOVERING,
BTRFS_FS_OPEN,
BTRFS_FS_QUOTA_ENABLED,
+ BTRFS_FS_SQUOTA_ENABLING,
BTRFS_FS_UPDATE_UUID_TREE_GEN,
BTRFS_FS_CREATING_FREE_SPACE_TREE,
BTRFS_FS_BTREE_ERR,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 40474014c03f1..1ca1cbdf25bcd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1153,7 +1153,7 @@ out_free_reserve:
NULL, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
- EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
+ EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV,
PAGE_UNLOCK | PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
if (async_extent->cb)
@@ -4959,6 +4959,8 @@ static int btrfs_rmdir(struct inode *vfs_dir, struct dentry *dentry)
if (ret)
goto out;
+ btrfs_record_unlink_dir(trans, dir, inode, false);
+
/* now the directory is empty */
ret = btrfs_unlink_inode(trans, dir, inode, &fname.disk_name);
if (!ret)
@@ -9297,10 +9299,38 @@ next:
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
(actual_len > inode->i_size) &&
(cur_offset > inode->i_size)) {
+ u64 range_start;
+ u64 range_end;
+
if (cur_offset > actual_len)
i_size = actual_len;
else
i_size = cur_offset;
+
+ /*
+ * Make sure the file_extent_tree covers the entire
+ * range [old_i_size, new_i_size) before we update
+ * disk_i_size. Without this, a previous KEEP_SIZE
+ * prealloc that extended past i_size (and was lost
+ * across umount/mount because file_extent_tree is
+ * only populated up to round_up(i_size) on inode
+ * load) can leave a gap inside this range. That gap
+ * would cause btrfs_inode_safe_disk_i_size_write()
+ * (via find_contiguous_extent_bit() starting at 0)
+ * to truncate disk_i_size to the start of the gap,
+ * making the persisted size smaller than i_size.
+ */
+ range_start = round_down(inode->i_size, fs_info->sectorsize);
+ range_end = round_up(i_size, fs_info->sectorsize);
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+ range_start, range_end - range_start);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ if (own_trans)
+ btrfs_end_transaction(trans);
+ break;
+ }
+
i_size_write(inode, i_size);
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
@@ -10669,7 +10699,9 @@ struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
break;
from = btrfs_ino(inode) + 1;
- cond_resched_lock(&root->inodes.xa_lock);
+ xa_unlock(&root->inodes);
+ cond_resched();
+ xa_lock(&root->inodes);
}
xa_unlock(&root->inodes);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b2e447f5005c1..a39460bf68a77 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -5102,7 +5102,6 @@ static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *a
return 0;
}
-#ifdef CONFIG_BTRFS_EXPERIMENTAL
static int btrfs_ioctl_shutdown(struct btrfs_fs_info *fs_info, unsigned long arg)
{
int ret = 0;
@@ -5134,10 +5133,12 @@ static int btrfs_ioctl_shutdown(struct btrfs_fs_info *fs_info, unsigned long arg
case BTRFS_SHUTDOWN_FLAGS_NOLOGFLUSH:
btrfs_force_shutdown(fs_info);
break;
+ default:
+ ret = -EINVAL;
+ break;
}
return ret;
}
-#endif
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
@@ -5294,10 +5295,8 @@ long btrfs_ioctl(struct file *file, unsigned int
#endif
case BTRFS_IOC_SUBVOL_SYNC_WAIT:
return btrfs_ioctl_subvol_sync(fs_info, argp);
-#ifdef CONFIG_BTRFS_EXPERIMENTAL
case BTRFS_IOC_SHUTDOWN:
return btrfs_ioctl_shutdown(fs_info, arg);
-#endif
}
return -ENOTTY;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index cdf736d3a4e5b..6838faceb6d5c 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1107,7 +1107,13 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
if (simple) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
- btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid);
+ /*
+ * Set the enable generation to the next transaction, as we cannot
+ * ensure that extents written during this transaction will see any
+ * state we have set here. So we should treat all extents of the
+ * transaction as coming in before squotas was enabled.
+ */
+ btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid + 1);
} else {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
}
@@ -1210,7 +1216,15 @@ out_add_root:
goto out_free_path;
}
- fs_info->qgroup_enable_gen = trans->transid;
+ /*
+ * Set fs_info->qgroup_enable_gen and BTRFS_FS_SQUOTA_ENABLING
+ * under the transaction handle. We want to ensure that all extents in
+ * the next transaction definitely see them.
+ */
+ if (simple) {
+ fs_info->qgroup_enable_gen = trans->transid + 1;
+ set_bit(BTRFS_FS_SQUOTA_ENABLING, &fs_info->flags);
+ }
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
@@ -1224,9 +1238,15 @@ out_add_root:
*/
ret = btrfs_commit_transaction(trans);
trans = NULL;
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
- if (ret)
+ if (ret) {
+ if (simple) {
+ clear_bit(BTRFS_FS_SQUOTA_ENABLING, &fs_info->flags);
+ fs_info->qgroup_enable_gen = 0;
+ }
goto out_free_path;
+ }
/*
* Set quota enabled flag after committing the transaction, to avoid
@@ -1236,6 +1256,8 @@ out_add_root:
spin_lock(&fs_info->qgroup_lock);
fs_info->quota_root = quota_root;
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ if (simple)
+ clear_bit(BTRFS_FS_SQUOTA_ENABLING, &fs_info->flags);
spin_unlock(&fs_info->qgroup_lock);
/* Skip rescan for simple qgroups. */
@@ -1715,32 +1737,24 @@ out:
return ret;
}
-static bool can_delete_parent_qgroup(struct btrfs_qgroup *qgroup)
-
+static bool can_delete_parent_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup)
{
ASSERT(btrfs_qgroup_level(qgroup->qgroupid));
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ squota_check_parent_usage(fs_info, qgroup);
return list_empty(&qgroup->members);
}
/*
- * Return true if we can delete the squota qgroup and false otherwise.
- *
- * Rules for whether we can delete:
- *
- * A subvolume qgroup can be removed iff the subvolume is fully deleted, which
- * is iff there is 0 usage in the qgroup.
- *
- * A higher level qgroup can be removed iff it has no members.
- * Note: We audit its usage to warn on inconsitencies without blocking deletion.
+ * Because a shared extent can outlive its owning subvolume, we cannot delete a
+ * subvol squota qgroup until all of the extents it owns are gone, even if the
+ * subvolume itself has been deleted.
*/
-static bool can_delete_squota_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup)
+static bool can_delete_squota_subvol_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
{
ASSERT(btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE);
-
- if (btrfs_qgroup_level(qgroup->qgroupid) > 0) {
- squota_check_parent_usage(fs_info, qgroup);
- return can_delete_parent_qgroup(qgroup);
- }
+ ASSERT(btrfs_qgroup_level(qgroup->qgroupid) == 0);
return !(qgroup->rfer || qgroup->excl || qgroup->rfer_cmpr || qgroup->excl_cmpr);
}
@@ -1754,14 +1768,11 @@ static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup
{
struct btrfs_key key;
BTRFS_PATH_AUTO_FREE(path);
-
- /* Since squotas cannot be inconsistent, they have special rules for deletion. */
- if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
- return can_delete_squota_qgroup(fs_info, qgroup);
+ int ret;
/* For higher level qgroup, we can only delete it if it has no child. */
if (btrfs_qgroup_level(qgroup->qgroupid))
- return can_delete_parent_qgroup(qgroup);
+ return can_delete_parent_qgroup(fs_info, qgroup);
/*
* For level-0 qgroups, we can only delete it if it has no subvolume
@@ -1777,10 +1788,21 @@ static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup
return -ENOMEM;
/*
- * The @ret from btrfs_find_root() exactly matches our definition for
- * the return value, thus can be returned directly.
+ * Any subvol qgroup, regardless of mode, cannot be deleted if the
+ * subvol still exists.
*/
- return btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
+ ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
+ /*
+ * btrfs_find_root returns <0 on error, 0 if found, and >0 if not,
+ * so the "found" and "error" cases match our desired return values.
+ */
+ if (ret <= 0)
+ return ret;
+
+ /* Squotas require additional checks, even if the subvol is deleted. */
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ return can_delete_squota_subvol_qgroup(fs_info, qgroup);
+ return 1;
}
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
@@ -4922,7 +4944,8 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
u64 num_bytes = delta->num_bytes;
const int sign = (delta->is_inc ? 1 : -1);
- if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
+ if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE &&
+ !test_bit(BTRFS_FS_SQUOTA_ENABLING, &fs_info->flags))
return 0;
if (!btrfs_is_fstree(root))
@@ -4934,8 +4957,9 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->qgroup_lock);
qgroup = find_qgroup_rb(fs_info, root);
- if (!qgroup) {
- ret = -ENOENT;
+ if (WARN_ON_ONCE(!qgroup)) {
+ btrfs_warn(fs_info, "squota failed to find qgroup for root %llu", root);
+ ret = 0;
goto out;
}
@@ -4944,8 +4968,19 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
list_for_each_entry(qg, &qgroup_list, iterator) {
struct btrfs_qgroup_list *glist;
- qg->excl += num_bytes * sign;
- qg->rfer += num_bytes * sign;
+ ASSERT(qg->excl == qg->rfer);
+ if (WARN_ON_ONCE(sign < 0 && qg->excl < num_bytes)) {
+ btrfs_warn(fs_info,
+ "squota underflow qg %hu/%llu excl %llu num_bytes %llu",
+ btrfs_qgroup_level(qg->qgroupid),
+ btrfs_qgroup_subvolid(qg->qgroupid),
+ qg->excl, num_bytes);
+ qg->excl = 0;
+ qg->rfer = 0;
+ } else {
+ qg->excl += num_bytes * sign;
+ qg->rfer += num_bytes * sign;
+ }
qgroup_dirty(fs_info, qg);
list_for_each_entry(glist, &qg->groups, next_group)
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 638c4ad572c99..4b0186c83ad1d 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -45,8 +45,11 @@ static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) {
struct btrfs_raid_stride *stride = &extent->strides[i];
+ u64 devid;
u64 phys;
+ devid = btrfs_raid_stride_devid(leaf, stride);
+ btrfs_set_stack_raid_stride_devid(&newitem->strides[i], devid);
phys = btrfs_raid_stride_physical(leaf, stride) + frontpad;
btrfs_set_stack_raid_stride_physical(&newitem->strides[i], phys);
}
@@ -95,14 +98,26 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
while (1) {
key.objectid = start;
key.type = BTRFS_RAID_STRIPE_KEY;
- key.offset = 0;
+ key.offset = (u64)-1;
ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
if (ret < 0)
break;
- if (path->slots[0] == btrfs_header_nritems(path->nodes[0]))
- path->slots[0]--;
+ /*
+ * Search with offset=(u64)-1 ensures we land on the correct
+ * leaf even when the target entry is the first item on a leaf.
+ * Since no real entry has offset=(u64)-1, ret is always 1 and
+ * slot points past the last entry with objectid==start (or
+ * past the end of the leaf if that entry is the last item).
+ * Back up one slot to find the actual entry.
+ */
+ if (path->slots[0] == 0) {
+ /* No entry with objectid <= start exists. */
+ ret = 0;
+ break;
+ }
+ path->slots[0]--;
leaf = path->nodes[0];
slot = path->slots[0];
@@ -123,7 +138,7 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
*/
if (found_start > start) {
if (slot == 0) {
- ret = btrfs_previous_item(stripe_root, path, start,
+ ret = btrfs_previous_item(stripe_root, path, 0,
BTRFS_RAID_STRIPE_KEY);
if (ret) {
if (ret > 0)
@@ -139,7 +154,10 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
btrfs_item_key_to_cpu(leaf, &key, slot);
found_start = key.objectid;
found_end = found_start + key.offset;
- ASSERT(found_start <= start);
+ if (found_start > start || found_end <= start) {
+ ret = -ENOENT;
+ break;
+ }
}
if (key.type != BTRFS_RAID_STRIPE_KEY)
@@ -176,9 +194,19 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
/* The "right" item. */
ret = btrfs_duplicate_item(trans, stripe_root, path, &newkey);
+ if (ret == -EAGAIN) {
+ btrfs_release_path(path);
+ continue;
+ }
if (ret)
break;
+ /*
+ * btrfs_duplicate_item() may have triggered a leaf
+ * split via setup_leaf_for_split(), so we must refresh
+ * our leaf pointer from the path.
+ */
+ leaf = path->nodes[0];
item_size = btrfs_item_size(leaf, path->slots[0]);
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_stripe_extent);
@@ -195,8 +223,9 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
/* The "left" item. */
path->slots[0]--;
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- btrfs_partially_delete_raid_extent(trans, path, &key,
- diff_start, 0);
+ ret = btrfs_partially_delete_raid_extent(trans, path,
+ &key,
+ diff_start, 0);
break;
}
@@ -212,8 +241,11 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
if (found_start < start) {
u64 diff_start = start - found_start;
- btrfs_partially_delete_raid_extent(trans, path, &key,
- diff_start, 0);
+ ret = btrfs_partially_delete_raid_extent(trans, path,
+ &key,
+ diff_start, 0);
+ if (ret)
+ break;
start += (key.offset - diff_start);
length -= (key.offset - diff_start);
@@ -236,9 +268,10 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
if (found_end > end) {
u64 diff_end = found_end - end;
- btrfs_partially_delete_raid_extent(trans, path, &key,
- key.offset - length,
- length);
+ ret = btrfs_partially_delete_raid_extent(trans, path,
+ &key,
+ key.offset - length,
+ length);
ASSERT(key.offset - diff_end == length);
break;
}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 1c42c5180bddd..3ebaf5880125f 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2607,7 +2607,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
if (!block->key_ready)
btrfs_readahead_tree_block(fs_info, block->bytenr,
block->owner, 0,
- block->level);
+ block->level, NULL);
}
/* Get first keys */
@@ -3876,7 +3876,7 @@ static int add_remap_tree_entries(struct btrfs_trans_handle *trans, struct btrfs
ret = btrfs_insert_empty_items(trans, fs_info->remap_root, path, &batch);
btrfs_release_path(path);
- if (num_entries <= max_items)
+ if (ret || num_entries <= max_items)
break;
num_entries -= max_items;
@@ -4174,6 +4174,12 @@ static int move_existing_remap(struct btrfs_fs_info *fs_info,
return ret;
}
+ if (ins.offset < length) {
+ spin_lock(&sinfo->lock);
+ btrfs_space_info_update_bytes_may_use(sinfo, ins.offset - length);
+ spin_unlock(&sinfo->lock);
+ }
+
dest_addr = ins.objectid;
dest_length = ins.offset;
@@ -5000,6 +5006,12 @@ static int do_remap_reloc_trans(struct btrfs_fs_info *fs_info,
return ret;
}
+ if (ins.offset < remap_length) {
+ spin_lock(&sinfo->lock);
+ btrfs_space_info_update_bytes_may_use(sinfo, ins.offset - remap_length);
+ spin_unlock(&sinfo->lock);
+ }
+
made_reservation = true;
new_addr = ins.objectid;
@@ -5023,21 +5035,27 @@ static int do_remap_reloc_trans(struct btrfs_fs_info *fs_info,
if (bg_needs_free_space) {
ret = btrfs_add_block_group_free_space(trans, dest_bg);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto fail;
+ }
}
ret = copy_remapped_data(fs_info, start, new_addr, length);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto fail;
+ }
ret = btrfs_remove_from_free_space_tree(trans, new_addr, length);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto fail;
+ }
ret = add_remap_entry(trans, path, src_bg, start, new_addr, length);
if (ret) {
- btrfs_add_to_free_space_tree(trans, new_addr, length);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 248adb785051b..194f581b36f36 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1293,14 +1293,13 @@ static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans)
blk_finish_plug(&plug);
ret2 = btrfs_wait_extents(fs_info, dirty_pages);
- btrfs_extent_io_tree_release(&trans->transaction->dirty_pages);
-
if (ret)
return ret;
- else if (ret2)
+ if (ret2)
return ret2;
- else
- return 0;
+
+ btrfs_extent_io_tree_release(&trans->transaction->dirty_pages);
+ return 0;
}
/*
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 1b83ed0e0a63f..2937db690b40b 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -130,6 +130,8 @@ retry:
ret = cachefiles_inject_write_error();
if (ret == 0) {
subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700, NULL);
+ if (IS_ERR(subdir))
+ ret = PTR_ERR(subdir);
} else {
end_creating(subdir);
subdir = ERR_PTR(ret);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 1454760332ffc..0a86f672cc09c 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1336,6 +1336,7 @@ void ceph_process_folio_batch(struct address_space *mapping,
ceph_wbc, folio);
if (rc == -ENODATA) {
folio_unlock(folio);
+ folio_put(folio);
ceph_wbc->fbatch.folios[i] = NULL;
continue;
} else if (rc == -E2BIG) {
@@ -1346,6 +1347,7 @@ void ceph_process_folio_batch(struct address_space *mapping,
if (!folio_clear_dirty_for_io(folio)) {
doutc(cl, "%p !folio_clear_dirty_for_io\n", folio);
folio_unlock(folio);
+ folio_put(folio);
ceph_wbc->fbatch.folios[i] = NULL;
continue;
}
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 4dc9426643e83..053d5bf0c9f07 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -228,12 +228,19 @@ static int get_quota_realm(struct ceph_mds_client *mdsc, struct inode *inode,
restart:
realm = ceph_inode(inode)->i_snap_realm;
- if (realm)
+ if (realm) {
ceph_get_snap_realm(mdsc, realm);
- else
- pr_err_ratelimited_client(cl,
- "%p %llx.%llx null i_snap_realm\n",
- inode, ceph_vinop(inode));
+ } else {
+ /*
+ * i_snap_realm is NULL when all caps have been released, e.g.
+ * after an MDS session rejection. This is a transient state;
+ * the realm will be restored once caps are re-granted.
+ * Treat it as "no quota realm found".
+ */
+ doutc(cl, "%p %llx.%llx null i_snap_realm\n",
+ inode, ceph_vinop(inode));
+ }
+
while (realm) {
bool has_inode;
@@ -340,12 +347,19 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
down_read(&mdsc->snap_rwsem);
restart:
realm = ceph_inode(inode)->i_snap_realm;
- if (realm)
+ if (realm) {
ceph_get_snap_realm(mdsc, realm);
- else
- pr_err_ratelimited_client(cl,
- "%p %llx.%llx null i_snap_realm\n",
- inode, ceph_vinop(inode));
+ } else {
+ /*
+ * i_snap_realm is NULL when all caps have been released, e.g.
+ * after an MDS session rejection. This is a transient state;
+ * the realm will be restored once caps are re-granted.
+ * Treat it as "quota not exceeded".
+ */
+ doutc(cl, "%p %llx.%llx null i_snap_realm\n",
+ inode, ceph_vinop(inode));
+ }
+
while (realm) {
bool has_inode;
@@ -496,6 +510,9 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
u64 total = 0, used, free;
bool is_updated = false;
+ if (!ceph_has_realms_with_quotas(d_inode(fsc->sb->s_root)))
+ return false;
+
down_read(&mdsc->snap_rwsem);
get_quota_realm(mdsc, d_inode(fsc->sb->s_root), QUOTA_GET_MAX_BYTES,
&realm, true);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 5f87f62091a14..e773be07f7674 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1254,6 +1254,22 @@ retry:
ceph_vinop(inode), name, ceph_cap_string(issued));
__build_xattrs(inode);
+ /*
+ * __build_xattrs() may have released and reacquired i_ceph_lock,
+ * during which handle_cap_grant() could have replaced i_xattrs.blob
+ * with a newer MDS-provided blob and bumped i_xattrs.version. If that
+ * caused __build_xattrs() to rebuild the rb-tree from the new blob,
+ * count/names_size/vals_size may now be larger than when
+ * required_blob_size was computed above. Recompute it here so the
+ * prealloc_blob size check below reflects the current tree state.
+ */
+ required_blob_size = __get_required_blob_size(ci, name_len, val_len);
+ if (required_blob_size > mdsc->mdsmap->m_max_xattr_size) {
+ doutc(cl, "sync (size too large): %d > %llu\n",
+ required_blob_size, mdsc->mdsmap->m_max_xattr_size);
+ goto do_sync;
+ }
+
if (!ci->i_xattrs.prealloc_blob ||
required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
struct ceph_buffer *blob;
@@ -1294,6 +1310,7 @@ retry:
do_sync:
spin_unlock(&ci->i_ceph_lock);
+ ceph_buffer_put(old_blob);
do_sync_unlocked:
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 1c5224cf183e6..733c19571f1cf 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -191,13 +191,10 @@ static const struct dentry_operations efivarfs_d_ops = {
static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
{
+ struct qstr q = QSTR(name);
struct dentry *d;
- struct qstr q;
int err;
- q.name = name;
- q.len = strlen(name);
-
err = efivarfs_d_hash(parent, &q);
if (err)
return ERR_PTR(err);
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 41e311019a251..df7ea019526d7 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -89,13 +89,11 @@ static int erofs_init_inode_xattrs(struct inode *inode)
vi->xattr_isize - sizeof(struct erofs_xattr_ibody_header)) {
erofs_err(sb, "invalid h_shared_count %u @ nid %llu",
vi->xattr_shared_count, vi->nid);
- erofs_put_metabuf(&buf);
ret = -EFSCORRUPTED;
goto out_unlock;
}
vi->xattr_shared_xattrs = kmalloc_objs(uint, vi->xattr_shared_count);
if (!vi->xattr_shared_xattrs) {
- erofs_put_metabuf(&buf);
ret = -ENOMEM;
goto out_unlock;
}
@@ -112,12 +110,12 @@ static int erofs_init_inode_xattrs(struct inode *inode)
}
vi->xattr_shared_xattrs[i] = le32_to_cpu(*xattr_id);
}
- erofs_put_metabuf(&buf);
/* paired with smp_mb() at the beginning of the function. */
smp_mb();
set_bit(EROFS_I_EA_INITED_BIT, &vi->flags);
out_unlock:
+ erofs_put_metabuf(&buf);
clear_and_wake_up_bit(EROFS_I_BL_XATTR_BIT, &vi->flags);
return ret;
}
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 43bb5a6a9924b..27ab7bd844ec7 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1509,8 +1509,15 @@ repeat:
DBG_BUGON(z_erofs_is_shortlived_page(bvec->bv_page));
folio = page_folio(zbv.page);
- /* For preallocated managed folios, add them to page cache here */
+ /*
+ * Preallocated folios are added to the managed cache here rather than
+ * in z_erofs_bind_cache() in order to keep these folios locked in
+ * increasing (physical) address order.
+ * Clear folio->private before these folios become visible to others in
+ * the managed cache to avoid duplicate additions for unaligned extents.
+ */
if (folio->private == Z_EROFS_PREALLOCATED_FOLIO) {
+ folio->private = NULL;
tocache = true;
goto out_tocache;
}
@@ -1546,14 +1553,8 @@ repeat:
}
return;
}
- /*
- * Already linked with another pcluster, which only appears in
- * crafted images by fuzzers for now. But handle this anyway.
- */
- tocache = false; /* use temporary short-lived pages */
} else {
DBG_BUGON(1); /* referenced managed folios can't be truncated */
- tocache = true;
}
folio_unlock(folio);
folio_put(folio);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c59452d60b8da..f94f3dc082c6b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2176,7 +2176,10 @@ static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos,
WARN_ON(!ap->num_folios);
- /* Reached max pages */
+ /* Reached max pages or max folio slots */
+ if (ap->num_folios >= fc->max_pages)
+ return true;
+
if (DIV_ROUND_UP(bytes, PAGE_SIZE) > fc->max_pages)
return true;
diff --git a/fs/inode.c b/fs/inode.c
index 6a3cbc7dcd28c..62c579a0cf7df 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2124,7 +2124,13 @@ static int inode_update_cmtime(struct inode *inode, unsigned int flags)
inode_iversion_need_inc(inode))
return -EAGAIN;
} else {
- if (inode_maybe_inc_iversion(inode, !!dirty))
+ /*
+ * Don't force iversion increment for pure lazytime
+ * updates (I_DIRTY_TIME only), let I_VERSION_QUERIED
+ * dictate whether the increment is needed.
+ */
+ if (inode_maybe_inc_iversion(inode,
+ dirty != I_DIRTY_TIME))
dirty |= I_DIRTY_SYNC;
}
}
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index b0a6549b38487..b36ee619cdcdd 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -355,7 +355,7 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter,
if (dio->flags & IOMAP_DIO_BOUNCE)
ret = bio_iov_iter_bounce(bio, dio->submit.iter,
- iomap_max_bio_size(&iter->iomap));
+ iomap_max_bio_size(&iter->iomap), alignment);
else
ret = bio_iov_iter_get_pages(bio, dio->submit.iter,
alignment - 1);
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index 421d247fae523..78f80c1a5c54a 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -24,7 +24,7 @@ isofs_export_iget(struct super_block *sb,
{
struct inode *inode;
- if (block == 0)
+ if (block == 0 || block >= ISOFS_SB(sb)->s_nzones)
return ERR_PTR(-ESTALE);
inode = isofs_iget(sb, block, offset);
if (IS_ERR(inode))
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 8dd3911717e0c..3ace3d6a55e79 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -10,20 +10,13 @@
#include <linux/gfp.h>
#include "isofs.h"
-/*
- * ok, we cannot use strncmp, as the name is not in our data space.
- * Thus we'll have to use isofs_match. No big problem. Match also makes
- * some sanity tests.
- */
static int
isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
{
- struct qstr qstr;
- qstr.name = compare;
- qstr.len = dlen;
if (likely(!dentry->d_op))
return dentry->d_name.len != dlen || memcmp(dentry->d_name.name, compare, dlen);
- return dentry->d_op->d_compare(NULL, dentry->d_name.len, dentry->d_name.name, &qstr);
+ return dentry->d_op->d_compare(NULL, dentry->d_name.len, dentry->d_name.name,
+ &QSTR_LEN(compare, dlen));
}
/*
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 6fe6dbd0c740f..1232fab59a4e6 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -101,6 +101,15 @@ static int rock_continue(struct rock_state *rs)
goto out;
}
+ if ((unsigned)rs->cont_extent >= ISOFS_SB(rs->inode->i_sb)->s_nzones) {
+ printk(KERN_NOTICE "rock: corrupted directory entry. "
+ "extent=%u out of volume (nzones=%lu)\n",
+ (unsigned)rs->cont_extent,
+ ISOFS_SB(rs->inode->i_sb)->s_nzones);
+ ret = -EIO;
+ goto out;
+ }
+
if (rs->cont_extent) {
struct buffer_head *bh;
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 60c4a0e0fca5e..442d626792622 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -309,7 +309,7 @@ static struct dentry *jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip,
out1:
jfs_info("jfs_mkdir: rc:%d", rc);
- return ERR_PTR(rc);
+ return rc ? ERR_PTR(rc) : NULL;
}
/*
diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 6472c4ea3d1e6..cb61fbdb52e90 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -375,6 +375,8 @@ int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_
continue;
seq_printf(seq, "%u %u %u", extent->first, lower, extent->count);
+ if (seq_has_overflowed(seq))
+ return -EAGAIN;
seq->count++; /* mappings are separated by \0 */
if (seq_has_overflowed(seq))
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index a8c0d86118c58..76d0f6a29abab 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -156,9 +156,8 @@ static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq,
netfs_cache_read_terminated, subreq);
}
-static void netfs_queue_read(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq,
- bool last_subreq)
+void netfs_queue_read(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
{
struct netfs_io_stream *stream = &rreq->io_streams[0];
@@ -169,7 +168,8 @@ static void netfs_queue_read(struct netfs_io_request *rreq,
* remove entries off of the front.
*/
spin_lock(&rreq->lock);
- list_add_tail(&subreq->rreq_link, &stream->subrequests);
+ /* Write IN_PROGRESS before pointer to new subreq */
+ list_add_tail_release(&subreq->rreq_link, &stream->subrequests);
if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
if (!stream->active) {
stream->collected_to = subreq->start;
@@ -178,11 +178,6 @@ static void netfs_queue_read(struct netfs_io_request *rreq,
}
}
- if (last_subreq) {
- smp_wmb(); /* Write lists before ALL_QUEUED. */
- set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
- }
-
spin_unlock(&rreq->lock);
}
@@ -214,7 +209,6 @@ static void netfs_issue_read(struct netfs_io_request *rreq,
static void netfs_read_to_pagecache(struct netfs_io_request *rreq,
struct readahead_control *ractl)
{
- struct netfs_inode *ictx = netfs_inode(rreq->inode);
unsigned long long start = rreq->start;
ssize_t size = rreq->len;
int ret = 0;
@@ -233,10 +227,13 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq,
subreq->start = start;
subreq->len = size;
+ netfs_queue_read(rreq, subreq);
+
source = netfs_cache_prepare_read(rreq, subreq, rreq->i_size);
subreq->source = source;
if (source == NETFS_DOWNLOAD_FROM_SERVER) {
- unsigned long long zp = umin(ictx->zero_point, rreq->i_size);
+ unsigned long long zero_point = netfs_read_zero_point(rreq->inode);
+ unsigned long long zp = umin(zero_point, rreq->i_size);
size_t len = subreq->len;
if (unlikely(rreq->origin == NETFS_READ_SINGLE))
@@ -252,7 +249,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq,
pr_err("ZERO-LEN READ: R=%08x[%x] l=%zx/%zx s=%llx z=%llx i=%llx",
rreq->debug_id, subreq->debug_index,
subreq->len, size,
- subreq->start, ictx->zero_point, rreq->i_size);
+ subreq->start, zero_point, rreq->i_size);
+ netfs_cancel_read(subreq, ret);
break;
}
subreq->len = len;
@@ -261,12 +259,7 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq,
if (rreq->netfs_ops->prepare_read) {
ret = rreq->netfs_ops->prepare_read(subreq);
if (ret < 0) {
- subreq->error = ret;
- /* Not queued - release both refs. */
- netfs_put_subrequest(subreq,
- netfs_sreq_trace_put_cancel);
- netfs_put_subrequest(subreq,
- netfs_sreq_trace_put_cancel);
+ netfs_cancel_read(subreq, ret);
break;
}
trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
@@ -289,24 +282,29 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq,
pr_err("Unexpected read source %u\n", source);
WARN_ON_ONCE(1);
+ netfs_cancel_read(subreq, ret);
break;
issue:
slice = netfs_prepare_read_iterator(subreq, ractl);
if (slice < 0) {
ret = slice;
- subreq->error = ret;
- trace_netfs_sreq(subreq, netfs_sreq_trace_cancel);
- /* Not queued - release both refs. */
- netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);
- netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);
+ netfs_cancel_read(subreq, ret);
break;
}
- size -= slice;
start += slice;
+ size -= slice;
+ if (size <= 0) {
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
+ }
- netfs_queue_read(rreq, subreq, size <= 0);
netfs_issue_read(rreq, subreq);
+
+ if (test_bit(NETFS_RREQ_PAUSE, &rreq->flags))
+ netfs_wait_for_paused_read(rreq);
+ if (test_bit(NETFS_RREQ_FAILED, &rreq->flags))
+ break;
cond_resched();
} while (size > 0);
@@ -397,6 +395,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio)
{
struct netfs_io_request *rreq;
struct address_space *mapping = folio->mapping;
+ struct netfs_group *group = netfs_folio_group(folio);
struct netfs_folio *finfo = netfs_folio_info(folio);
struct netfs_inode *ctx = netfs_inode(mapping->host);
struct folio *sink = NULL;
@@ -458,14 +457,20 @@ static int netfs_read_gaps(struct file *file, struct folio *folio)
netfs_read_to_pagecache(rreq, NULL);
- if (sink)
- folio_put(sink);
-
ret = netfs_wait_for_read(rreq);
if (ret >= 0) {
+ if (group)
+ folio_change_private(folio, group);
+ else
+ folio_detach_private(folio);
+ kfree(finfo);
+ trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
flush_dcache_folio(folio);
folio_mark_uptodate(folio);
}
+
+ if (sink)
+ folio_put(sink);
folio_unlock(folio);
netfs_put_request(rreq, netfs_rreq_trace_put_return);
return ret < 0 ? ret : 0;
@@ -498,10 +503,10 @@ int netfs_read_folio(struct file *file, struct folio *folio)
struct netfs_inode *ctx = netfs_inode(mapping->host);
int ret;
- if (folio_test_dirty(folio)) {
- trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
+ folio_wait_writeback(folio);
+
+ if (folio_test_dirty(folio))
return netfs_read_gaps(file, folio);
- }
_enter("%lx", folio->index);
@@ -667,7 +672,7 @@ retry:
ret = PTR_ERR(rreq);
goto error;
}
- rreq->no_unlock_folio = folio->index;
+ rreq->no_unlock_folio = folio;
__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
ret = netfs_begin_cache_read(rreq, ctx);
@@ -684,9 +689,9 @@ retry:
netfs_read_to_pagecache(rreq, NULL);
ret = netfs_wait_for_read(rreq);
+ netfs_put_request(rreq, netfs_rreq_trace_put_return);
if (ret < 0)
goto error;
- netfs_put_request(rreq, netfs_rreq_trace_put_return);
have_folio:
ret = folio_wait_private_2_killable(folio);
@@ -733,7 +738,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
goto error;
}
- rreq->no_unlock_folio = folio->index;
+ rreq->no_unlock_folio = folio;
__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
ret = netfs_begin_cache_read(rreq, ctx);
if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 05ea5b0cc0e8b..6bde3320bcec6 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -12,24 +12,6 @@
#include <linux/slab.h>
#include "internal.h"
-static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
-{
- if (netfs_group)
- folio_attach_private(folio, netfs_get_group(netfs_group));
-}
-
-static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
-{
- void *priv = folio_get_private(folio);
-
- if (unlikely(priv != netfs_group)) {
- if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
- folio_attach_private(folio, netfs_get_group(netfs_group));
- else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
- folio_detach_private(folio);
- }
-}
-
/*
* Grab a folio for writing and lock it. Attempt to allocate as large a folio
* as possible to hold as much of the remaining length as possible in one go.
@@ -149,6 +131,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
}
do {
+ enum netfs_folio_trace trace;
struct netfs_folio *finfo;
struct netfs_group *group;
unsigned long long fpos;
@@ -156,6 +139,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
size_t offset; /* Offset into pagecache folio */
size_t part; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
+ void *priv;
offset = pos & (max_chunk - 1);
part = min(max_chunk - offset, iov_iter_count(iter));
@@ -201,73 +185,99 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
goto error_folio_unlock;
}
- /* Decide how we should modify a folio. We might be attempting
- * to do write-streaming, in which case we don't want to a
- * local RMW cycle if we can avoid it. If we're doing local
- * caching or content crypto, we award that priority over
- * avoiding RMW. If the file is open readably, then we also
- * assume that we may want to read what we wrote.
- */
finfo = netfs_folio_info(folio);
group = netfs_folio_group(folio);
+ /* If the requested group differs from the group set on the
+ * page, then we need to flush out the folio if it has a group
+ * set (ie. is non-NULL). Note that COPY_TO_CACHE is a special
+ * case, being a netfs annotation rather than an actual group.
+ *
+ * The filesystem isn't permitted to mix writes with groups and
+ * writes without groups as the NULL group is used to indicate
+ * that no group is set.
+ */
if (unlikely(group != netfs_group) &&
- group != NETFS_FOLIO_COPY_TO_CACHE)
+ group != NETFS_FOLIO_COPY_TO_CACHE &&
+ group) {
+ WARN_ON_ONCE(!netfs_group);
goto flush_content;
+ }
+ /* Decide how we should modify a folio. We might be attempting
+ * to do write-streaming, as we don't want to do a local RMW cycle
+ * if we can avoid it. If we're doing local caching or content
+ * crypto, we award that priority over avoiding RMW. If the
+ * file is open readably, then we let ->read_folio() fill in
+ * the gaps.
+ */
if (folio_test_uptodate(folio)) {
if (mapping_writably_mapped(mapping))
flush_dcache_folio(folio);
copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
if (unlikely(copied == 0))
goto copy_failed;
- netfs_set_group(folio, netfs_group);
- trace_netfs_folio(folio, netfs_folio_is_uptodate);
- goto copied;
+ trace = netfs_folio_is_uptodate;
+ goto copied_uptodate;
}
/* If the page is above the zero-point then we assume that the
* server would just return a block of zeros or a short read if
* we try to read it.
*/
- if (fpos >= ctx->zero_point) {
+ if (fpos >= netfs_read_zero_point(inode)) {
folio_zero_segment(folio, 0, offset);
copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
if (unlikely(copied == 0))
goto copy_failed;
folio_zero_segment(folio, offset + copied, flen);
- __netfs_set_group(folio, netfs_group);
- folio_mark_uptodate(folio);
- trace_netfs_folio(folio, netfs_modify_and_clear);
- goto copied;
+ if (finfo)
+ trace = netfs_modify_and_clear_rm_finfo;
+ else
+ trace = netfs_modify_and_clear;
+ goto mark_uptodate;
}
/* See if we can write a whole folio in one go. */
if (!maybe_trouble && offset == 0 && part >= flen) {
copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
- if (unlikely(copied == 0))
+ if (likely(copied == part)) {
+ if (finfo)
+ trace = netfs_whole_folio_modify_filled;
+ else
+ trace = netfs_whole_folio_modify;
+ goto mark_uptodate;
+ }
+ if (copied == 0)
goto copy_failed;
- if (unlikely(copied < part)) {
+ if (!finfo || copied <= finfo->dirty_offset) {
maybe_trouble = true;
iov_iter_revert(iter, copied);
copied = 0;
folio_unlock(folio);
goto retry;
}
- __netfs_set_group(folio, netfs_group);
- folio_mark_uptodate(folio);
- trace_netfs_folio(folio, netfs_whole_folio_modify);
+
+ /* We overwrote some existing dirty data, so we have to
+ * accept the partial write.
+ */
+ finfo->dirty_len += finfo->dirty_offset;
+ if (finfo->dirty_len == flen) {
+ trace = netfs_whole_folio_modify_filled_efault;
+ goto mark_uptodate;
+ }
+ if (copied > finfo->dirty_len)
+ finfo->dirty_len = copied;
+ finfo->dirty_offset = 0;
+ trace = netfs_whole_folio_modify_efault;
goto copied;
}
/* We don't want to do a streaming write on a file that loses
* caching service temporarily because the backing store got
- * culled and we don't really want to get a streaming write on
- * a file that's open for reading as ->read_folio() then has to
- * be able to flush it.
+ * culled.
*/
- if ((file->f_mode & FMODE_READ) ||
- netfs_is_cache_enabled(ctx)) {
+ if (netfs_is_cache_enabled(ctx)) {
if (finfo) {
netfs_stat(&netfs_n_wh_wstream_conflict);
goto flush_content;
@@ -282,11 +292,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
if (unlikely(copied == 0))
goto copy_failed;
- netfs_set_group(folio, netfs_group);
- trace_netfs_folio(folio, netfs_just_prefetch);
- goto copied;
+ trace = netfs_just_prefetch;
+ goto copied_uptodate;
}
+ /* Do a streaming write on a folio that has nothing in it yet. */
if (!finfo) {
ret = -EIO;
if (WARN_ON(folio_get_private(folio)))
@@ -295,10 +305,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
if (unlikely(copied == 0))
goto copy_failed;
if (offset == 0 && copied == flen) {
- __netfs_set_group(folio, netfs_group);
- folio_mark_uptodate(folio);
- trace_netfs_folio(folio, netfs_streaming_filled_page);
- goto copied;
+ trace = netfs_streaming_filled_page;
+ goto mark_uptodate;
}
finfo = kzalloc_obj(*finfo);
@@ -312,7 +320,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
finfo->dirty_len = copied;
folio_attach_private(folio, (void *)((unsigned long)finfo |
NETFS_FOLIO_INFO));
- trace_netfs_folio(folio, netfs_streaming_write);
+ trace = netfs_streaming_write;
goto copied;
}
@@ -326,16 +334,10 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
goto copy_failed;
finfo->dirty_len += copied;
if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
- if (finfo->netfs_group)
- folio_change_private(folio, finfo->netfs_group);
- else
- folio_detach_private(folio);
- folio_mark_uptodate(folio);
- kfree(finfo);
- trace_netfs_folio(folio, netfs_streaming_cont_filled_page);
- } else {
- trace_netfs_folio(folio, netfs_streaming_write_cont);
+ trace = netfs_streaming_cont_filled_page;
+ goto mark_uptodate;
}
+ trace = netfs_streaming_write_cont;
goto copied;
}
@@ -349,7 +351,38 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
goto out;
continue;
+ /* Mark a folio as being up to date when we've filled it
+ * completely. If the folio has a group attached, then it must
+ * be the same group, otherwise we should have flushed it out
+ * above. We have to get rid of the netfs_folio struct if
+ * there was one.
+ */
+ mark_uptodate:
+ folio_mark_uptodate(folio);
+
+ copied_uptodate:
+ priv = folio_get_private(folio);
+ if (likely(priv == netfs_group)) {
+ /* Already set correctly; no change required. */
+ } else if (priv == NETFS_FOLIO_COPY_TO_CACHE) {
+ if (!netfs_group)
+ folio_detach_private(folio);
+ else
+ folio_change_private(folio, netfs_get_group(netfs_group));
+ } else if (!priv) {
+ folio_attach_private(folio, netfs_get_group(netfs_group));
+ } else {
+ WARN_ON_ONCE(!finfo);
+ if (netfs_group)
+ /* finfo->netfs_group has a ref */
+ folio_change_private(folio, netfs_group);
+ else
+ folio_detach_private(folio);
+ kfree(finfo);
+ }
+
copied:
+ trace_netfs_folio(folio, trace);
flush_dcache_folio(folio);
/* Update the inode size if we moved the EOF marker */
@@ -510,6 +543,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
struct inode *inode = file_inode(file);
struct netfs_inode *ictx = netfs_inode(inode);
vm_fault_t ret = VM_FAULT_NOPAGE;
+ void *priv;
int err;
_enter("%lx", folio->index);
@@ -530,7 +564,9 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
}
group = netfs_folio_group(folio);
- if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) {
+ if (group &&
+ group != netfs_group &&
+ group != NETFS_FOLIO_COPY_TO_CACHE) {
folio_unlock(folio);
err = filemap_fdatawrite_range(mapping,
folio_pos(folio),
@@ -552,7 +588,19 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus);
else
trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
- netfs_set_group(folio, netfs_group);
+
+ priv = folio_get_private(folio);
+ if (priv != netfs_group) {
+ if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
+ folio_detach_private(folio);
+ else if (netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
+ folio_change_private(folio, netfs_get_group(netfs_group));
+ else if (netfs_group && !priv)
+ folio_attach_private(folio, netfs_get_group(netfs_group));
+ else
+ WARN_ON_ONCE(1);
+ }
+
file_update_time(file);
set_bit(NETFS_ICTX_MODIFIED_ATTR, &ictx->flags);
if (ictx->ops->post_modify)
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
index f72e6da88cca7..6a8fb0d55e040 100644
--- a/fs/netfs/direct_read.c
+++ b/fs/netfs/direct_read.c
@@ -45,12 +45,11 @@ static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq)
* Perform a read to a buffer from the server, slicing up the region to be read
* according to the network rsize.
*/
-static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
+static void netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
{
- struct netfs_io_stream *stream = &rreq->io_streams[0];
unsigned long long start = rreq->start;
ssize_t size = rreq->len;
- int ret = 0;
+ int ret;
do {
struct netfs_io_subrequest *subreq;
@@ -58,7 +57,10 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
subreq = netfs_alloc_subrequest(rreq);
if (!subreq) {
- ret = -ENOMEM;
+ /* Stash the error in the request if there's not
+ * already an error set.
+ */
+ cmpxchg(&rreq->error, 0, -ENOMEM);
break;
}
@@ -66,25 +68,13 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
subreq->start = start;
subreq->len = size;
- __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
-
- spin_lock(&rreq->lock);
- list_add_tail(&subreq->rreq_link, &stream->subrequests);
- if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
- if (!stream->active) {
- stream->collected_to = subreq->start;
- /* Store list pointers before active flag */
- smp_store_release(&stream->active, true);
- }
- }
- trace_netfs_sreq(subreq, netfs_sreq_trace_added);
- spin_unlock(&rreq->lock);
+ netfs_queue_read(rreq, subreq);
netfs_stat(&netfs_n_rh_download);
if (rreq->netfs_ops->prepare_read) {
ret = rreq->netfs_ops->prepare_read(subreq);
if (ret < 0) {
- netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);
+ netfs_cancel_read(subreq, ret);
break;
}
}
@@ -113,8 +103,6 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
netfs_wake_collector(rreq);
}
-
- return ret;
}
/*
@@ -137,21 +125,17 @@ static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync)
// TODO: Use bounce buffer if requested
inode_dio_begin(rreq->inode);
+ netfs_dispatch_unbuffered_reads(rreq);
- ret = netfs_dispatch_unbuffered_reads(rreq);
-
- if (!rreq->submitted) {
- netfs_put_request(rreq, netfs_rreq_trace_put_no_submit);
- inode_dio_end(rreq->inode);
- ret = 0;
- goto out;
- }
+ /* The collector will get run, even if we don't manage to submit any
+ * subreqs, so we shouldn't call inode_dio_end() here.
+ */
if (sync)
ret = netfs_wait_for_read(rreq);
else
ret = -EIOCBQUEUED;
-out:
+
_leave(" = %zd", ret);
return ret;
}
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index f9ab69de3e298..25f8ceb15fad6 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -376,8 +376,10 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ret < 0)
goto out;
end = iocb->ki_pos + iov_iter_count(from);
- if (end > ictx->zero_point)
- ictx->zero_point = end;
+ spin_lock(&inode->i_lock);
+ if (end > ictx->_zero_point)
+ netfs_write_zero_point(inode, end);
+ spin_unlock(&inode->i_lock);
fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode),
FSCACHE_INVAL_DIO_WRITE);
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index d436e20d34185..645996ecfc803 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -23,6 +23,8 @@
/*
* buffered_read.c
*/
+void netfs_queue_read(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq);
void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error);
int netfs_prefetch_for_write(struct file *file, struct folio *folio,
size_t offset, size_t len);
@@ -108,6 +110,7 @@ static inline void netfs_see_subrequest(struct netfs_io_subrequest *subreq,
*/
bool netfs_read_collection(struct netfs_io_request *rreq);
void netfs_read_collection_worker(struct work_struct *work);
+void netfs_cancel_read(struct netfs_io_subrequest *subreq, int error);
void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error);
/*
diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c
index 154a14bb2d7f7..b375567e0520e 100644
--- a/fs/netfs/iterator.c
+++ b/fs/netfs/iterator.c
@@ -22,7 +22,7 @@
*
* Extract the page fragments from the given amount of the source iterator and
* build up a second iterator that refers to all of those bits. This allows
- * the original iterator to disposed of.
+ * the original iterator to be disposed of.
*
* @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA be
* allowed on the pages extracted.
@@ -43,7 +43,7 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
unsigned int max_pages;
unsigned int npages = 0;
unsigned int i;
- ssize_t ret;
+ ssize_t ret = 0;
size_t count = orig_len, offset, len;
size_t bv_size, pg_size;
@@ -67,26 +67,29 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
ret = iov_iter_extract_pages(orig, &pages, count,
max_pages - npages, extraction_flags,
&offset);
- if (ret < 0) {
- pr_err("Couldn't get user pages (rc=%zd)\n", ret);
+ if (unlikely(ret <= 0)) {
+ ret = ret ?: -EIO;
break;
}
- if (ret > count) {
- pr_err("get_pages rc=%zd more than %zu\n", ret, count);
+ if (WARN(ret > count,
+ "%s: extract_pages overrun %zd > %zu bytes\n",
+ __func__, ret, count)) {
+ ret = -EIO;
break;
}
- count -= ret;
- ret += offset;
- cur_npages = DIV_ROUND_UP(ret, PAGE_SIZE);
-
- if (npages + cur_npages > max_pages) {
- pr_err("Out of bvec array capacity (%u vs %u)\n",
- npages + cur_npages, max_pages);
+ cur_npages = DIV_ROUND_UP(offset + ret, PAGE_SIZE);
+ if (WARN(cur_npages > max_pages - npages,
+ "%s: extract_pages overrun %u > %u pages\n",
+ __func__, npages + cur_npages, max_pages)) {
+ ret = -EIO;
break;
}
+ count -= ret;
+ ret += offset;
+
for (i = 0; i < cur_npages; i++) {
len = ret > PAGE_SIZE ? PAGE_SIZE : ret;
bvec_set_page(bv + npages + i, *pages++, len - offset, offset);
@@ -97,6 +100,18 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
npages += cur_npages;
}
+ /* Note: Don't try to clean up after EIO. Either we got no pages, so
+ * nothing to clean up, or we got a buffer overrun, memory corruption
+ * and can't trust the stuff in the buffer (a WARN was emitted).
+ */
+
+ if (ret < 0 && (ret == -ENOMEM || npages == 0)) {
+ for (i = 0; i < npages; i++)
+ unpin_user_page(bv[i].bv_page);
+ kvfree(bv);
+ return ret;
+ }
+
iov_iter_bvec(new, orig->data_source, bv, npages, orig_len - count);
return npages;
}
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 6df89c92b10b0..5d554512ed23a 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -211,18 +211,25 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback);
void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
struct netfs_folio *finfo;
- struct netfs_inode *ctx = netfs_inode(folio_inode(folio));
+ struct inode *inode = folio_inode(folio);
+ struct netfs_inode *ctx = netfs_inode(inode);
size_t flen = folio_size(folio);
_enter("{%lx},%zx,%zx", folio->index, offset, length);
if (offset == 0 && length == flen) {
- unsigned long long i_size = i_size_read(&ctx->inode);
+ unsigned long long i_size, remote_i_size, zero_point;
unsigned long long fpos = folio_pos(folio), end;
+ netfs_read_sizes(inode, &i_size, &remote_i_size, &zero_point);
end = umin(fpos + flen, i_size);
- if (fpos < i_size && end > ctx->zero_point)
- ctx->zero_point = end;
+ if (fpos < i_size && end > zero_point) {
+ spin_lock(&inode->i_lock);
+ end = umin(fpos + flen, inode->i_size);
+ if (fpos < i_size && end > ctx->_zero_point)
+ netfs_write_zero_point(inode, end);
+ spin_unlock(&inode->i_lock);
+ }
}
folio_wait_private_2(folio); /* [DEPRECATED] */
@@ -255,7 +262,8 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
goto erase_completely;
/* Move the start of the data. */
finfo->dirty_len = fend - iend;
- finfo->dirty_offset = offset;
+ finfo->dirty_offset = iend;
+ trace_netfs_folio(folio, netfs_folio_trace_invalidate_front);
return;
}
@@ -264,12 +272,14 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
*/
if (iend >= fend) {
finfo->dirty_len = offset - fstart;
+ trace_netfs_folio(folio, netfs_folio_trace_invalidate_tail);
return;
}
/* A partial write was split. The caller has already zeroed
* it, so just absorb the hole.
*/
+ trace_netfs_folio(folio, netfs_folio_trace_invalidate_middle);
}
return;
@@ -277,8 +287,9 @@ erase_completely:
netfs_put_group(netfs_folio_group(folio));
folio_detach_private(folio);
folio_clear_uptodate(folio);
+ folio_cancel_dirty(folio);
kfree(finfo);
- return;
+ trace_netfs_folio(folio, netfs_folio_trace_invalidate_all);
}
EXPORT_SYMBOL(netfs_invalidate_folio);
@@ -292,15 +303,22 @@ EXPORT_SYMBOL(netfs_invalidate_folio);
*/
bool netfs_release_folio(struct folio *folio, gfp_t gfp)
{
- struct netfs_inode *ctx = netfs_inode(folio_inode(folio));
- unsigned long long end;
+ struct inode *inode = folio_inode(folio);
+ struct netfs_inode *ctx = netfs_inode(inode);
+ unsigned long long i_size, remote_i_size, zero_point, end;
if (folio_test_dirty(folio))
return false;
- end = umin(folio_next_pos(folio), i_size_read(&ctx->inode));
- if (end > ctx->zero_point)
- ctx->zero_point = end;
+ netfs_read_sizes(inode, &i_size, &remote_i_size, &zero_point);
+ end = folio_next_pos(folio);
+ if (end > zero_point) {
+ spin_lock(&inode->i_lock);
+ end = umin(end, ctx->_remote_i_size);
+ if (end > ctx->_zero_point)
+ netfs_write_zero_point(inode, end);
+ spin_unlock(&inode->i_lock);
+ }
if (folio_test_private(folio))
return false;
@@ -356,6 +374,7 @@ void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq,
DEFINE_WAIT(myself);
list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
+ smp_rmb(); /* Read ->next before IN_PROGRESS. */
if (!netfs_check_subreq_in_progress(subreq))
continue;
diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
index e5f6665b3341e..23660a5901246 100644
--- a/fs/netfs/read_collect.c
+++ b/fs/netfs/read_collect.c
@@ -83,7 +83,7 @@ static void netfs_unlock_read_folio(struct netfs_io_request *rreq,
}
just_unlock:
- if (folio->index == rreq->no_unlock_folio &&
+ if (folio == rreq->no_unlock_folio &&
test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) {
_debug("no unlock");
} else {
@@ -205,8 +205,10 @@ reassess:
* in progress. The issuer thread may be adding stuff to the tail
* whilst we're doing this.
*/
- front = list_first_entry_or_null(&stream->subrequests,
- struct netfs_io_subrequest, rreq_link);
+ front = list_first_entry_or_null_acquire(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ /* Read first subreq pointer before IN_PROGRESS flag. */
+
while (front) {
size_t transferred;
@@ -576,6 +578,17 @@ skip_error_checks:
EXPORT_SYMBOL(netfs_read_subreq_terminated);
/*
+ * Cancel a read subrequest due to preparation failure.
+ */
+void netfs_cancel_read(struct netfs_io_subrequest *subreq, int error)
+{
+ trace_netfs_sreq(subreq, netfs_sreq_trace_cancel);
+ subreq->error = error;
+ __set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ netfs_read_subreq_terminated(subreq);
+}
+
+/*
* Handle termination of a read from the cache.
*/
void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error)
diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c
index cca9ac43c0773..f59a70f3a086b 100644
--- a/fs/netfs/read_retry.c
+++ b/fs/netfs/read_retry.c
@@ -175,7 +175,9 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
list_for_each_entry_safe_from(subreq, tmp,
&stream->subrequests, rreq_link) {
trace_netfs_sreq(subreq, netfs_sreq_trace_superfluous);
+ spin_lock(&rreq->lock);
list_del(&subreq->rreq_link);
+ spin_unlock(&rreq->lock);
netfs_put_subrequest(subreq, netfs_sreq_trace_put_done);
if (subreq == to)
break;
@@ -203,8 +205,10 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
refcount_read(&subreq->ref),
netfs_sreq_trace_new);
+ spin_lock(&rreq->lock);
list_add(&subreq->rreq_link, &to->rreq_link);
- to = list_next_entry(to, rreq_link);
+ spin_unlock(&rreq->lock);
+ to = subreq;
trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
stream->sreq_max_len = umin(len, rreq->rsize);
@@ -288,8 +292,15 @@ void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq)
struct folio *folio = folioq_folio(p, slot);
if (folio && !folioq_is_marked2(p, slot)) {
- trace_netfs_folio(folio, netfs_folio_trace_abandon);
- folio_unlock(folio);
+ if (folio == rreq->no_unlock_folio &&
+ test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO,
+ &rreq->flags)) {
+ _debug("no unlock");
+ } else {
+ trace_netfs_folio(folio,
+ netfs_folio_trace_abandon);
+ folio_unlock(folio);
+ }
}
}
}
diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c
index d0e23bc42445f..8833550d2eb60 100644
--- a/fs/netfs/read_single.c
+++ b/fs/netfs/read_single.c
@@ -89,7 +89,6 @@ static void netfs_single_read_cache(struct netfs_io_request *rreq,
*/
static int netfs_single_dispatch_read(struct netfs_io_request *rreq)
{
- struct netfs_io_stream *stream = &rreq->io_streams[0];
struct netfs_io_subrequest *subreq;
int ret = 0;
@@ -102,14 +101,7 @@ static int netfs_single_dispatch_read(struct netfs_io_request *rreq)
subreq->len = rreq->len;
subreq->io_iter = rreq->buffer.iter;
- __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
-
- spin_lock(&rreq->lock);
- list_add_tail(&subreq->rreq_link, &stream->subrequests);
- trace_netfs_sreq(subreq, netfs_sreq_trace_added);
- /* Store list pointers before active flag */
- smp_store_release(&stream->active, true);
- spin_unlock(&rreq->lock);
+ netfs_queue_read(rreq, subreq);
netfs_single_cache_prepare_read(rreq, subreq);
switch (subreq->source) {
@@ -121,10 +113,14 @@ static int netfs_single_dispatch_read(struct netfs_io_request *rreq)
goto cancel;
}
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
rreq->netfs_ops->issue_read(subreq);
rreq->submitted += subreq->len;
break;
case NETFS_READ_FROM_CACHE:
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
netfs_single_read_cache(rreq, subreq);
rreq->submitted += subreq->len;
@@ -134,14 +130,15 @@ static int netfs_single_dispatch_read(struct netfs_io_request *rreq)
pr_warn("Unexpected single-read source %u\n", subreq->source);
WARN_ON_ONCE(true);
ret = -EIO;
- break;
+ goto cancel;
}
- smp_wmb(); /* Write lists before ALL_QUEUED. */
- set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
return ret;
cancel:
- netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);
+ netfs_cancel_read(subreq, ret);
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
+ netfs_wake_collector(rreq);
return ret;
}
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index b194447f4b111..24fc2bb2f8a47 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -57,7 +57,8 @@ static void netfs_dump_request(const struct netfs_io_request *rreq)
int netfs_folio_written_back(struct folio *folio)
{
enum netfs_folio_trace why = netfs_folio_trace_clear;
- struct netfs_inode *ictx = netfs_inode(folio->mapping->host);
+ struct inode *inode = folio_inode(folio);
+ struct netfs_inode *ictx = netfs_inode(inode);
struct netfs_folio *finfo;
struct netfs_group *group = NULL;
int gcount = 0;
@@ -69,8 +70,10 @@ int netfs_folio_written_back(struct folio *folio)
unsigned long long fend;
fend = folio_pos(folio) + finfo->dirty_offset + finfo->dirty_len;
- if (fend > ictx->zero_point)
- ictx->zero_point = fend;
+ spin_lock(&ictx->inode.i_lock);
+ if (fend > ictx->_zero_point)
+ netfs_write_zero_point(inode, fend);
+ spin_unlock(&ictx->inode.i_lock);
folio_detach_private(folio);
group = finfo->netfs_group;
@@ -228,8 +231,10 @@ reassess_streams:
if (!smp_load_acquire(&stream->active))
continue;
- front = list_first_entry_or_null(&stream->subrequests,
- struct netfs_io_subrequest, rreq_link);
+ front = list_first_entry_or_null_acquire(&stream->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ /* Read first subreq pointer before IN_PROGRESS flag. */
+
while (front) {
trace_netfs_collect_sreq(wreq, front);
//_debug("sreq [%x] %llx %zx/%zx",
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index 2db688f941251..c03c7cc45e471 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -204,7 +204,8 @@ void netfs_prepare_write(struct netfs_io_request *wreq,
* remove entries off of the front.
*/
spin_lock(&wreq->lock);
- list_add_tail(&subreq->rreq_link, &stream->subrequests);
+ /* Write IN_PROGRESS before pointer to new subreq */
+ list_add_tail_release(&subreq->rreq_link, &stream->subrequests);
if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
if (!stream->active) {
stream->collected_to = subreq->start;
@@ -413,12 +414,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
if (streamw)
netfs_issue_write(wreq, cache);
- /* Flip the page to the writeback state and unlock. If we're called
- * from write-through, then the page has already been put into the wb
- * state.
- */
- if (wreq->origin == NETFS_WRITEBACK)
- folio_start_writeback(folio);
+ folio_start_writeback(folio);
folio_unlock(folio);
if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
@@ -646,29 +642,41 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c
struct folio *folio, size_t copied, bool to_page_end,
struct folio **writethrough_cache)
{
+ int ret;
+
_enter("R=%x ic=%zu ws=%u cp=%zu tp=%u",
wreq->debug_id, wreq->buffer.iter.count, wreq->wsize, copied, to_page_end);
- if (!*writethrough_cache) {
- if (folio_test_dirty(folio))
- /* Sigh. mmap. */
- folio_clear_dirty_for_io(folio);
+ /* The folio is locked. */
+ if (*writethrough_cache != folio) {
+ if (*writethrough_cache) {
+ /* Did the folio get moved? */
+ folio_put(*writethrough_cache);
+ *writethrough_cache = NULL;
+ }
/* We can make multiple writes to the folio... */
- folio_start_writeback(folio);
if (wreq->len == 0)
trace_netfs_folio(folio, netfs_folio_trace_wthru);
else
trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
*writethrough_cache = folio;
+ folio_get(folio);
}
wreq->len += copied;
- if (!to_page_end)
+
+ if (!to_page_end) {
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
return 0;
+ }
+ ret = netfs_write_folio(wreq, wbc, folio);
+ folio_put(*writethrough_cache);
*writethrough_cache = NULL;
- return netfs_write_folio(wreq, wbc, folio);
+ wreq->submitted = wreq->len;
+ return ret;
}
/*
@@ -682,8 +690,12 @@ ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_c
_enter("R=%x", wreq->debug_id);
- if (writethrough_cache)
+ if (writethrough_cache) {
+ folio_lock(writethrough_cache);
netfs_write_folio(wreq, wbc, writethrough_cache);
+ folio_put(writethrough_cache);
+ wreq->submitted = wreq->len;
+ }
netfs_end_issue_write(wreq);
@@ -818,6 +830,9 @@ static int netfs_write_folio_single(struct netfs_io_request *wreq,
*
* Write a monolithic, non-pagecache object back to the server and/or
* the cache.
+ *
+ * Return: 0 if successful; 1 if skipped due to lock conflict and WB_SYNC_NONE;
+ * or a negative error code.
*/
int netfs_writeback_single(struct address_space *mapping,
struct writeback_control *wbc,
@@ -834,8 +849,10 @@ int netfs_writeback_single(struct address_space *mapping,
if (!mutex_trylock(&ictx->wb_lock)) {
if (wbc->sync_mode == WB_SYNC_NONE) {
+ /* The VFS will have undirtied the inode. */
+ netfs_single_mark_inode_dirty(&ictx->inode);
netfs_stat(&netfs_n_wb_lock_skip);
- return 0;
+ return 1;
}
netfs_stat(&netfs_n_wb_lock_wait);
mutex_lock(&ictx->wb_lock);
diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c
index 29489a23a2209..32735abfa03f0 100644
--- a/fs/netfs/write_retry.c
+++ b/fs/netfs/write_retry.c
@@ -130,7 +130,9 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
list_for_each_entry_safe_from(subreq, tmp,
&stream->subrequests, rreq_link) {
trace_netfs_sreq(subreq, netfs_sreq_trace_discard);
+ spin_lock(&wreq->lock);
list_del(&subreq->rreq_link);
+ spin_unlock(&wreq->lock);
netfs_put_subrequest(subreq, netfs_sreq_trace_put_done);
if (subreq == to)
break;
@@ -153,8 +155,10 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
netfs_sreq_trace_new);
trace_netfs_sreq(subreq, netfs_sreq_trace_split);
+ spin_lock(&wreq->lock);
list_add(&subreq->rreq_link, &to->rreq_link);
- to = list_next_entry(to, rreq_link);
+ spin_unlock(&wreq->lock);
+ to = subreq;
trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
stream->sreq_max_len = len;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 85e94c30285a2..ab39ec8854405 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1413,6 +1413,9 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dst, clone->cl_dst_pos, clone->cl_count,
EX_ISSYNC(cstate->current_fh.fh_export));
+ if (!status && (READ_ONCE(dst->nf_file->f_mode) & FMODE_NOCMTIME) != 0)
+ nfsd_update_cmtime_attr(dst->nf_file, 0);
+
nfsd_file_put(dst);
nfsd_file_put(src);
out:
@@ -2118,8 +2121,10 @@ do_callback:
set_bit(NFSD4_COPY_F_COMPLETED, &copy->cp_flags);
trace_nfsd_copy_async_done(copy);
- nfsd4_send_cb_offload(copy);
atomic_dec(&copy->cp_nn->pending_async_copies);
+ if (copy->cp_res.wr_bytes_written > 0 && copy->attr_update)
+ nfsd_update_cmtime_attr(copy->nf_dst->nf_file, 0);
+ nfsd4_send_cb_offload(copy);
return 0;
}
@@ -2179,6 +2184,9 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
memcpy(&result->cb_stateid, &copy->cp_stateid.cs_stid,
sizeof(result->cb_stateid));
dup_copy_fields(copy, async_copy);
+ if ((READ_ONCE(copy->nf_dst->nf_file->f_mode) &
+ FMODE_NOCMTIME) != 0)
+ async_copy->attr_update = true;
memcpy(async_copy->cp_cb_offload.co_referring_sessionid.data,
cstate->session->se_sessionid.data,
NFS4_MAX_SESSIONID_LEN);
@@ -2197,6 +2205,10 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
} else {
status = nfsd4_do_copy(copy, copy->nf_src->nf_file,
copy->nf_dst->nf_file, true);
+ if ((READ_ONCE(copy->nf_dst->nf_file->f_mode) &
+ FMODE_NOCMTIME) != 0 &&
+ copy->cp_res.wr_bytes_written > 0)
+ nfsd_update_cmtime_attr(copy->nf_dst->nf_file, 0);
}
out:
trace_nfsd_copy_done(copy, status);
@@ -2535,10 +2547,6 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp,
dd = nfsd_get_dir_deleg(cstate, gdd, nf);
nfsd_file_put(nf);
if (IS_ERR(dd)) {
- int err = PTR_ERR(dd);
-
- if (err != -EAGAIN)
- return nfserrno(err);
gdd->gddrnf_status = GDD4_UNAVAIL;
return nfs_ok;
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c2d13b26a6876..6837b63d98645 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1221,10 +1221,6 @@ static void put_deleg_file(struct nfs4_file *fp)
static void nfsd4_finalize_deleg_timestamps(struct nfs4_delegation *dp, struct file *f)
{
- struct iattr ia = { .ia_valid = ATTR_ATIME | ATTR_CTIME | ATTR_MTIME | ATTR_DELEG };
- struct inode *inode = file_inode(f);
- int ret;
-
/* don't do anything if FMODE_NOCMTIME isn't set */
if ((READ_ONCE(f->f_mode) & FMODE_NOCMTIME) == 0)
return;
@@ -1242,17 +1238,7 @@ static void nfsd4_finalize_deleg_timestamps(struct nfs4_delegation *dp, struct f
return;
/* Stamp everything to "now" */
- inode_lock(inode);
- ret = notify_change(&nop_mnt_idmap, f->f_path.dentry, &ia, NULL);
- inode_unlock(inode);
- if (ret) {
- struct inode *inode = file_inode(f);
-
- pr_notice_ratelimited("nfsd: Unable to update timestamps on inode %02x:%02x:%llu: %d\n",
- MAJOR(inode->i_sb->s_dev),
- MINOR(inode->i_sb->s_dev),
- inode->i_ino, ret);
- }
+ nfsd_update_cmtime_attr(f, ATTR_ATIME);
}
static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp)
@@ -1865,6 +1851,13 @@ void nfsd4_revoke_states(struct nfsd_net *nn, struct super_block *sb)
break;
case SC_TYPE_LAYOUT:
ls = layoutstateid(stid);
+ spin_lock(&clp->cl_lock);
+ if (stid->sc_status == 0) {
+ stid->sc_status |=
+ SC_STATUS_ADMIN_REVOKED;
+ atomic_inc(&clp->cl_admin_revoked);
+ }
+ spin_unlock(&clp->cl_lock);
nfsd4_close_layout(ls);
break;
}
@@ -6378,7 +6371,6 @@ nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open,
}
open->op_delegate_type = deleg_ts ? OPEN_DELEGATE_WRITE_ATTRS_DELEG :
OPEN_DELEGATE_WRITE;
- dp->dl_cb_fattr.ncf_cur_fsize = stat.size;
dp->dl_cb_fattr.ncf_initial_cinfo = nfsd4_change_attribute(&stat);
dp->dl_atime = stat.atime;
dp->dl_ctime = stat.ctime;
@@ -9429,11 +9421,15 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry,
if (status != nfserr_jukebox ||
!nfsd_wait_for_delegreturn(rqstp, inode))
goto out_status;
+ status = nfs_ok;
+ goto out_status;
+ }
+ if (!ncf->ncf_file_modified) {
+ if (ncf->ncf_initial_cinfo != ncf->ncf_cb_change)
+ ncf->ncf_file_modified = true;
+ else if (i_size_read(inode) != ncf->ncf_cb_fsize)
+ ncf->ncf_file_modified = true;
}
- if (!ncf->ncf_file_modified &&
- (ncf->ncf_initial_cinfo != ncf->ncf_cb_change ||
- ncf->ncf_cur_fsize != ncf->ncf_cb_fsize))
- ncf->ncf_file_modified = true;
if (ncf->ncf_file_modified) {
int err;
@@ -9560,3 +9556,31 @@ out_delegees:
put_nfs4_file(fp);
return ERR_PTR(status);
}
+
+/**
+ * nfsd_update_cmtime_attr - update file's delegated ctime/mtime,
+ * and optionally other attributes (e.g. ATTR_ATIME).
+ * @f: pointer to an opened file
+ * @flags: any additional flags that should be updated
+ *
+ * Given that delegated attributes were issued when the file was opened,
+ * update @f's attributes to the current time.
+ */
+void nfsd_update_cmtime_attr(struct file *f, unsigned int flags)
+{
+ int ret;
+ struct inode *inode = file_inode(f);
+ struct iattr attr = {
+ .ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_DELEG | flags,
+ };
+
+ inode_lock(inode);
+ ret = notify_change(&nop_mnt_idmap, f->f_path.dentry, &attr, NULL);
+ inode_unlock(inode);
+ if (ret)
+ pr_notice_ratelimited("nfsd: Unable to update timestamps on "
+ "inode %02x:%02x:%llu: %d\n",
+ MAJOR(inode->i_sb->s_dev),
+ MINOR(inode->i_sb->s_dev),
+ inode->i_ino, ret);
+}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 953675eba5c36..c5ccea64c2817 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -843,6 +843,7 @@ extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
void nfsd4_put_client(struct nfs4_client *clp);
void nfsd4_async_copy_reaper(struct nfsd_net *nn);
bool nfsd4_has_active_async_copies(struct nfs4_client *clp);
+void nfsd_update_cmtime_attr(struct file *f, unsigned int flags);
extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name,
struct xdr_netobj princhash, struct nfsd_net *nn);
extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 417e9ad9fbb39..9a4124c77e049 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -752,6 +752,7 @@ struct nfsd4_copy {
struct nfsd_file *nf_src;
struct nfsd_file *nf_dst;
+ bool attr_update;
copy_stateid_t cp_stateid;
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index bfe884d624e7b..38290b9c07f7b 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -457,7 +457,7 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
/*
* Unlike file_handle, type and len of struct fanotify_fh are u8.
* Traditionally, filesystem return handle_type < 0xff, but there
- * is no enforecement for that in vfs.
+ * is no enforcement for that in vfs.
*/
BUILD_BUG_ON(MAX_HANDLE_SZ > 0xff || FILEID_INVALID > 0xff);
if (type <= 0 || type >= FILEID_INVALID || fh_len != dwords << 2)
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index b7198c4744e3a..2dac70b99b0d7 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -388,7 +388,7 @@ static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector
return hlist_entry_safe(node, struct fsnotify_mark, obj_list);
}
-static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark)
+struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark)
{
struct hlist_node *node = NULL;
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index c2ed5b11b0fe6..e256b420100dc 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -238,7 +238,12 @@ static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn,
return inode;
}
-static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
+/*
+ * Calculate mask of events for a list of marks.
+ *
+ * Return true if any of the attached marks want to hold an inode reference.
+ */
+static bool __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
u32 new_mask = 0;
bool want_iref = false;
@@ -262,6 +267,34 @@ static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
*/
WRITE_ONCE(*fsnotify_conn_mask_p(conn), new_mask);
+ return want_iref;
+}
+
+/*
+ * Calculate mask of events for a list of marks after attach/modify mark
+ * and get an inode reference for the connector if needed.
+ *
+ * A concurrent add of evictable mark and detach of non-evictable mark can
+ * lead to __fsnotify_recalc_mask() returning false want_iref, but in this
+ * case we defer clearing iref to fsnotify_recalc_mask_clear_iref() called
+ * from fsnotify_put_mark().
+ */
+static void fsnotify_recalc_mask_set_iref(struct fsnotify_mark_connector *conn)
+{
+ bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF;
+ bool want_iref = __fsnotify_recalc_mask(conn) || has_iref;
+
+ (void) fsnotify_update_iref(conn, want_iref);
+}
+
+/*
+ * Calculate mask of events for a list of marks after detach mark
+ * and return the inode object if its reference is no longer needed.
+ */
+static void *fsnotify_recalc_mask_clear_iref(struct fsnotify_mark_connector *conn)
+{
+ bool want_iref = __fsnotify_recalc_mask(conn);
+
return fsnotify_update_iref(conn, want_iref);
}
@@ -298,7 +331,7 @@ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
spin_lock(&conn->lock);
update_children = !fsnotify_conn_watches_children(conn);
- __fsnotify_recalc_mask(conn);
+ fsnotify_recalc_mask_set_iref(conn);
update_children &= fsnotify_conn_watches_children(conn);
spin_unlock(&conn->lock);
/*
@@ -419,7 +452,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
/* Update watched objects after detaching mark */
if (sb)
fsnotify_update_sb_watchers(sb, conn);
- objp = __fsnotify_recalc_mask(conn);
+ objp = fsnotify_recalc_mask_clear_iref(conn);
type = conn->type;
}
WRITE_ONCE(mark->connector, NULL);
@@ -457,9 +490,6 @@ EXPORT_SYMBOL_GPL(fsnotify_put_mark);
*/
static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark)
{
- if (!mark)
- return true;
-
if (refcount_inc_not_zero(&mark->refcnt)) {
spin_lock(&mark->lock);
if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) {
@@ -500,15 +530,22 @@ bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info)
int type;
fsnotify_foreach_iter_type(type) {
+ struct fsnotify_mark *mark = iter_info->marks[type];
+
/* This can fail if mark is being removed */
- if (!fsnotify_get_mark_safe(iter_info->marks[type])) {
- __release(&fsnotify_mark_srcu);
- goto fail;
+ while (mark && !fsnotify_get_mark_safe(mark)) {
+ if (mark->group == iter_info->current_group) {
+ __release(&fsnotify_mark_srcu);
+ goto fail;
+ }
+ /* This is a mark in an unrelated group, skip */
+ mark = fsnotify_next_mark(mark);
+ iter_info->marks[type] = mark;
}
}
/*
- * Now that both marks are pinned by refcount in the inode / vfsmount
+ * Now that all marks are pinned by refcount in the inode / vfsmount / etc
* lists, we can drop SRCU lock, and safely resume the list iteration
* once userspace returns.
*/
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 51e8c9430477b..160018c4fb362 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -266,7 +266,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
else
tsk = find_task_by_pid_ns(arg, pid_ns);
if (!tsk)
- break;
+ return ret;
switch (ioctl) {
case NS_GET_PID_FROM_PIDNS:
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 97b660eaa00c1..421c6cdcbb530 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -583,24 +583,13 @@ static u32 ntfs_resident_attr_min_value_length(const __le32 type)
case AT_STANDARD_INFORMATION:
return offsetof(struct standard_information, ver) +
sizeof(((struct standard_information *)0)->ver.v1.reserved12);
- case AT_ATTRIBUTE_LIST:
- return offsetof(struct attr_list_entry, name);
case AT_FILE_NAME:
- return offsetof(struct file_name_attr, file_name);
- case AT_OBJECT_ID:
- return sizeof(struct guid);
- case AT_SECURITY_DESCRIPTOR:
- return sizeof(struct security_descriptor_relative);
+ return offsetof(struct file_name_attr, file_name) +
+ sizeof(__le16) * 1;
case AT_VOLUME_INFORMATION:
return sizeof(struct volume_information);
- case AT_INDEX_ROOT:
- return sizeof(struct index_root);
- case AT_REPARSE_POINT:
- return offsetof(struct reparse_point, reparse_data);
case AT_EA_INFORMATION:
return sizeof(struct ea_information);
- case AT_EA:
- return offsetof(struct ea_attr, ea_name) + 1;
default:
return 0;
}
@@ -672,6 +661,9 @@ static int ntfs_attr_find(const __le32 type, const __le16 *name,
__le16 *upcase = vol->upcase;
u32 upcase_len = vol->upcase_len;
unsigned int space;
+ u16 name_offset;
+ u32 attr_len;
+ u32 name_size;
/*
* Iterate over attributes in mft record starting at @ctx->attr, or the
@@ -699,6 +691,20 @@ static int ntfs_attr_find(const __le32 type, const __le16 *name,
return -ENOENT;
if (unlikely(!a->length))
break;
+ if (a->name_length) {
+ name_offset = le16_to_cpu(a->name_offset);
+ attr_len = le32_to_cpu(a->length);
+ name_size = a->name_length * sizeof(__le16);
+
+ if (name_offset > attr_len ||
+ attr_len - name_offset < name_size) {
+ ntfs_error(vol->sb,
+ "Corrupt attribute name in MFT record %llu\n",
+ ctx->ntfs_ino->mft_no);
+ break;
+ }
+ }
+
if (type == AT_UNUSED)
return 0;
if (a->type != type)
@@ -712,14 +718,6 @@ static int ntfs_attr_find(const __le32 type, const __le16 *name,
if (a->name_length)
return -ENOENT;
} else {
- if (a->name_length && ((le16_to_cpu(a->name_offset) +
- a->name_length * sizeof(__le16)) >
- le32_to_cpu(a->length))) {
- ntfs_error(vol->sb, "Corrupt attribute name in MFT record %llu\n",
- ctx->ntfs_ino->mft_no);
- break;
- }
-
if (!ntfs_are_names_equal(name, name_len,
(__le16 *)((u8 *)a + le16_to_cpu(a->name_offset)),
a->name_length, ic, upcase, upcase_len)) {
@@ -2924,12 +2922,12 @@ int ntfs_attr_open(struct ntfs_inode *ni, const __le32 type,
struct ntfs_inode *base_ni;
int err;
- ntfs_debug("Entering for inode %lld, attr 0x%x.\n",
- (unsigned long long)ni->mft_no, type);
-
if (!ni || !ni->vol)
return -EINVAL;
+ ntfs_debug("Entering for inode %lld, attr 0x%x.\n",
+ ni->mft_no, type);
+
if (NInoAttr(ni))
base_ni = ni->ext.base_ntfs_ino;
else
diff --git a/fs/ntfs/attrlist.c b/fs/ntfs/attrlist.c
index bd501e8a628c4..c2594d4c83b06 100644
--- a/fs/ntfs/attrlist.c
+++ b/fs/ntfs/attrlist.c
@@ -119,15 +119,14 @@ int ntfs_attrlist_entry_add(struct ntfs_inode *ni, struct attr_record *attr)
struct mft_record *ni_mrec;
u8 *old_al;
- ntfs_debug("Entering for inode 0x%llx, attr 0x%x.\n",
- (long long) ni->mft_no,
- (unsigned int) le32_to_cpu(attr->type));
-
if (!ni || !attr) {
ntfs_debug("Invalid arguments.\n");
return -EINVAL;
}
+ ntfs_debug("Entering for inode 0x%llx, attr 0x%x.\n",
+ ni->mft_no, (unsigned int) le32_to_cpu(attr->type));
+
ni_mrec = map_mft_record(ni);
if (IS_ERR(ni_mrec)) {
ntfs_debug("Invalid arguments.\n");
diff --git a/fs/ntfs/bdev-io.c b/fs/ntfs/bdev-io.c
index 67e65c88d6818..27d7c2767a33f 100644
--- a/fs/ntfs/bdev-io.c
+++ b/fs/ntfs/bdev-io.c
@@ -97,6 +97,8 @@ int ntfs_bdev_write(struct super_block *sb, void *buf, loff_t start, size_t size
idx_end++;
for (; idx < idx_end; idx++, from = 0) {
+ u32 len;
+
folio = read_mapping_folio(sb->s_bdev->bd_mapping, idx, NULL);
if (IS_ERR(folio)) {
ntfs_error(sb, "Unable to read %ld page", idx);
@@ -105,9 +107,10 @@ int ntfs_bdev_write(struct super_block *sb, void *buf, loff_t start, size_t size
offset = (loff_t)idx << PAGE_SHIFT;
to = min_t(u32, end - offset, PAGE_SIZE);
+ len = to - from;
- memcpy_to_folio(folio, from, buf + buf_off, to);
- buf_off += to;
+ memcpy_to_folio(folio, from, buf + buf_off, len);
+ buf_off += len;
folio_mark_uptodate(folio);
folio_mark_dirty(folio);
folio_put(folio);
diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c
index 656d802333e35..b1436b3151b98 100644
--- a/fs/ntfs/bitmap.c
+++ b/fs/ntfs/bitmap.c
@@ -125,7 +125,7 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
struct address_space *mapping;
struct folio *folio;
u8 *kaddr;
- int pos, len;
+ int pos, len, err;
u8 bit;
struct ntfs_inode *ni = NTFS_I(vi);
struct ntfs_volume *vol = ni->vol;
@@ -201,8 +201,10 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
/* If we are not in the last page, deal with all subsequent pages. */
while (index < end_index) {
- if (cnt <= 0)
+ if (cnt <= 0) {
+ err = -EIO;
goto rollback;
+ }
/* Update @index and get the next folio. */
folio_mark_dirty(folio);
@@ -214,6 +216,7 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
ntfs_error(vi->i_sb,
"Failed to map subsequent page (error %li), aborting.",
PTR_ERR(folio));
+ err = PTR_ERR(folio);
goto rollback;
}
@@ -265,7 +268,7 @@ rollback:
* - @count - @cnt is the number of bits that have been modified
*/
if (is_rollback)
- return PTR_ERR(folio);
+ return err;
if (count != cnt)
pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt,
value ? 0 : 1, true);
@@ -274,14 +277,14 @@ rollback:
if (!pos) {
/* Rollback was successful. */
ntfs_error(vi->i_sb,
- "Failed to map subsequent page (error %li), aborting.",
- PTR_ERR(folio));
+ "Failed to map subsequent page (error %i), aborting.",
+ err);
} else {
/* Rollback failed. */
ntfs_error(vi->i_sb,
- "Failed to map subsequent page (error %li) and rollback failed (error %i). Aborting and leaving inconsistent metadata. Unmount and run chkdsk.",
- PTR_ERR(folio), pos);
+ "Failed to map subsequent page (error %i) and rollback failed (error %i). Aborting and leaving inconsistent metadata. Unmount and run chkdsk.",
+ err, pos);
NVolSetErrors(NTFS_SB(vi->i_sb));
}
- return PTR_ERR(folio);
+ return err;
}
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index bfa904d2ce665..20f5c7074bdd1 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -911,8 +911,8 @@ static int ntfs_readdir(struct file *file, struct dir_context *actor)
if (next->flags & INDEX_ENTRY_NODE) {
next = ntfs_index_walk_down(next, ictx);
- if (!next) {
- err = -EIO;
+ if (IS_ERR(next)) {
+ err = PTR_ERR(next);
goto out;
}
}
@@ -920,7 +920,14 @@ static int ntfs_readdir(struct file *file, struct dir_context *actor)
if (next && !(next->flags & INDEX_ENTRY_END))
goto nextdir;
- while ((next = ntfs_index_next(next, ictx)) != NULL) {
+ while (1) {
+ next = ntfs_index_next(next, ictx);
+ if (IS_ERR(next)) {
+ err = PTR_ERR(next);
+ goto out;
+ }
+ if (!next)
+ break;
nextdir:
/* Check the consistency of an index entry */
if (ntfs_index_entry_inconsistent(ictx, vol, next, COLLATION_FILE_NAME,
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 2080f39691378..146e011c1a418 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -677,11 +677,11 @@ static int ntfs_ib_read(struct ntfs_index_context *icx, s64 vcn, struct index_bl
static int ntfs_icx_parent_inc(struct ntfs_index_context *icx)
{
- icx->pindex++;
- if (icx->pindex >= MAX_PARENT_VCN) {
+ if (icx->pindex >= MAX_PARENT_VCN - 1) {
ntfs_error(icx->idx_ni->vol->sb, "Index is over %d level deep", MAX_PARENT_VCN);
return -EOPNOTSUPP;
}
+ icx->pindex++;
return 0;
}
@@ -1969,20 +1969,31 @@ err_out:
struct index_entry *ntfs_index_walk_down(struct index_entry *ie, struct ntfs_index_context *ictx)
{
struct index_entry *entry;
+ struct index_block *ib;
+ int err;
s64 vcn;
entry = ie;
do {
vcn = ntfs_ie_get_vcn(entry);
if (ictx->is_in_root) {
- /* down from level zero */
- ictx->ir = NULL;
- ictx->ib = kvzalloc(ictx->block_size, GFP_NOFS);
+ ib = kvzalloc(ictx->block_size, GFP_NOFS);
+ if (!ib)
+ return ERR_PTR(-ENOMEM);
+ /*
+ * Descending from root index (level 0) to the first
+ * child level. is_in_root == true implies pindex == 0,
+ * so advance to level 1.
+ */
ictx->pindex = 1;
+ ictx->ir = NULL;
+ ictx->ib = ib;
ictx->is_in_root = false;
} else {
/* down from non-zero level */
- ictx->pindex++;
+ err = ntfs_icx_parent_inc(ictx);
+ if (err)
+ return ERR_PTR(err);
}
ictx->parent_pos[ictx->pindex] = 0;
@@ -1991,8 +2002,8 @@ struct index_entry *ntfs_index_walk_down(struct index_entry *ie, struct ntfs_ind
ictx->entry = ntfs_ie_get_first(&ictx->ib->index);
entry = ictx->entry;
} else
- entry = NULL;
- } while (entry && (entry->flags & INDEX_ENTRY_NODE));
+ entry = ERR_PTR(-EIO);
+ } while (!IS_ERR(entry) && (entry->flags & INDEX_ENTRY_NODE));
return entry;
}
@@ -2097,10 +2108,15 @@ struct index_entry *ntfs_index_next(struct index_entry *ie, struct ntfs_index_co
/* walk down if it has a subnode */
if (flags & INDEX_ENTRY_NODE) {
- if (!ictx->ia_ni)
+ if (!ictx->ia_ni) {
ictx->ia_ni = ntfs_ia_open(ictx, ictx->idx_ni);
+ if (!ictx->ia_ni)
+ return ERR_PTR(-EIO);
+ }
next = ntfs_index_walk_down(next, ictx);
+ if (IS_ERR(next))
+ return next;
} else {
/* walk up if it has no subnode, nor data */
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 16890d411194d..360bebd1ee3fe 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2582,8 +2582,8 @@ int ntfs_inode_sync_filename(struct ntfs_inode *ni)
mutex_lock_nested(&index_ni->mrec_lock, NTFS_INODE_MUTEX_PARENT);
if (NInoBeingDeleted(ni)) {
- iput(index_vi);
mutex_unlock(&index_ni->mrec_lock);
+ iput(index_vi);
continue;
}
@@ -2591,8 +2591,8 @@ int ntfs_inode_sync_filename(struct ntfs_inode *ni)
if (!ictx) {
ntfs_error(sb, "Failed to get index ctx, inode %llu",
index_ni->mft_no);
- iput(index_vi);
mutex_unlock(&index_ni->mrec_lock);
+ iput(index_vi);
continue;
}
@@ -2601,8 +2601,8 @@ int ntfs_inode_sync_filename(struct ntfs_inode *ni)
ntfs_debug("Index lookup failed, inode %llu",
index_ni->mft_no);
ntfs_index_ctx_put(ictx);
- iput(index_vi);
mutex_unlock(&index_ni->mrec_lock);
+ iput(index_vi);
continue;
}
/* Update flags and file size. */
diff --git a/fs/ntfs/iomap.c b/fs/ntfs/iomap.c
index 74a4d3e971f4d..dc7d8c893a699 100644
--- a/fs/ntfs/iomap.c
+++ b/fs/ntfs/iomap.c
@@ -788,8 +788,7 @@ static int ntfs_write_iomap_end_resident(struct inode *inode, loff_t pos,
ctx = ntfs_attr_get_search_ctx(ni, NULL);
if (!ctx) {
written = -ENOMEM;
- mutex_unlock(&ni->mrec_lock);
- return written;
+ goto err_out;
}
err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
@@ -810,7 +809,8 @@ static int ntfs_write_iomap_end_resident(struct inode *inode, loff_t pos,
memcpy(kattr + pos, iomap_inline_data(iomap, pos), written);
mark_mft_record_dirty(ctx->ntfs_ino);
err_out:
- ntfs_attr_put_search_ctx(ctx);
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
put_page(ipage);
mutex_unlock(&ni->mrec_lock);
return written;
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 3f8d1640f1d50..d3f25d8e29f9d 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -710,6 +710,9 @@ map_vcn:
if (unlikely(lcn == LCN_RL_NOT_MAPPED)) {
vcn = rl->vcn;
kvfree(empty_buf);
+ empty_buf = NULL;
+ kfree(ra);
+ ra = NULL;
goto map_vcn;
}
/* If this run is not valid abort with an error. */
@@ -753,7 +756,7 @@ map_vcn:
} while (start < end);
} while ((++rl)->vcn < end_vcn);
up_write(&log_ni->runlist.lock);
- kfree(empty_buf);
+ kvfree(empty_buf);
kfree(ra);
truncate_inode_pages(log_vi->i_mapping, 0);
/* Set the flag so we do not have to do it again on remount. */
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 7d989267a82b9..a7d10ee41b344 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -30,6 +30,8 @@ int ntfs_mft_record_check(const struct ntfs_volume *vol, struct mft_record *m,
{
struct attr_record *a;
struct super_block *sb = vol->sb;
+ u16 attrs_offset;
+ u32 bytes_in_use;
if (!ntfs_is_file_record(m->magic)) {
ntfs_error(sb, "Record %llu has no FILE magic (0x%x)\n",
@@ -65,7 +67,16 @@ int ntfs_mft_record_check(const struct ntfs_volume *vol, struct mft_record *m,
goto err_out;
}
- a = (struct attr_record *)((char *)m + le16_to_cpu(m->attrs_offset));
+ attrs_offset = le16_to_cpu(m->attrs_offset);
+ bytes_in_use = le32_to_cpu(m->bytes_in_use);
+
+ if (attrs_offset > bytes_in_use ||
+ bytes_in_use - attrs_offset < sizeof_field(struct attr_record, type)) {
+ ntfs_error(sb, "Record %llu has corrupt attribute offset\n", mft_no);
+ goto err_out;
+ }
+
+ a = (struct attr_record *)((char *)m + attrs_offset);
if ((char *)a < (char *)m || (char *)a > (char *)m + vol->mft_record_size) {
ntfs_error(sb, "Record %llu is corrupt\n", mft_no);
goto err_out;
@@ -449,7 +460,7 @@ static void ntfs_bio_end_io(struct bio *bio)
int ntfs_sync_mft_mirror(struct ntfs_volume *vol, const u64 mft_no,
struct mft_record *m)
{
- u8 *kmirr = NULL;
+ u8 *kmirr;
struct folio *folio;
unsigned int folio_ofs, lcn_folio_off = 0;
int err = 0;
@@ -479,6 +490,7 @@ int ntfs_sync_mft_mirror(struct ntfs_volume *vol, const u64 mft_no,
kmirr = kmap_local_folio(folio, 0) + folio_ofs;
/* Copy the mst protected mft record to the mirror. */
memcpy(kmirr, m, vol->mft_record_size);
+ kunmap_local(kmirr);
if (vol->cluster_size_bits > PAGE_SHIFT) {
lcn_folio_off = folio->index << PAGE_SHIFT;
@@ -490,20 +502,22 @@ int ntfs_sync_mft_mirror(struct ntfs_volume *vol, const u64 mft_no,
NTFS_B_TO_SECTOR(vol, NTFS_CLU_TO_B(vol, vol->mftmirr_lcn) +
lcn_folio_off + folio_ofs);
- if (!bio_add_folio(bio, folio, vol->mft_record_size, folio_ofs)) {
+ if (bio_add_folio(bio, folio, vol->mft_record_size, folio_ofs))
+ err = submit_bio_wait(bio);
+ else
err = -EIO;
- bio_put(bio);
- goto unlock_folio;
- }
+ bio_put(bio);
- bio->bi_end_io = ntfs_bio_end_io;
- submit_bio(bio);
- /* Current state: all buffers are clean, unlocked, and uptodate. */
+ /*
+ * The in-memory mirror is now valid because we just memcpy()'d the
+ * mst-protected mft record into it. Mark the folio uptodate even on
+ * write error so a subsequent read_mapping_folio() does not refetch
+ * the stale on-disk mirror and overwrite this copy. The error is
+ * propagated to the caller via @err.
+ */
folio_mark_uptodate(folio);
-unlock_folio:
folio_unlock(folio);
- kunmap_local(kmirr);
folio_put(folio);
if (likely(!err)) {
ntfs_debug("Done.");
@@ -588,20 +602,36 @@ int write_mft_record_nolock(struct ntfs_inode *ni, struct mft_record *m, int syn
}
/* Synchronize the mft mirror now if not @sync. */
- if (!sync && ni->mft_no < vol->mftmirr_size)
- ntfs_sync_mft_mirror(vol, ni->mft_no, fixup_m);
+ if (!sync && ni->mft_no < vol->mftmirr_size) {
+ int sub_err = ntfs_sync_mft_mirror(vol, ni->mft_no,
+ fixup_m);
+ if (unlikely(sub_err) && !err)
+ err = sub_err;
+ }
- folio_get(folio);
- bio->bi_private = folio;
- bio->bi_end_io = ntfs_bio_end_io;
- submit_bio(bio);
+ if (sync) {
+ int sub_err = submit_bio_wait(bio);
+
+ bio_put(bio);
+ if (unlikely(sub_err) && !err)
+ err = sub_err;
+ } else {
+ folio_get(folio);
+ bio->bi_private = folio;
+ bio->bi_end_io = ntfs_bio_end_io;
+ submit_bio(bio);
+ }
offset += vol->cluster_size;
i++;
}
/* If @sync, now synchronize the mft mirror. */
- if (sync && ni->mft_no < vol->mftmirr_size)
- ntfs_sync_mft_mirror(vol, ni->mft_no, fixup_m);
+ if (sync && ni->mft_no < vol->mftmirr_size) {
+ int sub_err = ntfs_sync_mft_mirror(vol, ni->mft_no, fixup_m);
+
+ if (unlikely(sub_err) && !err)
+ err = sub_err;
+ }
kunmap_local(kaddr);
if (unlikely(err)) {
/* I/O error during writing. This is really bad! */
@@ -617,10 +647,10 @@ put_bio_out:
bio_put(bio);
err_out:
/*
- * Current state: all buffers are clean, unlocked, and uptodate.
- * The caller should mark the base inode as bad so that no more i/o
- * happens. ->drop_inode() will still be invoked so all extent inodes
- * and other allocated memory will be freed.
+ * The caller should mark the base inode as bad so no more I/O
+ * happens. ->drop_inode() will still be invoked so all extent inodes
+ * and other allocated memory will be freed. ENOMEM is retried by
+ * redirtying the mft record below.
*/
if (err == -ENOMEM) {
ntfs_error(vol->sb,
@@ -833,7 +863,7 @@ static bool ntfs_may_write_mft_record(struct ntfs_volume *vol, const u64 mft_no,
vi = igrab(mft_vi);
WARN_ON(vi != mft_vi);
} else {
- vi = find_inode_nowait(sb, mft_no, ntfs_test_inode_wb, &na);
+ vi = find_inode_nowait(sb, na.mft_no, ntfs_test_inode_wb, &na);
if (na.state == NI_BeingDeleted || na.state == NI_BeingCreated)
return false;
}
@@ -1034,7 +1064,7 @@ static s64 ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(struct ntfs_volume *vo
b = ffz((unsigned long)*byte);
if (b < 8 && b >= (bit & 7)) {
ll = data_pos + (bit & ~7ull) + b;
- if (unlikely(ll > (1ll << 32))) {
+ if (unlikely(ll >= (1ll << 32))) {
folio_unlock(folio);
kunmap_local(buf);
folio_put(folio);
@@ -2721,8 +2751,11 @@ static int ntfs_write_mft_block(struct folio *folio, struct writeback_control *w
ntfs_debug("Entering for inode 0x%llx, attribute type 0x%x, folio index 0x%lx.",
ni->mft_no, ni->type, folio->index);
- if (!locked_nis || !ref_inos)
+ if (!locked_nis || !ref_inos) {
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
return -ENOMEM;
+ }
/* We have to zero every time due to mmap-at-end-of-file. */
if (folio->index >= (i_size >> folio_shift(folio)))
@@ -2840,9 +2873,13 @@ flush_bio:
}
prev_mft_ofs = mft_ofs;
- if (mft_no < vol->mftmirr_size)
- ntfs_sync_mft_mirror(vol, mft_no,
+ if (mft_no < vol->mftmirr_size) {
+ int sub_err = ntfs_sync_mft_mirror(vol, mft_no,
(struct mft_record *)(kaddr + mft_ofs));
+
+ if (unlikely(sub_err) && !err)
+ err = sub_err;
+ }
} else if (ref_inos[nr_ref_inos])
nr_ref_inos++;
}
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 10894de519c39..c4f82846c58c3 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -344,9 +344,9 @@ static int ntfs_sd_add_everyone(struct ntfs_inode *ni)
sd_len = sizeof(struct security_descriptor_relative) + 2 *
(sizeof(struct ntfs_sid) + 8) + sizeof(struct ntfs_acl) +
sizeof(struct ntfs_ace) + 4;
- sd = kmalloc(sd_len, GFP_NOFS);
+ sd = kzalloc(sd_len, GFP_NOFS);
if (!sd)
- return -1;
+ return -ENOMEM;
sd->revision = 1;
sd->control = SE_DACL_PRESENT | SE_SELF_RELATIVE;
@@ -945,7 +945,8 @@ search:
ni_mrec = actx->base_mrec ? actx->base_mrec : actx->mrec;
ni_mrec->link_count = cpu_to_le16(le16_to_cpu(ni_mrec->link_count) - 1);
- drop_nlink(VFS_I(ni));
+ if (!S_ISDIR(VFS_I(ni)->i_mode))
+ drop_nlink(VFS_I(ni));
mark_mft_record_dirty(ni);
if (looking_for_dos_name) {
@@ -956,6 +957,13 @@ search:
}
/*
+ * For directories, drop VFS nlink only when the mft record link count
+ * becomes zero, because VFS nlink is fixed at 1 for directories.
+ */
+ if (S_ISDIR(VFS_I(ni)->i_mode) && !le16_to_cpu(ni_mrec->link_count))
+ drop_nlink(VFS_I(ni));
+
+ /*
* If hard link count is not equal to zero then we are done. In other
* case there are no reference to this inode left, so we should free all
* non-resident attributes and mark all MFT record as not in use.
@@ -1221,7 +1229,8 @@ static int __ntfs_link(struct ntfs_inode *ni, struct ntfs_inode *dir_ni,
}
/* Increment hard links count. */
ni_mrec->link_count = cpu_to_le16(le16_to_cpu(ni_mrec->link_count) + 1);
- inc_nlink(VFS_I(ni));
+ if (!S_ISDIR(vi->i_mode))
+ inc_nlink(VFS_I(ni));
/* Done! */
mark_mft_record_dirty(ni);
diff --git a/fs/ntfs/reparse.c b/fs/ntfs/reparse.c
index 8f60ec6f66c19..74713716813f2 100644
--- a/fs/ntfs/reparse.c
+++ b/fs/ntfs/reparse.c
@@ -505,7 +505,6 @@ int ntfs_reparse_set_wsl_symlink(struct ntfs_inode *ni,
struct reparse_point *reparse;
struct wsl_link_reparse_data *data;
- utarget = (char *)NULL;
len = ntfs_ucstonls(ni->vol, target, target_len, &utarget, 0);
if (len <= 0)
return -EINVAL;
@@ -514,7 +513,7 @@ int ntfs_reparse_set_wsl_symlink(struct ntfs_inode *ni,
reparse = kvzalloc(reparse_len, GFP_NOFS);
if (!reparse) {
err = -ENOMEM;
- kvfree(utarget);
+ kfree(utarget);
} else {
data = (struct wsl_link_reparse_data *)reparse->reparse_data;
reparse->reparse_tag = IO_REPARSE_TAG_LX_SYMLINK;
@@ -528,6 +527,8 @@ int ntfs_reparse_set_wsl_symlink(struct ntfs_inode *ni,
kvfree(reparse);
if (!err)
ni->target = utarget;
+ else
+ kfree(utarget);
}
return err;
}
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index b213b4976d2b6..e7de3d01257e7 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -15,6 +15,8 @@
* Copyright (c) 2007-2022 Jean-Pierre Andre
*/
+#include <linux/overflow.h>
+
#include "ntfs.h"
#include "attrib.h"
@@ -739,6 +741,7 @@ struct runlist_element *ntfs_mapping_pairs_decompress(const struct ntfs_volume *
int rlsize; /* Size of runlist buffer. */
u16 rlpos; /* Current runlist position in units of struct runlist_elements. */
u8 b; /* Current byte offset in buf. */
+ u64 lowest_vcn; /* Raw on-disk lowest_vcn. */
#ifdef DEBUG
/* Make sure attr exists and is non-resident. */
@@ -747,8 +750,14 @@ struct runlist_element *ntfs_mapping_pairs_decompress(const struct ntfs_volume *
return ERR_PTR(-EINVAL);
}
#endif
+ lowest_vcn = le64_to_cpu(attr->data.non_resident.lowest_vcn);
+ /* Validate lowest_vcn from on-disk metadata to ensure it is sane. */
+ if (overflows_type(lowest_vcn, vcn)) {
+ ntfs_error(vol->sb, "Invalid lowest_vcn in mapping pairs.");
+ return ERR_PTR(-EIO);
+ }
/* Start at vcn = lowest_vcn and lcn 0. */
- vcn = le64_to_cpu(attr->data.non_resident.lowest_vcn);
+ vcn = lowest_vcn;
lcn = 0;
/* Get start of the mapping pairs array. */
buf = (u8 *)attr +
@@ -823,8 +832,17 @@ struct runlist_element *ntfs_mapping_pairs_decompress(const struct ntfs_volume *
* element.
*/
rl[rlpos].length = deltaxcn;
- /* Increment the current vcn by the current run length. */
- vcn += deltaxcn;
+ /*
+ * Increment the current vcn by the current run length.
+ * Guard against s64 overflow from a crafted mapping
+ * pairs array to preserve the monotonically-increasing
+ * vcn invariant.
+ */
+ if (unlikely(check_add_overflow(vcn, deltaxcn, &vcn))) {
+ ntfs_error(vol->sb, "VCN overflow in mapping pairs array.");
+ goto err_out;
+ }
+
/*
* There might be no lcn change at all, as is the case for
* sparse clusters on NTFS 3.0+, in which case we set the lcn
@@ -2038,10 +2056,11 @@ struct runlist_element *ntfs_rl_collapse_range(struct runlist_element *dst_rl, i
* consists of holes.
*/
merge_cnt = 0;
- i = new_1st_cnt == 0 ? 1 : new_1st_cnt;
- if (ntfs_rle_lcn_contiguous(&new_rl[i - 1], &new_rl[i])) {
- /* Merge right and left */
- s_rl = &new_rl[new_1st_cnt - 1];
+ if (new_1st_cnt > 0 &&
+ ntfs_rle_lcn_contiguous(&new_rl[new_1st_cnt - 1],
+ &new_rl[new_1st_cnt])) {
+ /* Merge right and left. */
+ s_rl = &new_rl[new_1st_cnt - 1];
s_rl->length += s_rl[1].length;
merge_cnt = 1;
}
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 22dc7865eca79..9e321cc2febe7 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -413,6 +413,7 @@ int ntfs_write_volume_label(struct ntfs_volume *vol, char *label)
{
struct ntfs_inode *vol_ni = NTFS_I(vol->vol_ino);
struct ntfs_attr_search_ctx *ctx;
+ char *new_label;
__le16 *uname;
int uname_len, ret;
@@ -425,7 +426,7 @@ int ntfs_write_volume_label(struct ntfs_volume *vol, char *label)
return uname_len;
}
- if (uname_len > NTFS_MAX_LABEL_LEN) {
+ if (uname_len > NTFS_MAX_LABEL_LEN) {
ntfs_error(vol->sb,
"Volume label is too long (max %d characters).",
NTFS_MAX_LABEL_LEN);
@@ -433,11 +434,22 @@ int ntfs_write_volume_label(struct ntfs_volume *vol, char *label)
return -EINVAL;
}
+ /*
+ * Allocate the in-memory label copy up front. If kstrdup() fails we
+ * bail out before touching on-disk metadata, so the in-memory label
+ * and the on-disk label stay in sync.
+ */
+ new_label = kstrdup(label, GFP_KERNEL);
+ if (!new_label) {
+ kvfree(uname);
+ return -ENOMEM;
+ }
+
mutex_lock(&vol_ni->mrec_lock);
ctx = ntfs_attr_get_search_ctx(vol_ni, NULL);
if (!ctx) {
ret = -ENOMEM;
- goto out;
+ goto out;
}
if (!ntfs_attr_lookup(AT_VOLUME_NAME, NULL, 0, 0, 0, NULL, 0,
@@ -450,12 +462,14 @@ int ntfs_write_volume_label(struct ntfs_volume *vol, char *label)
out:
mutex_unlock(&vol_ni->mrec_lock);
kvfree(uname);
- mark_inode_dirty_sync(vol->vol_ino);
if (ret >= 0) {
kfree(vol->volume_label);
- vol->volume_label = kstrdup(label, GFP_KERNEL);
+ vol->volume_label = new_label;
+ mark_inode_dirty_sync(vol->vol_ino);
ret = 0;
+ } else {
+ kfree(new_label);
}
return ret;
}
@@ -979,6 +993,13 @@ mft_unmap_out:
ntfs_is_baad_recordp((__le32 *)kmirr))
bytes = vol->mft_record_size;
}
+ /* Compare the two records. */
+ if (memcmp(kmft, kmirr, bytes)) {
+ ntfs_error(sb,
+ "$MFT and $MFTMirr record %i do not match. Run chkdsk.",
+ i);
+ goto mm_unmap_out;
+ }
kmft += vol->mft_record_size;
kmirr += vol->mft_record_size;
} while (++i < vol->mftmirr_size);
@@ -1671,7 +1692,7 @@ iput_attrdef_err_out:
iput_upcase_err_out:
vol->upcase_len = 0;
mutex_lock(&ntfs_lock);
- if (vol->upcase == default_upcase) {
+ if (vol->upcase && vol->upcase == default_upcase) {
ntfs_nr_upcase_users--;
vol->upcase = NULL;
}
@@ -1701,7 +1722,7 @@ static void ntfs_volume_free(struct ntfs_volume *vol)
* the number of upcase users if we are a user.
*/
mutex_lock(&ntfs_lock);
- if (vol->upcase == default_upcase) {
+ if (vol->upcase && vol->upcase == default_upcase) {
ntfs_nr_upcase_users--;
vol->upcase = NULL;
}
@@ -2494,7 +2515,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
}
vol->upcase_len = 0;
mutex_lock(&ntfs_lock);
- if (vol->upcase == default_upcase) {
+ if (vol->upcase && vol->upcase == default_upcase) {
ntfs_nr_upcase_users--;
vol->upcase = NULL;
}
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index bec5475de094d..75e65e72c2d64 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -362,7 +362,7 @@ static struct dentry *orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
__orangefs_setattr(dir, &iattr);
out:
op_release(new_op);
- return ERR_PTR(ret);
+ return ret ? ERR_PTR(ret) : NULL;
}
static int orangefs_rename(struct mnt_idmap *idmap,
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 7b86a6bac6449..b41f4788e4f06 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -1354,7 +1354,7 @@ int ovl_ensure_verity_loaded(const struct path *datapath)
struct inode *inode = d_inode(datapath->dentry);
struct file *filp;
- if (!fsverity_active(inode) && IS_VERITY(inode)) {
+ if (IS_VERITY(inode) && fsverity_get_info(inode) == NULL) {
/*
* If this inode was not yet opened, the verity info hasn't been
* loaded yet, so we need to do that here to force it into memory.
diff --git a/fs/select.c b/fs/select.c
index 75978b18f48f8..bf71c9838dfe1 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -708,6 +708,17 @@ static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
if (copy_from_user(&tv, tvp, sizeof(tv)))
return -EFAULT;
+ /*
+ * Reject negative components before normalisation. The seconds
+ * sum below is performed in signed long and a crafted negative
+ * timeval can wrap to a positive value that passes
+ * timespec64_valid() and turns into an effectively-infinite
+ * deadline via timespec64_add_safe()'s saturation, instead of
+ * the -EINVAL POSIX requires for negative timeouts.
+ */
+ if (tv.tv_sec < 0 || tv.tv_usec < 0)
+ return -EINVAL;
+
to = &end_time;
if (poll_select_set_timeout(to,
tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index 02791ec3c5a16..88d5e9a32f28b 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -286,6 +286,14 @@ replay_again:
&rqst[0], &oplock, &oparms, utf16_path);
if (rc)
goto oshr_free;
+
+ if (oplock != SMB2_OPLOCK_LEVEL_II) {
+ rc = -EINVAL;
+ cifs_dbg(FYI, "%s: Oplock level %d not suitable for cached directory\n",
+ __func__, oplock);
+ goto oshr_free;
+ }
+
smb2_set_next_command(tcon, &rqst[0]);
memset(&qi_iov, 0, sizeof(qi_iov));
diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c
index 3a41bbada04c7..44c4072756804 100644
--- a/fs/smb/client/cifs_spnego.c
+++ b/fs/smb/client/cifs_spnego.c
@@ -8,6 +8,7 @@
*/
#include <linux/list.h>
+#include <linux/cred.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <keys/user-type.h>
@@ -40,12 +41,27 @@ cifs_spnego_key_destroy(struct key *key)
kfree(key->payload.data[0]);
}
+static int
+cifs_spnego_key_vet_description(const char *description)
+{
+ /*
+ * cifs.spnego descriptions are authority-bearing inputs to cifs.upcall.
+ * They are only valid when produced by CIFS while using the private
+ * spnego_cred installed below. Do not let userspace create this type
+ * of key through request_key(2)/add_key(2), since the helper treats
+ * pid/uid/creduid/upcall_target as kernel-originating fields.
+ */
+ if (current_cred() != spnego_cred)
+ return -EPERM;
+ return 0;
+}
/*
* keytype for CIFS spnego keys
*/
struct key_type cifs_spnego_key_type = {
.name = "cifs.spnego",
+ .vet_description = cifs_spnego_key_vet_description,
.instantiate = cifs_spnego_key_instantiate,
.destroy = cifs_spnego_key_destroy,
.describe = user_describe,
diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c
index ec5d477793040..786dbbc43c5b9 100644
--- a/fs/smb/client/cifsacl.c
+++ b/fs/smb/client/cifsacl.c
@@ -1264,6 +1264,17 @@ static int parse_sid(struct smb_sid *psid, char *end_of_acl)
return 0;
}
+static bool dacl_offset_valid(unsigned int acl_len, __u32 dacloffset)
+{
+ if (acl_len < sizeof(struct smb_acl))
+ return false;
+
+ if (dacloffset < sizeof(struct smb_ntsd))
+ return false;
+
+ return dacloffset <= acl_len - sizeof(struct smb_acl);
+}
+
/* Convert CIFS ACL to POSIX form */
static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
@@ -1284,7 +1295,6 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
group_sid_ptr = (struct smb_sid *)((char *)pntsd +
le32_to_cpu(pntsd->gsidoffset));
dacloffset = le32_to_cpu(pntsd->dacloffset);
- dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset);
cifs_dbg(NOISY, "revision %d type 0x%x ooffset 0x%x goffset 0x%x sacloffset 0x%x dacloffset 0x%x\n",
pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
le32_to_cpu(pntsd->gsidoffset),
@@ -1315,11 +1325,18 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
return rc;
}
- if (dacloffset)
+ if (dacloffset) {
+ if (!dacl_offset_valid(acl_len, dacloffset)) {
+ cifs_dbg(VFS, "Server returned illegal DACL offset\n");
+ return -EINVAL;
+ }
+
+ dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset);
parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
group_sid_ptr, fattr, get_mode_from_special_sid);
- else
+ } else {
cifs_dbg(FYI, "no ACL\n"); /* BB grant all or default perms? */
+ }
return rc;
}
@@ -1342,6 +1359,11 @@ static int build_sec_desc(struct smb_ntsd *pntsd, struct smb_ntsd *pnntsd,
dacloffset = le32_to_cpu(pntsd->dacloffset);
if (dacloffset) {
+ if (!dacl_offset_valid(secdesclen, dacloffset)) {
+ cifs_dbg(VFS, "Server returned illegal DACL offset\n");
+ return -EINVAL;
+ }
+
dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset);
rc = validate_dacl(dacl_ptr, end_of_acl);
if (rc)
@@ -1710,6 +1732,12 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
nsecdesclen = sizeof(struct smb_ntsd) + (sizeof(struct smb_sid) * 2);
dacloffset = le32_to_cpu(pntsd->dacloffset);
if (dacloffset) {
+ if (!dacl_offset_valid(secdesclen, dacloffset)) {
+ cifs_dbg(VFS, "Server returned illegal DACL offset\n");
+ rc = -EINVAL;
+ goto id_mode_to_cifs_acl_exit;
+ }
+
dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset);
rc = validate_dacl(dacl_ptr, (char *)pntsd + secdesclen);
if (rc) {
@@ -1732,7 +1760,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
* descriptor parameters, and security descriptor itself
*/
nsecdesclen = max_t(u32, nsecdesclen, DEFAULT_SEC_DESC_LEN);
- pnntsd = kmalloc(nsecdesclen, GFP_KERNEL);
+ pnntsd = kzalloc(nsecdesclen, GFP_KERNEL);
if (!pnntsd) {
kfree(pntsd);
cifs_put_tlink(tlink);
@@ -1752,6 +1780,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
rc = ops->set_acl(pnntsd, nsecdesclen, inode, path, aclflag);
cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc);
}
+id_mode_to_cifs_acl_exit:
cifs_put_tlink(tlink);
kfree(pnntsd);
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 9f76b0347fa9d..ce23924f01b3a 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -306,6 +306,8 @@ static void cifs_kill_sb(struct super_block *sb)
/* Wait for all pending oplock breaks to complete */
flush_workqueue(cifsoplockd_wq);
+ /* Wait for all opened files to release */
+ flush_workqueue(deferredclose_wq);
/* finally release root dentry */
dput(cifs_sb->root);
@@ -434,7 +436,8 @@ cifs_alloc_inode(struct super_block *sb)
spin_lock_init(&cifs_inode->writers_lock);
cifs_inode->writers = 0;
cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
- cifs_inode->netfs.remote_i_size = 0;
+ cifs_inode->netfs._remote_i_size = 0;
+ cifs_inode->netfs._zero_point = 0;
cifs_inode->uniqueid = 0;
cifs_inode->createtime = 0;
cifs_inode->epoch = 0;
@@ -1303,7 +1306,8 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
struct cifsFileInfo *smb_file_src = src_file->private_data;
struct cifsFileInfo *smb_file_target = dst_file->private_data;
struct cifs_tcon *target_tcon, *src_tcon;
- unsigned long long destend, fstart, fend, old_size, new_size;
+ unsigned long long i_size, new_size;
+ unsigned long long destend, fstart, fend;
unsigned int xid;
int rc;
@@ -1347,7 +1351,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
* Advance the EOF marker after the flush above to the end of the range
* if it's short of that.
*/
- if (src_cifsi->netfs.remote_i_size < off + len) {
+ if (netfs_read_remote_i_size(src_inode) < off + len) {
rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len);
if (rc < 0)
goto unlock;
@@ -1368,22 +1372,24 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false);
if (rc)
goto unlock;
- if (fend > target_cifsi->netfs.zero_point)
- target_cifsi->netfs.zero_point = fend + 1;
- old_size = target_cifsi->netfs.remote_i_size;
+
+ spin_lock(&target_inode->i_lock);
+ if (fend > target_cifsi->netfs._zero_point)
+ netfs_write_zero_point(target_inode, fend + 1);
+ i_size = target_inode->i_size;
+ spin_unlock(&target_inode->i_lock);
/* Discard all the folios that overlap the destination region. */
cifs_dbg(FYI, "about to discard pages %llx-%llx\n", fstart, fend);
truncate_inode_pages_range(&target_inode->i_data, fstart, fend);
- fscache_invalidate(cifs_inode_cookie(target_inode), NULL,
- i_size_read(target_inode), 0);
+ fscache_invalidate(cifs_inode_cookie(target_inode), NULL, i_size, 0);
rc = -EOPNOTSUPP;
if (target_tcon->ses->server->ops->duplicate_extents) {
rc = target_tcon->ses->server->ops->duplicate_extents(xid,
smb_file_src, smb_file_target, off, len, destoff);
- if (rc == 0 && new_size > old_size) {
+ if (rc == 0 && new_size > i_size) {
truncate_setsize(target_inode, new_size);
fscache_resize_cookie(cifs_inode_cookie(target_inode),
new_size);
@@ -1402,8 +1408,12 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
rc = -EINVAL;
}
}
- if (rc == 0 && new_size > target_cifsi->netfs.zero_point)
- target_cifsi->netfs.zero_point = new_size;
+ if (rc == 0) {
+ spin_lock(&target_inode->i_lock);
+ if (new_size > target_cifsi->netfs._zero_point)
+ netfs_write_zero_point(target_inode, new_size);
+ spin_unlock(&target_inode->i_lock);
+ }
}
/* force revalidate of size and timestamps of target file now
@@ -1474,7 +1484,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
* Advance the EOF marker after the flush above to the end of the range
* if it's short of that.
*/
- if (src_cifsi->netfs.remote_i_size < off + len) {
+ if (netfs_read_remote_i_size(src_inode) < off + len) {
rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len);
if (rc < 0)
goto unlock;
@@ -1502,8 +1512,12 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
fscache_resize_cookie(cifs_inode_cookie(target_inode),
i_size_read(target_inode));
}
- if (rc > 0 && destoff + rc > target_cifsi->netfs.zero_point)
- target_cifsi->netfs.zero_point = destoff + rc;
+ if (rc > 0) {
+ spin_lock(&target_inode->i_lock);
+ if (destoff + rc > target_cifsi->netfs._zero_point)
+ netfs_write_zero_point(target_inode, destoff + rc);
+ spin_unlock(&target_inode->i_lock);
+ }
}
file_accessed(src_file);
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 4a25afda9448a..79d891f7df1a5 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -89,7 +89,6 @@ int cifs_handle_standard(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
char *smb3_fs_context_fullpath(const struct smb3_fs_context *ctx, char dirsep);
int smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx);
-int smb3_parse_opt(const char *options, const char *key, char **val);
int cifs_ipaddr_cmp(struct sockaddr *srcaddr, struct sockaddr *rhs);
bool cifs_match_ipaddr(struct sockaddr *srcaddr, struct sockaddr *rhs);
int cifs_discard_remaining_data(struct TCP_Server_Info *server);
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 3990a90122640..9e27bfa7376b1 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -1465,6 +1465,7 @@ cifs_readv_callback(struct TCP_Server_Info *server, struct mid_q_entry *mid)
struct cifs_io_subrequest *rdata = mid->callback_data;
struct netfs_inode *ictx = netfs_inode(rdata->rreq->inode);
struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink);
+ struct inode *inode = &ictx->inode;
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 1,
.rq_iter = rdata->subreq.io_iter };
@@ -1538,7 +1539,7 @@ do_retry:
} else {
size_t trans = rdata->subreq.transferred + rdata->got_bytes;
if (trans < rdata->subreq.len &&
- rdata->subreq.start + trans >= ictx->remote_i_size) {
+ rdata->subreq.start + trans >= netfs_read_remote_i_size(inode)) {
rdata->result = 0;
__set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
} else if (rdata->got_bytes > 0) {
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 664a2c2230890..b60344125f271 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -2517,18 +2517,23 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result)
{
struct netfs_io_request *wreq = wdata->rreq;
- struct netfs_inode *ictx = netfs_inode(wreq->inode);
+ struct inode *inode = wreq->inode;
+ struct netfs_inode *ictx = netfs_inode(inode);
loff_t wrend;
if (result > 0) {
+ spin_lock(&inode->i_lock);
+
wrend = wdata->subreq.start + wdata->subreq.transferred + result;
- if (wrend > ictx->zero_point &&
+ if (wrend > ictx->_zero_point &&
(wdata->rreq->origin == NETFS_UNBUFFERED_WRITE ||
wdata->rreq->origin == NETFS_DIO_WRITE))
- ictx->zero_point = wrend;
- if (wrend > ictx->remote_i_size)
+ netfs_write_zero_point(inode, wrend);
+ if (wrend > ictx->_remote_i_size)
netfs_resize_file(ictx, wrend, true);
+
+ spin_unlock(&inode->i_lock);
}
netfs_write_subrequest_terminated(&wdata->subreq, result);
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index b9544eb0381b7..2f86158f85d7b 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -420,7 +420,7 @@ static int parse_symlink_flavor(struct fs_context *fc, char *value,
#define DUP_CTX_STR(field) \
do { \
if (ctx->field) { \
- new_ctx->field = kstrdup(ctx->field, GFP_ATOMIC); \
+ new_ctx->field = kstrdup(ctx->field, GFP_KERNEL); \
if (new_ctx->field == NULL) { \
smb3_cleanup_fs_context_contents(new_ctx); \
return -ENOMEM; \
@@ -536,37 +536,6 @@ cifs_parse_smb_version(struct fs_context *fc, char *value, struct smb3_fs_contex
return 0;
}
-int smb3_parse_opt(const char *options, const char *key, char **val)
-{
- int rc = -ENOENT;
- char *opts, *orig, *p;
-
- orig = opts = kstrdup(options, GFP_KERNEL);
- if (!opts)
- return -ENOMEM;
-
- while ((p = strsep(&opts, ","))) {
- char *nval;
-
- if (!*p)
- continue;
- if (strncasecmp(p, key, strlen(key)))
- continue;
- nval = strchr(p, '=');
- if (nval) {
- if (nval == p)
- continue;
- *nval++ = 0;
- *val = kstrdup(nval, GFP_KERNEL);
- rc = !*val ? -ENOMEM : 0;
- goto out;
- }
- }
-out:
- kfree(orig);
- return rc;
-}
-
/*
* Remove duplicate path delimiters. Windows is supposed to do that
* but there are some bugs that prevent rename from working if there are
@@ -767,7 +736,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
static int smb3_fs_context_parse_monolithic(struct fs_context *fc,
void *data);
static int smb3_get_tree(struct fs_context *fc);
-static void smb3_sync_ses_chan_max(struct cifs_ses *ses, unsigned int max_channels);
+static void smb3_sync_ses_chan_max(struct cifs_ses *ses, size_t max_channels);
static int smb3_reconfigure(struct fs_context *fc);
static const struct fs_context_operations smb3_fs_context_ops = {
@@ -1041,25 +1010,34 @@ do { \
int smb3_sync_session_ctx_passwords(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
{
+ char *password = NULL, *password2 = NULL;
+
if (ses->password &&
cifs_sb->ctx->password &&
strcmp(ses->password, cifs_sb->ctx->password)) {
- kfree_sensitive(cifs_sb->ctx->password);
- cifs_sb->ctx->password = kstrdup(ses->password, GFP_KERNEL);
- if (!cifs_sb->ctx->password)
+ password = kstrdup(ses->password, GFP_KERNEL);
+ if (!password)
return -ENOMEM;
}
if (ses->password2 &&
cifs_sb->ctx->password2 &&
strcmp(ses->password2, cifs_sb->ctx->password2)) {
- kfree_sensitive(cifs_sb->ctx->password2);
- cifs_sb->ctx->password2 = kstrdup(ses->password2, GFP_KERNEL);
- if (!cifs_sb->ctx->password2) {
- kfree_sensitive(cifs_sb->ctx->password);
- cifs_sb->ctx->password = NULL;
+ password2 = kstrdup(ses->password2, GFP_KERNEL);
+ if (!password2) {
+ kfree_sensitive(password);
return -ENOMEM;
}
}
+
+ if (password) {
+ kfree_sensitive(cifs_sb->ctx->password);
+ cifs_sb->ctx->password = password;
+ }
+ if (password2) {
+ kfree_sensitive(cifs_sb->ctx->password2);
+ cifs_sb->ctx->password2 = password2;
+ }
+
return 0;
}
@@ -1072,7 +1050,7 @@ int smb3_sync_session_ctx_passwords(struct cifs_sb_info *cifs_sb, struct cifs_se
* with the session's channel lock. This should be called whenever the maximum
* allowed channels for a session changes (e.g., after a remount or reconfigure).
*/
-static void smb3_sync_ses_chan_max(struct cifs_ses *ses, unsigned int max_channels)
+static void smb3_sync_ses_chan_max(struct cifs_ses *ses, size_t max_channels)
{
spin_lock(&ses->chan_lock);
ses->chan_max = max_channels;
@@ -1082,12 +1060,15 @@ static void smb3_sync_ses_chan_max(struct cifs_ses *ses, unsigned int max_channe
static int smb3_reconfigure(struct fs_context *fc)
{
struct smb3_fs_context *ctx = smb3_fc2context(fc);
+ struct smb3_fs_context *new_ctx = NULL;
+ struct smb3_fs_context *old_ctx = NULL;
struct dentry *root = fc->root;
struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb);
struct cifs_ses *ses = cifs_sb_master_tcon(cifs_sb)->ses;
unsigned int rsize = ctx->rsize, wsize = ctx->wsize;
char *new_password = NULL, *new_password2 = NULL;
bool need_recon = false;
+ bool need_mchan_update;
int rc;
if (ses->expired_pwd)
@@ -1097,6 +1078,16 @@ static int smb3_reconfigure(struct fs_context *fc)
if (rc)
return rc;
+ old_ctx = kzalloc_obj(*old_ctx);
+ if (!old_ctx)
+ return -ENOMEM;
+
+ rc = smb3_fs_context_dup(old_ctx, cifs_sb->ctx);
+ if (rc) {
+ kfree(old_ctx);
+ return rc;
+ }
+
/*
* We can not change UNC/username/password/domainname/
* workstation_name/nodename/iocharset
@@ -1106,16 +1097,22 @@ static int smb3_reconfigure(struct fs_context *fc)
STEAL_STRING(cifs_sb, ctx, UNC);
STEAL_STRING(cifs_sb, ctx, source);
STEAL_STRING(cifs_sb, ctx, username);
+ STEAL_STRING(cifs_sb, ctx, domainname);
+ STEAL_STRING(cifs_sb, ctx, nodename);
+ STEAL_STRING(cifs_sb, ctx, iocharset);
- if (need_recon == false)
+ if (!need_recon) {
STEAL_STRING_SENSITIVE(cifs_sb, ctx, password);
- else {
+ } else {
if (ctx->password) {
new_password = kstrdup(ctx->password, GFP_KERNEL);
- if (!new_password)
- return -ENOMEM;
- } else
+ if (!new_password) {
+ rc = -ENOMEM;
+ goto restore_ctx;
+ }
+ } else {
STEAL_STRING_SENSITIVE(cifs_sb, ctx, password);
+ }
}
/*
@@ -1125,11 +1122,29 @@ static int smb3_reconfigure(struct fs_context *fc)
if (ctx->password2) {
new_password2 = kstrdup(ctx->password2, GFP_KERNEL);
if (!new_password2) {
- kfree_sensitive(new_password);
- return -ENOMEM;
+ rc = -ENOMEM;
+ goto restore_ctx;
}
- } else
+ } else {
STEAL_STRING_SENSITIVE(cifs_sb, ctx, password2);
+ }
+
+ /* if rsize or wsize not passed in on remount, use previous values */
+ ctx->rsize = rsize ? CIFS_ALIGN_RSIZE(fc, rsize) : cifs_sb->ctx->rsize;
+ ctx->wsize = wsize ? CIFS_ALIGN_WSIZE(fc, wsize) : cifs_sb->ctx->wsize;
+
+ new_ctx = kzalloc_obj(*new_ctx);
+ if (!new_ctx) {
+ rc = -ENOMEM;
+ goto restore_ctx;
+ }
+
+ rc = smb3_fs_context_dup(new_ctx, ctx);
+ if (rc)
+ goto restore_ctx;
+
+ need_mchan_update = ctx->multichannel != cifs_sb->ctx->multichannel ||
+ ctx->max_channels != cifs_sb->ctx->max_channels;
/*
* we may update the passwords in the ses struct below. Make sure we do
@@ -1140,54 +1155,55 @@ static int smb3_reconfigure(struct fs_context *fc)
/*
* smb2_reconnect may swap password and password2 in case session setup
* failed. First get ctx passwords in sync with ses passwords. It should
- * be okay to do this even if this function were to return an error at a
- * later stage
+ * be done before committing new passwords.
*/
rc = smb3_sync_session_ctx_passwords(cifs_sb, ses);
if (rc) {
mutex_unlock(&ses->session_mutex);
- kfree_sensitive(new_password);
- kfree_sensitive(new_password2);
- return rc;
+ goto cleanup_new_ctx;
+ }
+
+ /*
+ * If multichannel or max_channels has changed, update the session's channels accordingly.
+ * This may add or remove channels to match the new configuration.
+ */
+ if (need_mchan_update) {
+ /* Prevent concurrent scaling operations */
+ spin_lock(&ses->ses_lock);
+ if (ses->flags & CIFS_SES_FLAG_SCALE_CHANNELS) {
+ spin_unlock(&ses->ses_lock);
+ mutex_unlock(&ses->session_mutex);
+ rc = -EINVAL;
+ goto cleanup_new_ctx;
+ }
+ ses->flags |= CIFS_SES_FLAG_SCALE_CHANNELS;
+ spin_unlock(&ses->ses_lock);
}
/*
- * now that allocations for passwords are done, commit them
+ * Commit session passwords before any channel work so newly added
+ * channels authenticate with the new credentials.
*/
if (new_password) {
kfree_sensitive(ses->password);
ses->password = new_password;
+ new_password = NULL;
}
if (new_password2) {
kfree_sensitive(ses->password2);
ses->password2 = new_password2;
+ new_password2 = NULL;
}
- /*
- * If multichannel or max_channels has changed, update the session's channels accordingly.
- * This may add or remove channels to match the new configuration.
- */
- if ((ctx->multichannel != cifs_sb->ctx->multichannel) ||
- (ctx->max_channels != cifs_sb->ctx->max_channels)) {
-
+ if (need_mchan_update) {
/* Synchronize ses->chan_max with the new mount context */
smb3_sync_ses_chan_max(ses, ctx->max_channels);
- /* Now update the session's channels to match the new configuration */
- /* Prevent concurrent scaling operations */
- spin_lock(&ses->ses_lock);
- if (ses->flags & CIFS_SES_FLAG_SCALE_CHANNELS) {
- spin_unlock(&ses->ses_lock);
- mutex_unlock(&ses->session_mutex);
- return -EINVAL;
- }
- ses->flags |= CIFS_SES_FLAG_SCALE_CHANNELS;
- spin_unlock(&ses->ses_lock);
mutex_unlock(&ses->session_mutex);
- rc = smb3_update_ses_channels(ses, ses->server,
- false /* from_reconnect */,
- false /* disable_mchan */);
+ smb3_update_ses_channels(ses, ses->server,
+ false /* from_reconnect */,
+ false /* disable_mchan */);
/* Clear scaling flag after operation */
spin_lock(&ses->ses_lock);
@@ -1197,16 +1213,12 @@ static int smb3_reconfigure(struct fs_context *fc)
mutex_unlock(&ses->session_mutex);
}
- STEAL_STRING(cifs_sb, ctx, domainname);
- STEAL_STRING(cifs_sb, ctx, nodename);
- STEAL_STRING(cifs_sb, ctx, iocharset);
-
- /* if rsize or wsize not passed in on remount, use previous values */
- ctx->rsize = rsize ? CIFS_ALIGN_RSIZE(fc, rsize) : cifs_sb->ctx->rsize;
- ctx->wsize = wsize ? CIFS_ALIGN_WSIZE(fc, wsize) : cifs_sb->ctx->wsize;
-
smb3_cleanup_fs_context_contents(cifs_sb->ctx);
- rc = smb3_fs_context_dup(cifs_sb->ctx, ctx);
+ memcpy(cifs_sb->ctx, new_ctx, sizeof(*new_ctx));
+ kfree(new_ctx);
+ new_ctx = NULL;
+ smb3_cleanup_fs_context(old_ctx);
+ old_ctx = NULL;
smb3_update_mnt_flags(cifs_sb);
#ifdef CONFIG_CIFS_DFS_UPCALL
if (!rc)
@@ -1214,6 +1226,18 @@ static int smb3_reconfigure(struct fs_context *fc)
#endif
return rc;
+
+cleanup_new_ctx:
+ smb3_cleanup_fs_context_contents(new_ctx);
+restore_ctx:
+ kfree(new_ctx);
+ kfree_sensitive(new_password);
+ kfree_sensitive(new_password2);
+ smb3_cleanup_fs_context_contents(cifs_sb->ctx);
+ memcpy(cifs_sb->ctx, old_ctx, sizeof(*old_ctx));
+ kfree(old_ctx);
+
+ return rc;
}
static int smb3_fs_context_parse_param(struct fs_context *fc,
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 16a5310155d5a..9472c0a6c187c 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -119,7 +119,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode);
mtime = inode_get_mtime(inode);
if (timespec64_equal(&mtime, &fattr->cf_mtime) &&
- cifs_i->netfs.remote_i_size == fattr->cf_eof) {
+ netfs_read_remote_i_size(inode) == fattr->cf_eof) {
cifs_dbg(FYI, "%s: inode %llu is unchanged\n",
__func__, cifs_i->uniqueid);
return;
@@ -173,12 +173,12 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr,
CIFS_I(inode)->time = 0; /* force reval */
return -ESTALE;
}
- if (inode_state_read_once(inode) & I_NEW)
- CIFS_I(inode)->netfs.zero_point = fattr->cf_eof;
-
cifs_revalidate_cache(inode, fattr);
spin_lock(&inode->i_lock);
+ if (inode_state_read_once(inode) & I_NEW)
+ netfs_write_zero_point(inode, fattr->cf_eof);
+
fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode);
fattr->cf_atime = timestamp_truncate(fattr->cf_atime, inode);
fattr->cf_ctime = timestamp_truncate(fattr->cf_ctime, inode);
@@ -212,7 +212,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr,
else
clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags);
- cifs_i->netfs.remote_i_size = fattr->cf_eof;
+ netfs_write_remote_i_size(inode, fattr->cf_eof);
/*
* Can't safely change the file size here if the client is writing to
* it due to potential races.
@@ -2772,7 +2772,9 @@ cifs_revalidate_mapping(struct inode *inode)
if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_RW_CACHE)
goto skip_invalidate;
- cifs_inode->netfs.zero_point = cifs_inode->netfs.remote_i_size;
+ spin_lock(&inode->i_lock);
+ netfs_write_zero_point(inode, netfs_inode(inode)->_remote_i_size);
+ spin_unlock(&inode->i_lock);
rc = filemap_invalidate_inode(inode, true, 0, LLONG_MAX);
if (rc) {
cifs_dbg(VFS, "%s: invalidate inode %p failed with rc %d\n",
diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c
index 9afab3237e54c..17408bb8ab65b 100644
--- a/fs/smb/client/ioctl.c
+++ b/fs/smb/client/ioctl.c
@@ -296,7 +296,7 @@ search_end:
break;
case SMB2_ENCRYPTION_AES256_CCM:
case SMB2_ENCRYPTION_AES256_GCM:
- out.session_key_length = CIFS_SESS_KEY_SIZE;
+ out.session_key_length = ses->auth_key.len;
out.server_in_key_length = out.server_out_key_length = SMB3_GCM256_CRYPTKEY_SIZE;
break;
default:
diff --git a/fs/smb/client/netlink.c b/fs/smb/client/netlink.c
index 147d9409252cd..0dd10913c37a0 100644
--- a/fs/smb/client/netlink.c
+++ b/fs/smb/client/netlink.c
@@ -33,13 +33,17 @@ static const struct nla_policy cifs_genl_policy[CIFS_GENL_ATTR_MAX + 1] = {
static const struct genl_ops cifs_genl_ops[] = {
{
.cmd = CIFS_GENL_CMD_SWN_NOTIFY,
+ .flags = GENL_ADMIN_PERM,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = cifs_swn_notify,
},
};
static const struct genl_multicast_group cifs_genl_mcgrps[] = {
- [CIFS_GENL_MCGRP_SWN] = { .name = CIFS_GENL_MCGRP_SWN_NAME },
+ [CIFS_GENL_MCGRP_SWN] = {
+ .name = CIFS_GENL_MCGRP_SWN_NAME,
+ .flags = GENL_MCAST_CAP_NET_ADMIN,
+ },
};
struct genl_family cifs_genl_family = {
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index be22bbc4a65a0..e860fa08b5e30 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -143,7 +143,8 @@ retry:
fattr->cf_rdev = inode->i_rdev;
fattr->cf_uid = inode->i_uid;
fattr->cf_gid = inode->i_gid;
- fattr->cf_eof = CIFS_I(inode)->netfs.remote_i_size;
+ fattr->cf_eof =
+ netfs_read_remote_i_size(inode);
fattr->cf_symlink_target = NULL;
} else {
CIFS_I(inode)->time = 0;
diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c
index b292aa94a5932..6860eff316932 100644
--- a/fs/smb/client/smb2file.c
+++ b/fs/smb/client/smb2file.c
@@ -49,6 +49,9 @@ static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov)
__func__, le32_to_cpu(p->ErrorId));
len = ALIGN(le32_to_cpu(p->ErrorDataLength), 8);
+ if (len > end - ((u8 *)p + sizeof(*p)))
+ return ERR_PTR(-EINVAL);
+
p = (struct smb2_error_context_rsp *)(p->ErrorContextData + len);
}
} else if (le32_to_cpu(err->ByteCount) >= sizeof(*sym) &&
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index c6dd282fc3a90..6c9c229b91f65 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -111,7 +111,7 @@ static int check_wsl_eas(struct kvec *rsp_iov)
u32 outlen, next;
u16 vlen;
u8 nlen;
- u8 *end;
+ u8 *ea_end, *iov_end;
outlen = le32_to_cpu(rsp->OutputBufferLength);
if (outlen < SMB2_WSL_MIN_QUERY_EA_RESP_SIZE ||
@@ -120,15 +120,19 @@ static int check_wsl_eas(struct kvec *rsp_iov)
ea = (void *)((u8 *)rsp_iov->iov_base +
le16_to_cpu(rsp->OutputBufferOffset));
- end = (u8 *)rsp_iov->iov_base + rsp_iov->iov_len;
+ ea_end = (u8 *)ea + outlen;
+ iov_end = (u8 *)rsp_iov->iov_base + rsp_iov->iov_len;
+ if (ea_end > iov_end)
+ return -EINVAL;
+
for (;;) {
- if ((u8 *)ea > end - sizeof(*ea))
+ if ((u8 *)ea > ea_end - sizeof(*ea))
return -EINVAL;
nlen = ea->ea_name_length;
vlen = le16_to_cpu(ea->ea_value_length);
if (nlen != SMB2_WSL_XATTR_NAME_LEN ||
- (u8 *)ea->ea_data + nlen + 1 + vlen > end)
+ (u8 *)ea->ea_data + nlen + 1 + vlen > ea_end)
return -EINVAL;
switch (vlen) {
@@ -230,7 +234,7 @@ replay_again:
num_rqst = 0;
server = cifs_pick_channel(ses);
- vars = kzalloc_obj(*vars, GFP_ATOMIC);
+ vars = kzalloc_obj(*vars, GFP_KERNEL);
if (vars == NULL) {
rc = -ENOMEM;
goto out;
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index 973fce3c959c4..2a7355ce1a078 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -241,7 +241,8 @@ smb2_check_message(char *buf, unsigned int pdu_len, unsigned int len,
if (len != calc_len) {
/* create failed on symlink */
if (command == SMB2_CREATE_HE &&
- shdr->Status == STATUS_STOPPED_ON_SYMLINK)
+ shdr->Status == STATUS_STOPPED_ON_SYMLINK &&
+ len > calc_len)
return 0;
/* Windows 7 server returns 24 bytes more */
if (calc_len + 24 == len && command == SMB2_OPLOCK_BREAK_HE)
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 7f346ee502896..61b60114e4b85 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -111,10 +111,21 @@ smb2_add_credits(struct TCP_Server_Info *server,
cifs_trace_rw_credits_zero_in_flight);
}
server->in_flight--;
+
+ /*
+ * Rebalance credits when an op drains in_flight. For session setup,
+ * do this only when the total accumulated credits are high enough (>2)
+ * so that a newly established secondary channel can reserve credits for
+ * echoes and oplocks. We expect this to happen at the end of the final
+ * session setup response.
+ */
if (server->in_flight == 0 &&
((optype & CIFS_OP_MASK) != CIFS_NEG_OP) &&
((optype & CIFS_OP_MASK) != CIFS_SESS_OP))
rc = change_conf(server);
+ else if (server->in_flight == 0 &&
+ ((optype & CIFS_OP_MASK) == CIFS_SESS_OP) && *val > 2)
+ rc = change_conf(server);
/*
* Sometimes server returns 0 credits on oplock break ack - we need to
* rebalance credits in this case.
@@ -3391,8 +3402,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
struct inode *inode = file_inode(file);
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct cifsFileInfo *cfile = file->private_data;
- struct netfs_inode *ictx = netfs_inode(inode);
- unsigned long long i_size, new_size, remote_size;
+ unsigned long long i_size, new_size, remote_i_size, zero_point;
long rc;
unsigned int xid;
@@ -3403,9 +3413,8 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
filemap_invalidate_lock(inode->i_mapping);
- i_size = i_size_read(inode);
- remote_size = ictx->remote_i_size;
- if (offset + len >= remote_size && offset < i_size) {
+ netfs_read_sizes(inode, &i_size, &remote_i_size, &zero_point);
+ if (offset + len >= remote_i_size && offset < i_size) {
unsigned long long top = umin(offset + len, i_size);
rc = filemap_write_and_wait_range(inode->i_mapping, offset, top - 1);
@@ -3438,9 +3447,11 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
cfile->fid.volatile_fid, cfile->pid, new_size);
if (rc >= 0) {
truncate_setsize(inode, new_size);
+ spin_lock(&inode->i_lock);
netfs_resize_file(&cifsi->netfs, new_size, true);
- if (offset < cifsi->netfs.zero_point)
- cifsi->netfs.zero_point = offset;
+ if (offset < cifsi->netfs._zero_point)
+ netfs_write_zero_point(inode, offset);
+ spin_unlock(&inode->i_lock);
fscache_resize_cookie(cifs_inode_cookie(inode), new_size);
}
}
@@ -3463,7 +3474,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
struct inode *inode = file_inode(file);
struct cifsFileInfo *cfile = file->private_data;
struct file_zero_data_information fsctl_buf;
- unsigned long long end = offset + len, i_size, remote_i_size;
+ unsigned long long end = offset + len, i_size, remote_i_size, zero_point;
long rc;
unsigned int xid;
__u8 set_sparse = 1;
@@ -3505,14 +3516,17 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
* that we locally hole-punch the tail of the dirty data, the proposed
* EOF update will end up in the wrong place.
*/
- i_size = i_size_read(inode);
- remote_i_size = netfs_inode(inode)->remote_i_size;
+ netfs_read_sizes(inode, &i_size, &remote_i_size, &zero_point);
+
if (end > remote_i_size && i_size > remote_i_size) {
unsigned long long extend_to = umin(end, i_size);
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, cfile->pid, extend_to);
- if (rc >= 0)
- netfs_inode(inode)->remote_i_size = extend_to;
+ if (rc >= 0) {
+ spin_lock(&inode->i_lock);
+ netfs_write_remote_i_size(inode, extend_to);
+ spin_unlock(&inode->i_lock);
+ }
}
unlock:
@@ -3776,7 +3790,6 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
struct inode *inode = file_inode(file);
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct cifsFileInfo *cfile = file->private_data;
- struct netfs_inode *ictx = &cifsi->netfs;
loff_t old_eof, new_eof;
xid = get_xid();
@@ -3794,7 +3807,9 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
goto out_2;
truncate_pagecache_range(inode, off, old_eof);
- ictx->zero_point = old_eof;
+ spin_lock(&inode->i_lock);
+ netfs_write_zero_point(inode, old_eof);
+ spin_unlock(&inode->i_lock);
netfs_wait_for_outstanding_io(inode);
rc = smb2_copychunk_range(xid, cfile, cfile, off + len,
@@ -3811,8 +3826,10 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
rc = 0;
truncate_setsize(inode, new_eof);
+ spin_lock(&inode->i_lock);
netfs_resize_file(&cifsi->netfs, new_eof, true);
- ictx->zero_point = new_eof;
+ netfs_write_zero_point(inode, new_eof);
+ spin_unlock(&inode->i_lock);
fscache_resize_cookie(cifs_inode_cookie(inode), new_eof);
out_2:
filemap_invalidate_unlock(inode->i_mapping);
@@ -3855,13 +3872,17 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon,
goto out_2;
truncate_setsize(inode, new_eof);
+ spin_lock(&inode->i_lock);
netfs_resize_file(&cifsi->netfs, i_size_read(inode), true);
+ spin_unlock(&inode->i_lock);
fscache_resize_cookie(cifs_inode_cookie(inode), i_size_read(inode));
rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len);
if (rc < 0)
goto out_2;
- cifsi->netfs.zero_point = new_eof;
+ spin_lock(&inode->i_lock);
+ netfs_write_zero_point(inode, new_eof);
+ spin_unlock(&inode->i_lock);
rc = smb3_zero_data(file, tcon, off, len, xid);
if (rc < 0)
@@ -4710,6 +4731,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
{
unsigned int data_offset;
unsigned int data_len;
+ unsigned int end_off;
unsigned int cur_off;
unsigned int cur_page_idx;
unsigned int pad_len;
@@ -4814,7 +4836,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
}
/* Copy the data to the output I/O iterator. */
- rdata->result = cifs_copy_folioq_to_iter(buffer, buffer_len,
+ rdata->result = cifs_copy_folioq_to_iter(buffer, data_len,
cur_off, &rdata->subreq.io_iter);
if (rdata->result != 0) {
if (is_offloaded)
@@ -4823,9 +4845,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
dequeue_mid(server, mid, rdata->result);
return 0;
}
- rdata->got_bytes = buffer_len;
+ rdata->got_bytes = data_len;
- } else if (buf_len >= data_offset + data_len) {
+ } else if (!check_add_overflow(data_offset, data_len, &end_off) &&
+ buf_len >= end_off) {
/* read response payload is in buf */
WARN_ONCE(buffer, "read data can be either in buf or in buffer");
copied = copy_to_iter(buf + data_offset, data_len, &rdata->subreq.io_iter);
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index cb61051f9af3b..3bd300347f16e 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -1713,17 +1713,30 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
is_binding = (ses->ses_status == SES_GOOD);
spin_unlock(&ses->ses_lock);
+ /*
+ * Per MS-SMB2 3.2.5.3, Session.SessionKey is the first 16 bytes of the
+ * GSS cryptographic key, right-padded with zero bytes if shorter.
+ * Allocate at least SMB2_NTLMV2_SESSKEY_SIZE bytes (zeroed) so the KDF
+ * input buffer is always valid for HMAC-SHA256 even with deprecated
+ * Kerberos enctypes that return a short session key.
+ */
+ if (unlikely(msg->sesskey_len < SMB2_NTLMV2_SESSKEY_SIZE))
+ cifs_dbg(VFS,
+ "short GSS session key (%u bytes); zero-padding per MS-SMB2 3.2.5.3\n",
+ msg->sesskey_len);
+
kfree_sensitive(ses->auth_key.response);
- ses->auth_key.response = kmemdup(msg->data,
- msg->sesskey_len,
- GFP_KERNEL);
+ ses->auth_key.len = max_t(unsigned int, msg->sesskey_len,
+ SMB2_NTLMV2_SESSKEY_SIZE);
+ ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
if (!ses->auth_key.response) {
cifs_dbg(VFS, "%s: can't allocate (%u bytes) memory\n",
- __func__, msg->sesskey_len);
+ __func__, ses->auth_key.len);
+ ses->auth_key.len = 0;
rc = -ENOMEM;
goto out_put_spnego_key;
}
- ses->auth_key.len = msg->sesskey_len;
+ memcpy(ses->auth_key.response, msg->data, msg->sesskey_len);
sess_data->iov[1].iov_base = msg->data + msg->sesskey_len;
sess_data->iov[1].iov_len = msg->secblob_len;
@@ -4595,6 +4608,7 @@ smb2_readv_callback(struct TCP_Server_Info *server, struct mid_q_entry *mid)
struct netfs_inode *ictx = netfs_inode(rdata->rreq->inode);
struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink);
struct smb2_hdr *shdr = (struct smb2_hdr *)rdata->iov[0].iov_base;
+ struct inode *inode = &ictx->inode;
struct cifs_credits credits = {
.value = 0,
.instance = 0,
@@ -4708,7 +4722,7 @@ do_retry:
} else {
size_t trans = rdata->subreq.transferred + rdata->got_bytes;
if (trans < rdata->subreq.len &&
- rdata->subreq.start + trans >= ictx->remote_i_size) {
+ rdata->subreq.start + trans >= netfs_read_remote_i_size(inode)) {
__set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
rdata->result = 0;
}
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index 41009039b4cbe..1143ee52470a7 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -169,7 +169,9 @@ smb2_find_smb_sess_tcon_unlocked(struct cifs_ses *ses, __u32 tid)
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
if (tcon->tid != tid)
continue;
+ spin_lock(&tcon->tc_lock);
++tcon->tc_count;
+ spin_unlock(&tcon->tc_lock);
trace_smb3_tcon_ref(tcon->debug_id, tcon->tc_count,
netfs_trace_tcon_ref_get_find_sess_tcon);
return tcon;
@@ -251,7 +253,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
}
static void generate_key(struct cifs_ses *ses, struct kvec label,
- struct kvec context, __u8 *key, unsigned int key_size)
+ struct kvec context, __u8 *key, unsigned int key_size,
+ unsigned int full_key_size)
{
unsigned char zero = 0x0;
__u8 i[4] = {0, 0, 0, 1};
@@ -265,7 +268,7 @@ static void generate_key(struct cifs_ses *ses, struct kvec label,
memset(key, 0x0, key_size);
hmac_sha256_init_usingrawkey(&hmac_ctx, ses->auth_key.response,
- SMB2_NTLMV2_SESSKEY_SIZE);
+ full_key_size);
hmac_sha256_update(&hmac_ctx, i, 4);
hmac_sha256_update(&hmac_ctx, label.iov_base, label.iov_len);
hmac_sha256_update(&hmac_ctx, &zero, 1);
@@ -298,6 +301,7 @@ generate_smb3signingkey(struct cifs_ses *ses,
struct TCP_Server_Info *server,
const struct derivation_triplet *ptriplet)
{
+ unsigned int full_key_size = SMB2_NTLMV2_SESSKEY_SIZE;
bool is_binding = false;
int chan_index = 0;
@@ -330,12 +334,24 @@ generate_smb3signingkey(struct cifs_ses *ses,
if (is_binding) {
generate_key(ses, ptriplet->signing.label,
ptriplet->signing.context,
- ses->chans[chan_index].signkey,
- SMB3_SIGN_KEY_SIZE);
+ ses->chans[chan_index].signkey, SMB3_SIGN_KEY_SIZE,
+ SMB2_NTLMV2_SESSKEY_SIZE);
} else {
generate_key(ses, ptriplet->signing.label,
- ptriplet->signing.context,
- ses->smb3signingkey, SMB3_SIGN_KEY_SIZE);
+ ptriplet->signing.context, ses->smb3signingkey,
+ SMB3_SIGN_KEY_SIZE, SMB2_NTLMV2_SESSKEY_SIZE);
+
+ /*
+ * Per MS-SMB2 3.2.5.3.1, signing key always uses Session.SessionKey
+ * (first 16 bytes). Encryption/decryption keys use
+ * Session.FullSessionKey when dialect is 3.1.1 and cipher is
+ * AES-256-CCM or AES-256-GCM, otherwise Session.SessionKey.
+ */
+
+ if (server->dialect == SMB311_PROT_ID &&
+ (server->cipher_type == SMB2_ENCRYPTION_AES256_CCM ||
+ server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
+ full_key_size = ses->auth_key.len;
/* safe to access primary channel, since it will never go away */
spin_lock(&ses->chan_lock);
@@ -345,10 +361,13 @@ generate_smb3signingkey(struct cifs_ses *ses,
generate_key(ses, ptriplet->encryption.label,
ptriplet->encryption.context,
- ses->smb3encryptionkey, SMB3_ENC_DEC_KEY_SIZE);
+ ses->smb3encryptionkey, SMB3_ENC_DEC_KEY_SIZE,
+ full_key_size);
+
generate_key(ses, ptriplet->decryption.label,
ptriplet->decryption.context,
- ses->smb3decryptionkey, SMB3_ENC_DEC_KEY_SIZE);
+ ses->smb3decryptionkey, SMB3_ENC_DEC_KEY_SIZE,
+ full_key_size);
}
#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS
@@ -361,7 +380,7 @@ generate_smb3signingkey(struct cifs_ses *ses,
&ses->Suid);
cifs_dbg(VFS, "Cipher type %d\n", server->cipher_type);
cifs_dbg(VFS, "Session Key %*ph\n",
- SMB2_NTLMV2_SESSKEY_SIZE, ses->auth_key.response);
+ (int)ses->auth_key.len, ses->auth_key.response);
cifs_dbg(VFS, "Signing Key %*ph\n",
SMB3_SIGN_KEY_SIZE, ses->smb3signingkey);
if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index 75f9f91a7ec96..563ef488a2258 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -9,7 +9,6 @@
#include "cifs_debug.h"
#include "cifsproto.h"
#include "smb2proto.h"
-#include "../smbdirect/public.h"
/* Port numbers for SMBD transport */
#define SMB_PORT 445
@@ -558,3 +557,5 @@ void smbd_debug_proc_show(struct TCP_Server_Info *server, struct seq_file *m)
server->rdma_readwrite_threshold,
m);
}
+
+MODULE_IMPORT_NS("SMBDIRECT");
diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h
index 287ac849213d4..be205ec02077e 100644
--- a/fs/smb/client/smbdirect.h
+++ b/fs/smb/client/smbdirect.h
@@ -12,7 +12,7 @@
#include "cifsglob.h"
-#include "../smbdirect/smbdirect.h"
+#include <linux/smbdirect.h>
extern int rdma_readwrite_threshold;
extern int smbd_max_frmr_depth;
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 05f8099047e1a..fdf4e50c27ceb 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -1158,7 +1158,7 @@ int
cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
int length, len;
- unsigned int data_offset, data_len;
+ unsigned int data_offset, data_len, end_off;
struct cifs_io_subrequest *rdata = mid->callback_data;
char *buf = server->smallbuf;
unsigned int buflen = server->pdu_size;
@@ -1256,11 +1256,14 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
use_rdma_mr = rdata->mr;
#endif
data_len = server->ops->read_data_length(buf, use_rdma_mr);
- if (!use_rdma_mr && (data_offset + data_len > buflen)) {
- /* data_len is corrupt -- discard frame */
- rdata->result = smb_EIO2(smb_eio_trace_read_rsp_malformed,
- data_offset + data_len, buflen);
- return cifs_readv_discard(server, mid);
+ if (!use_rdma_mr) {
+ if (check_add_overflow(data_offset, data_len, &end_off) ||
+ end_off > buflen) {
+ /* data_len is corrupt -- discard frame */
+ rdata->result = smb_EIO2(smb_eio_trace_read_rsp_malformed,
+ end_off, buflen);
+ return cifs_readv_discard(server, mid);
+ }
}
#ifdef CONFIG_CIFS_SMB_DIRECT
diff --git a/fs/smb/common/fscc.h b/fs/smb/common/fscc.h
index b4ccddca92565..bc3012cc295da 100644
--- a/fs/smb/common/fscc.h
+++ b/fs/smb/common/fscc.h
@@ -260,12 +260,12 @@ typedef struct {
char FileName[];
} __packed FILE_DIRECTORY_INFO; /* level 0x101 FF resp data */
-/* See MS-FSCC 2.4.13 */
+/* See MS-FSCC 2.4.14 */
struct smb2_file_eof_info { /* encoding of request for level 10 */
__le64 EndOfFile; /* new end of file value */
} __packed; /* level 20 Set */
-/* See MS-FSCC 2.4.14 */
+/* See MS-FSCC 2.4.15 */
typedef struct {
__le32 NextEntryOffset;
__u32 FileIndex;
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index a4b12eb8df81e..aeb0a245c5324 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -1566,6 +1566,10 @@ struct validate_negotiate_info_rsp {
#define FILE_STANDARD_LINK_INFORMATION 54
#define FILE_ID_INFORMATION 59
#define FILE_ID_EXTD_DIRECTORY_INFORMATION 60 /* also for QUERY_DIR */
+#define FileId64ExtdDirectoryInformation 78 /* also for QUERY_DIR */
+#define FileId64ExtdBothDirectoryInformation 79 /* also for QUERY_DIR */
+#define FileIdAllExtdDirectoryInformation 80 /* also for QUERY_DIR */
+#define FileIdAllExtdBothDirectoryInformation 81 /* also for QUERY_DIR */
/* Used for Query Info and Find File POSIX Info for SMB3.1.1 and SMB1 */
#define SMB_FIND_FILE_POSIX_INFO 0x064
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index fbbc0529743f8..8347495dbc628 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -79,6 +79,85 @@ static int create_proc_clients(void) { return 0; }
static void delete_proc_clients(void) {}
#endif
+static struct workqueue_struct *ksmbd_conn_wq;
+
+int ksmbd_conn_wq_init(void)
+{
+ ksmbd_conn_wq = alloc_workqueue("ksmbd-conn-release",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
+ if (!ksmbd_conn_wq)
+ return -ENOMEM;
+ return 0;
+}
+
+void ksmbd_conn_wq_destroy(void)
+{
+ if (ksmbd_conn_wq) {
+ destroy_workqueue(ksmbd_conn_wq);
+ ksmbd_conn_wq = NULL;
+ }
+}
+
+/*
+ * __ksmbd_conn_release_work() - perform the final, once-per-struct cleanup
+ * of a ksmbd_conn whose refcount has just dropped to zero.
+ *
+ * This is the common release path used by ksmbd_conn_put() for the embedded
+ * state that outlives the connection thread: async_ida and the attached
+ * transport (which owns the socket and iov for TCP). Called from a workqueue
+ * so that sleep-allowed teardown (sock_release -> tcp_close ->
+ * lock_sock_nested) never runs from an RCU softirq callback (free_opinfo_rcu)
+ * or any other non-sleeping putter context.
+ */
+static void __ksmbd_conn_release_work(struct work_struct *work)
+{
+ struct ksmbd_conn *conn =
+ container_of(work, struct ksmbd_conn, release_work);
+
+ ida_destroy(&conn->async_ida);
+ conn->transport->ops->free_transport(conn->transport);
+ kfree(conn);
+}
+
+/**
+ * ksmbd_conn_get() - take a reference on @conn and return it.
+ *
+ * @conn: connection instance to get a reference to
+ *
+ * Returns @conn unchanged so callers can write
+ * "fp->conn = ksmbd_conn_get(work->conn);" in one expression. Returns NULL
+ * if @conn is NULL.
+ */
+struct ksmbd_conn *ksmbd_conn_get(struct ksmbd_conn *conn)
+{
+ if (!conn)
+ return NULL;
+
+ atomic_inc(&conn->refcnt);
+ return conn;
+}
+
+/**
+ * ksmbd_conn_put() - drop a reference and, if it was the last, queue the
+ * release onto ksmbd_conn_wq so it runs from process context.
+ *
+ * @conn: connection instance to put a reference to
+ *
+ * Callable from any context including RCU softirq callbacks and non-sleeping
+ * locks; the actual release is deferred to the workqueue. ksmbd_conn_wq is
+ * created in ksmbd_server_init() before any conn can be allocated and is
+ * destroyed in ksmbd_server_exit() after rcu_barrier(), so it is always
+ * non-NULL while a conn reference is held.
+ */
+void ksmbd_conn_put(struct ksmbd_conn *conn)
+{
+ if (!conn)
+ return;
+
+ if (atomic_dec_and_test(&conn->refcnt))
+ queue_work(ksmbd_conn_wq, &conn->release_work);
+}
+
/**
* ksmbd_conn_free() - free resources of the connection instance
*
@@ -93,23 +172,19 @@ void ksmbd_conn_free(struct ksmbd_conn *conn)
hash_del(&conn->hlist);
up_write(&conn_list_lock);
+ /*
+ * request_buf / preauth_info / mechToken are only ever accessed by the
+ * connection handler thread that owns @conn. ksmbd_conn_free() is
+ * called from the transport free_transport() path when that thread is
+ * exiting, so it is safe to release them unconditionally even when
+ * ksmbd_conn_put() below is not the final putter (oplock / ksmbd_file
+ * holders only retain the conn pointer, not these per-thread buffers).
+ */
xa_destroy(&conn->sessions);
kvfree(conn->request_buf);
kfree(conn->preauth_info);
kfree(conn->mechToken);
- if (atomic_dec_and_test(&conn->refcnt)) {
- /*
- * async_ida is embedded in struct ksmbd_conn, so pair
- * ida_destroy() with the final kfree() rather than with
- * the unconditional field teardown above. This keeps
- * the IDA valid for the entire lifetime of the struct,
- * even while other refcount holders (oplock / vfs
- * durable handles) still reference the connection.
- */
- ida_destroy(&conn->async_ida);
- conn->transport->ops->free_transport(conn->transport);
- kfree(conn);
- }
+ ksmbd_conn_put(conn);
}
/**
@@ -136,6 +211,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
conn->um = ERR_PTR(-EOPNOTSUPP);
if (IS_ERR(conn->um))
conn->um = NULL;
+ INIT_WORK(&conn->release_work, __ksmbd_conn_release_work);
atomic_set(&conn->req_running, 0);
atomic_set(&conn->r_count, 0);
atomic_set(&conn->refcnt, 1);
@@ -512,8 +588,7 @@ void ksmbd_conn_r_count_dec(struct ksmbd_conn *conn)
if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q))
wake_up(&conn->r_count_q);
- if (atomic_dec_and_test(&conn->refcnt))
- kfree(conn);
+ ksmbd_conn_put(conn);
}
int ksmbd_conn_transport_init(void)
@@ -540,24 +615,54 @@ out:
static void stop_sessions(void)
{
- struct ksmbd_conn *conn;
+ struct ksmbd_conn *conn, *target;
struct ksmbd_transport *t;
+ bool any;
int bkt;
+ /*
+ * Serialised via init_lock; no concurrent stop_sessions() can
+ * touch conn->stop_called, so writing it under the read lock is
+ * safe.
+ */
again:
+ target = NULL;
+ any = false;
down_read(&conn_list_lock);
hash_for_each(conn_list, bkt, conn, hlist) {
- t = conn->transport;
- ksmbd_conn_set_exiting(conn);
- if (t->ops->shutdown) {
- up_read(&conn_list_lock);
+ any = true;
+ if (conn->stop_called)
+ continue;
+ atomic_inc(&conn->refcnt);
+ conn->stop_called = true;
+ /*
+ * Mark the connection EXITING while still holding the
+ * read lock so the selection and the status transition
+ * happen together. Do not regress a connection that has
+ * already advanced to RELEASING on its own (e.g. the
+ * handler exited its receive loop for an unrelated
+ * reason).
+ */
+ if (READ_ONCE(conn->status) != KSMBD_SESS_RELEASING)
+ ksmbd_conn_set_exiting(conn);
+ target = conn;
+ break;
+ }
+ up_read(&conn_list_lock);
+
+ if (target) {
+ t = target->transport;
+ if (t->ops->shutdown)
t->ops->shutdown(t);
- down_read(&conn_list_lock);
+ if (atomic_dec_and_test(&target->refcnt)) {
+ ida_destroy(&target->async_ida);
+ t->ops->free_transport(t);
+ kfree(target);
}
+ goto again;
}
- up_read(&conn_list_lock);
- if (!hash_empty(conn_list)) {
+ if (any) {
msleep(100);
goto again;
}
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index ae21a1bd4c70e..e074be9425823 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -16,6 +16,7 @@
#include <linux/kthread.h>
#include <linux/nls.h>
#include <linux/unicode.h>
+#include <linux/workqueue.h>
#include "smb_common.h"
#include "ksmbd_work.h"
@@ -49,6 +50,7 @@ struct ksmbd_conn {
struct mutex srv_mutex;
int status;
unsigned int cli_cap;
+ bool stop_called;
union {
__be32 inet_addr;
#if IS_ENABLED(CONFIG_IPV6)
@@ -119,6 +121,7 @@ struct ksmbd_conn {
bool binding;
atomic_t refcnt;
bool is_aapl;
+ struct work_struct release_work;
};
struct ksmbd_conn_ops {
@@ -163,6 +166,10 @@ void ksmbd_conn_wait_idle(struct ksmbd_conn *conn);
int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id);
struct ksmbd_conn *ksmbd_conn_alloc(void);
void ksmbd_conn_free(struct ksmbd_conn *conn);
+struct ksmbd_conn *ksmbd_conn_get(struct ksmbd_conn *conn);
+void ksmbd_conn_put(struct ksmbd_conn *conn);
+int ksmbd_conn_wq_init(void);
+void ksmbd_conn_wq_destroy(void);
bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c);
int ksmbd_conn_write(struct ksmbd_work *work);
int ksmbd_conn_rdma_read(struct ksmbd_conn *conn,
diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c
index 53f44ff4d376f..6f97f8d39657c 100644
--- a/fs/smb/server/mgmt/share_config.c
+++ b/fs/smb/server/mgmt/share_config.c
@@ -167,7 +167,10 @@ static struct ksmbd_share_config *share_config_request(struct ksmbd_work *work,
share->path = kstrndup(ksmbd_share_config_path(resp), path_len,
KSMBD_DEFAULT_GFP);
- if (share->path) {
+ if (!share->path) {
+ ret = -ENOMEM;
+ } else {
+ ret = 0;
share->path_sz = strlen(share->path);
while (share->path_sz > 1 &&
share->path[share->path_sz - 1] == '/')
@@ -179,9 +182,10 @@ static struct ksmbd_share_config *share_config_request(struct ksmbd_work *work,
share->force_directory_mode = resp->force_directory_mode;
share->force_uid = resp->force_uid;
share->force_gid = resp->force_gid;
- ret = parse_veto_list(share,
- KSMBD_SHARE_CONFIG_VETO_LIST(resp),
- resp->veto_list_sz);
+ if (!ret)
+ ret = parse_veto_list(share,
+ KSMBD_SHARE_CONFIG_VETO_LIST(resp),
+ resp->veto_list_sz);
if (!ret && share->path) {
if (__ksmbd_override_fsids(work, share)) {
kill_share(share);
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index cd3f28b0e7cb2..0f5c18520eff0 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -30,7 +30,6 @@ static DEFINE_RWLOCK(lease_list_lock);
static struct oplock_info *alloc_opinfo(struct ksmbd_work *work,
u64 id, __u16 Tid)
{
- struct ksmbd_conn *conn = work->conn;
struct ksmbd_session *sess = work->sess;
struct oplock_info *opinfo;
@@ -39,7 +38,7 @@ static struct oplock_info *alloc_opinfo(struct ksmbd_work *work,
return NULL;
opinfo->sess = sess;
- opinfo->conn = conn;
+ opinfo->conn = ksmbd_conn_get(work->conn);
opinfo->level = SMB2_OPLOCK_LEVEL_NONE;
opinfo->op_state = OPLOCK_STATE_NONE;
opinfo->pending_break = 0;
@@ -50,7 +49,6 @@ static struct oplock_info *alloc_opinfo(struct ksmbd_work *work,
init_waitqueue_head(&opinfo->oplock_brk);
atomic_set(&opinfo->refcount, 1);
atomic_set(&opinfo->breaking_cnt, 0);
- atomic_inc(&opinfo->conn->refcnt);
return opinfo;
}
@@ -132,8 +130,7 @@ static void __free_opinfo(struct oplock_info *opinfo)
{
if (opinfo->is_lease)
free_lease(opinfo);
- if (opinfo->conn && atomic_dec_and_test(&opinfo->conn->refcnt))
- kfree(opinfo->conn);
+ ksmbd_conn_put(opinfo->conn);
kfree(opinfo);
}
@@ -484,8 +481,12 @@ static inline int compare_guid_key(struct oplock_info *opinfo,
const char *guid1, const char *key1)
{
const char *guid2, *key2;
+ struct ksmbd_conn *conn;
- guid2 = opinfo->conn->ClientGUID;
+ conn = READ_ONCE(opinfo->conn);
+ if (!conn)
+ return 0;
+ guid2 = conn->ClientGUID;
key2 = opinfo->o_lease->lease_key;
if (!memcmp(guid1, guid2, SMB2_CLIENT_GUID_SIZE) &&
!memcmp(key1, key2, SMB2_LEASE_KEY_SIZE))
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index 58ef02c423fce..5d799b2d4c62f 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -596,8 +596,14 @@ static int __init ksmbd_server_init(void)
if (ret)
goto err_crypto_destroy;
+ ret = ksmbd_conn_wq_init();
+ if (ret)
+ goto err_workqueue_destroy;
+
return 0;
+err_workqueue_destroy:
+ ksmbd_workqueue_destroy();
err_crypto_destroy:
ksmbd_crypto_destroy();
err_release_inode_hash:
@@ -623,6 +629,12 @@ static void __exit ksmbd_server_exit(void)
{
ksmbd_server_shutdown();
rcu_barrier();
+ /*
+ * ksmbd_conn_put() defers the final release onto ksmbd_conn_wq,
+ * so drain it after rcu_barrier() has fired any pending RCU
+ * callbacks that may have queued a release.
+ */
+ ksmbd_conn_wq_destroy();
ksmbd_release_inode_hash();
}
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 21825a69c29a7..5128a693aca6c 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -3767,8 +3767,10 @@ err_out1:
err_out2:
if (!rc) {
- ksmbd_update_fstate(&work->sess->file_table, fp, FP_INITED);
- rc = ksmbd_iov_pin_rsp(work, (void *)rsp, iov_len);
+ rc = ksmbd_update_fstate(&work->sess->file_table, fp,
+ FP_INITED);
+ if (!rc)
+ rc = ksmbd_iov_pin_rsp(work, (void *)rsp, iov_len);
}
if (rc) {
if (rc == -EINVAL)
@@ -3802,8 +3804,19 @@ err_out2:
ksmbd_debug(SMB, "Error response: %x\n", rsp->hdr.Status);
}
- if (dh_info.reconnected)
- ksmbd_put_durable_fd(dh_info.fp);
+ if (dh_info.reconnected) {
+ /*
+ * If reconnect succeeded, fp was republished in the
+ * session file table. On a later error, ksmbd_fd_put()
+ * above drops the session reference; drop the durable
+ * lookup reference through the same session-aware path so
+ * final close removes the volatile id before freeing fp.
+ */
+ if (rc && fp == dh_info.fp)
+ ksmbd_fd_put(work, dh_info.fp);
+ else
+ ksmbd_put_durable_fd(dh_info.fp);
+ }
kfree(name);
kfree(lc);
@@ -3946,7 +3959,13 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
goto free_conv_name;
}
- struct_sz = readdir_info_level_struct_sz(info_level) + conv_len;
+ struct_sz = readdir_info_level_struct_sz(info_level);
+ if (struct_sz == -EOPNOTSUPP) {
+ rc = -EINVAL;
+ goto free_conv_name;
+ }
+
+ struct_sz += conv_len;
next_entry_offset = ALIGN(struct_sz, KSMBD_DIR_INFO_ALIGNMENT);
d_info->last_entry_off_align = next_entry_offset - struct_sz;
diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c
index 4bbc2c27e6805..c2d9be52a311f 100644
--- a/fs/smb/server/smbacl.c
+++ b/fs/smb/server/smbacl.c
@@ -643,8 +643,10 @@ static void set_posix_acl_entries_dacl(struct mnt_idmap *idmap,
ntace = (struct smb_ace *)((char *)pndace + *size);
ace_sz = fill_ace_for_sid(ntace, sid, ACCESS_ALLOWED, flags,
pace->e_perm, 0777);
- if (check_add_overflow(*size, ace_sz, size))
+ if (check_add_overflow(*size, ace_sz, size)) {
+ kfree(sid);
break;
+ }
(*num_aces)++;
if (pace->e_tag == ACL_USER)
ntace->access_req |=
@@ -655,8 +657,10 @@ static void set_posix_acl_entries_dacl(struct mnt_idmap *idmap,
ntace = (struct smb_ace *)((char *)pndace + *size);
ace_sz = fill_ace_for_sid(ntace, sid, ACCESS_ALLOWED,
0x03, pace->e_perm, 0777);
- if (check_add_overflow(*size, ace_sz, size))
+ if (check_add_overflow(*size, ace_sz, size)) {
+ kfree(sid);
break;
+ }
(*num_aces)++;
if (pace->e_tag == ACL_USER)
ntace->access_req |=
@@ -698,8 +702,10 @@ posix_default_acl:
ntace = (struct smb_ace *)((char *)pndace + *size);
ace_sz = fill_ace_for_sid(ntace, sid, ACCESS_ALLOWED, 0x0b,
pace->e_perm, 0777);
- if (check_add_overflow(*size, ace_sz, size))
+ if (check_add_overflow(*size, ace_sz, size)) {
+ kfree(sid);
break;
+ }
(*num_aces)++;
if (pace->e_tag == ACL_USER)
ntace->access_req |=
@@ -1068,7 +1074,60 @@ static void smb_set_ace(struct smb_ace *ace, const struct smb_sid *sid, u8 type,
ace->flags = flags;
ace->access_req = access_req;
smb_copy_sid(&ace->sid, sid);
- ace->size = cpu_to_le16(1 + 1 + 2 + 4 + 1 + 1 + 6 + (sid->num_subauth * 4));
+ ace->size = cpu_to_le16(1 + 1 + 2 + 4 + 1 + 1 + 6 +
+ (ace->sid.num_subauth * 4));
+}
+
+static int smb_append_inherited_ace(struct smb_ace **ace, int *nt_size,
+ u16 *ace_cnt, const struct smb_sid *sid,
+ u8 type, u8 flags, __le32 access_req)
+{
+ int ace_size;
+
+ smb_set_ace(*ace, sid, type, flags, access_req);
+ ace_size = le16_to_cpu((*ace)->size);
+ /* pdacl->size is __le16 and includes struct smb_acl. */
+ if (check_add_overflow(*nt_size, ace_size, nt_size) ||
+ *nt_size > U16_MAX - (int)sizeof(struct smb_acl))
+ return -EINVAL;
+
+ (*ace_cnt)++;
+ *ace = (struct smb_ace *)((char *)*ace + ace_size);
+ return 0;
+}
+
+static int smb_validate_ntsd_sid(struct smb_ntsd *pntsd, size_t pntsd_size,
+ unsigned int sid_offset, struct smb_sid **sid,
+ size_t *sid_size)
+{
+ size_t sid_end;
+
+ *sid = NULL;
+ *sid_size = 0;
+
+ if (!sid_offset)
+ return 0;
+
+ if (sid_offset < sizeof(struct smb_ntsd) ||
+ check_add_overflow(sid_offset, (size_t)CIFS_SID_BASE_SIZE,
+ &sid_end) ||
+ sid_end > pntsd_size)
+ return -EINVAL;
+
+ *sid = (struct smb_sid *)((char *)pntsd + sid_offset);
+ if ((*sid)->num_subauth > SID_MAX_SUB_AUTHORITIES)
+ return -EINVAL;
+
+ if (check_add_overflow((size_t)CIFS_SID_BASE_SIZE,
+ sizeof(__le32) * (size_t)(*sid)->num_subauth,
+ &sid_end))
+ return -EINVAL;
+
+ if (sid_offset > pntsd_size || sid_end > pntsd_size - sid_offset)
+ return -EINVAL;
+
+ *sid_size = sid_end;
+ return 0;
}
int smb_inherit_dacl(struct ksmbd_conn *conn,
@@ -1083,28 +1142,28 @@ int smb_inherit_dacl(struct ksmbd_conn *conn,
struct dentry *parent = path->dentry->d_parent;
struct mnt_idmap *idmap = mnt_idmap(path->mnt);
int inherited_flags = 0, flags = 0, i, nt_size = 0, pdacl_size;
- int rc = 0, pntsd_type, pntsd_size, acl_len, aces_size;
+ int rc = 0, pntsd_type, ppntsd_size, acl_len, aces_size;
unsigned int dacloffset;
size_t dacl_struct_end;
u16 num_aces, ace_cnt = 0;
char *aces_base;
bool is_dir = S_ISDIR(d_inode(path->dentry)->i_mode);
- pntsd_size = ksmbd_vfs_get_sd_xattr(conn, idmap,
+ ppntsd_size = ksmbd_vfs_get_sd_xattr(conn, idmap,
parent, &parent_pntsd);
- if (pntsd_size <= 0)
+ if (ppntsd_size <= 0)
return -ENOENT;
dacloffset = le32_to_cpu(parent_pntsd->dacloffset);
if (!dacloffset ||
check_add_overflow(dacloffset, sizeof(struct smb_acl), &dacl_struct_end) ||
- dacl_struct_end > (size_t)pntsd_size) {
+ dacl_struct_end > (size_t)ppntsd_size) {
rc = -EINVAL;
goto free_parent_pntsd;
}
parent_pdacl = (struct smb_acl *)((char *)parent_pntsd + dacloffset);
- acl_len = pntsd_size - dacloffset;
+ acl_len = ppntsd_size - dacloffset;
num_aces = le16_to_cpu(parent_pdacl->num_aces);
pntsd_type = le16_to_cpu(parent_pntsd->type);
pdacl_size = le16_to_cpu(parent_pdacl->size);
@@ -1157,6 +1216,12 @@ int smb_inherit_dacl(struct ksmbd_conn *conn,
CIFS_SID_BASE_SIZE)
break;
+ if (parent_aces->sid.num_subauth > SID_MAX_SUB_AUTHORITIES ||
+ pace_size < offsetof(struct smb_ace, sid) +
+ CIFS_SID_BASE_SIZE +
+ sizeof(__le32) * parent_aces->sid.num_subauth)
+ break;
+
aces_size -= pace_size;
flags = parent_aces->flags;
@@ -1186,22 +1251,24 @@ int smb_inherit_dacl(struct ksmbd_conn *conn,
}
if (is_dir && creator && flags & CONTAINER_INHERIT_ACE) {
- smb_set_ace(aces, psid, parent_aces->type, inherited_flags,
- parent_aces->access_req);
- nt_size += le16_to_cpu(aces->size);
- ace_cnt++;
- aces = (struct smb_ace *)((char *)aces + le16_to_cpu(aces->size));
+ rc = smb_append_inherited_ace(&aces, &nt_size, &ace_cnt,
+ psid, parent_aces->type,
+ inherited_flags,
+ parent_aces->access_req);
+ if (rc)
+ goto free_aces_base;
flags |= INHERIT_ONLY_ACE;
psid = creator;
} else if (is_dir && !(parent_aces->flags & NO_PROPAGATE_INHERIT_ACE)) {
psid = &parent_aces->sid;
}
- smb_set_ace(aces, psid, parent_aces->type, flags | inherited_flags,
- parent_aces->access_req);
- nt_size += le16_to_cpu(aces->size);
- aces = (struct smb_ace *)((char *)aces + le16_to_cpu(aces->size));
- ace_cnt++;
+ rc = smb_append_inherited_ace(&aces, &nt_size, &ace_cnt, psid,
+ parent_aces->type,
+ flags | inherited_flags,
+ parent_aces->access_req);
+ if (rc)
+ goto free_aces_base;
pass:
parent_aces = (struct smb_ace *)((char *)parent_aces + pace_size);
}
@@ -1210,22 +1277,33 @@ pass:
struct smb_ntsd *pntsd;
struct smb_acl *pdacl;
struct smb_sid *powner_sid = NULL, *pgroup_sid = NULL;
- int powner_sid_size = 0, pgroup_sid_size = 0, pntsd_size;
- int pntsd_alloc_size;
+ size_t powner_sid_size = 0, pgroup_sid_size = 0, pntsd_size;
+ size_t pntsd_alloc_size;
- if (parent_pntsd->osidoffset) {
- powner_sid = (struct smb_sid *)((char *)parent_pntsd +
- le32_to_cpu(parent_pntsd->osidoffset));
- powner_sid_size = 1 + 1 + 6 + (powner_sid->num_subauth * 4);
- }
- if (parent_pntsd->gsidoffset) {
- pgroup_sid = (struct smb_sid *)((char *)parent_pntsd +
- le32_to_cpu(parent_pntsd->gsidoffset));
- pgroup_sid_size = 1 + 1 + 6 + (pgroup_sid->num_subauth * 4);
- }
+ rc = smb_validate_ntsd_sid(parent_pntsd, ppntsd_size,
+ le32_to_cpu(parent_pntsd->osidoffset),
+ &powner_sid, &powner_sid_size);
+ if (rc)
+ goto free_aces_base;
+ rc = smb_validate_ntsd_sid(parent_pntsd, ppntsd_size,
+ le32_to_cpu(parent_pntsd->gsidoffset),
+ &pgroup_sid, &pgroup_sid_size);
+ if (rc)
+ goto free_aces_base;
- pntsd_alloc_size = sizeof(struct smb_ntsd) + powner_sid_size +
- pgroup_sid_size + sizeof(struct smb_acl) + nt_size;
+ if (check_add_overflow(sizeof(struct smb_ntsd),
+ (size_t)powner_sid_size,
+ &pntsd_alloc_size) ||
+ check_add_overflow(pntsd_alloc_size,
+ (size_t)pgroup_sid_size,
+ &pntsd_alloc_size) ||
+ check_add_overflow(pntsd_alloc_size, sizeof(struct smb_acl),
+ &pntsd_alloc_size) ||
+ check_add_overflow(pntsd_alloc_size, (size_t)nt_size,
+ &pntsd_alloc_size)) {
+ rc = -EINVAL;
+ goto free_aces_base;
+ }
pntsd = kzalloc(pntsd_alloc_size, KSMBD_DEFAULT_GFP);
if (!pntsd) {
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index a8242c00096f3..b6d63ff8a8a3d 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -18,7 +18,6 @@
#include "smb_common.h"
#include "../common/smb2status.h"
#include "transport_rdma.h"
-#include "../smbdirect/public.h"
#define SMB_DIRECT_PORT_IWARP 5445
@@ -540,3 +539,5 @@ static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = {
.rdma_write = smb_direct_rdma_write,
.free_transport = smb_direct_free_transport,
};
+
+MODULE_IMPORT_NS("SMBDIRECT");
diff --git a/fs/smb/server/transport_rdma.h b/fs/smb/server/transport_rdma.h
index bde3d88aecc71..8b78917a17958 100644
--- a/fs/smb/server/transport_rdma.h
+++ b/fs/smb/server/transport_rdma.h
@@ -25,6 +25,6 @@ static inline void init_smbd_max_io_size(unsigned int sz) { }
static inline unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) { return 0; }
#endif
-#include "../smbdirect/smbdirect.h"
+#include <linux/smbdirect.h>
#endif /* __KSMBD_TRANSPORT_RDMA_H__ */
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index 3551f01a3fa03..5a232d94f567a 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -81,7 +81,7 @@ static int proc_show_files(struct seq_file *m, void *v)
read_lock(&global_ft.lock);
idr_for_each_entry(global_ft.idr, fp, id) {
seq_printf(m, "%#-10x %#-10llx %#-10llx %#-10x",
- fp->tcon->id,
+ fp->tcon ? fp->tcon->id : 0,
fp->persistent_id,
fp->volatile_id,
atomic_read(&fp->refcount));
@@ -211,7 +211,7 @@ int ksmbd_query_inode_status(struct dentry *dentry)
return ret;
down_read(&ci->m_lock);
- if (ci->m_flags & (S_DEL_PENDING | S_DEL_ON_CLS))
+ if (ci->m_flags & S_DEL_PENDING)
ret = KSMBD_INODE_STATUS_PENDING_DELETE;
else
ret = KSMBD_INODE_STATUS_OK;
@@ -227,7 +227,7 @@ bool ksmbd_inode_pending_delete(struct ksmbd_file *fp)
int ret;
down_read(&ci->m_lock);
- ret = (ci->m_flags & (S_DEL_PENDING | S_DEL_ON_CLS));
+ ret = (ci->m_flags & S_DEL_PENDING);
up_read(&ci->m_lock);
return ret;
@@ -395,12 +395,20 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp)
}
}
+ down_write(&ci->m_lock);
+ /* Promote S_DEL_ON_CLS to S_DEL_PENDING when close */
+ if (ci->m_flags & S_DEL_ON_CLS) {
+ ci->m_flags &= ~S_DEL_ON_CLS;
+ ci->m_flags |= S_DEL_PENDING;
+ }
+ up_write(&ci->m_lock);
+
if (atomic_dec_and_test(&ci->m_count)) {
bool do_unlink = false;
down_write(&ci->m_lock);
- if (ci->m_flags & (S_DEL_ON_CLS | S_DEL_PENDING)) {
- ci->m_flags &= ~(S_DEL_ON_CLS | S_DEL_PENDING);
+ if (ci->m_flags & S_DEL_PENDING) {
+ ci->m_flags &= ~S_DEL_PENDING;
do_unlink = true;
}
up_write(&ci->m_lock);
@@ -418,6 +426,14 @@ static void __ksmbd_remove_durable_fd(struct ksmbd_file *fp)
return;
idr_remove(global_ft.idr, fp->persistent_id);
+ /*
+ * Clear persistent_id so a later __ksmbd_close_fd() that runs from a
+ * delayed putter (e.g. when a concurrent ksmbd_lookup_fd_inode()
+ * walker held the final reference) does not re-issue idr_remove() on
+ * an id that idr_alloc_cyclic() may have already handed out to a new
+ * durable handle.
+ */
+ fp->persistent_id = KSMBD_NO_FID;
}
static void ksmbd_remove_durable_fd(struct ksmbd_file *fp)
@@ -431,13 +447,13 @@ static void ksmbd_remove_durable_fd(struct ksmbd_file *fp)
static void __ksmbd_remove_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp)
{
- if (!has_file_id(fp->volatile_id))
- return;
-
down_write(&fp->f_ci->m_lock);
list_del_init(&fp->node);
up_write(&fp->f_ci->m_lock);
+ if (!has_file_id(fp->volatile_id))
+ return;
+
write_lock(&ft->lock);
idr_remove(ft->idr, fp->volatile_id);
write_unlock(&ft->lock);
@@ -475,6 +491,17 @@ static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp)
kfree(smb_lock);
}
+ /*
+ * Drop fp's strong reference on conn (taken in ksmbd_open_fd() /
+ * ksmbd_reopen_durable_fd()). Durable fps that reached the
+ * scavenger have already had fp->conn cleared by session_fd_check(),
+ * in which case there is nothing to drop here.
+ */
+ if (fp->conn) {
+ ksmbd_conn_put(fp->conn);
+ fp->conn = NULL;
+ }
+
if (ksmbd_stream_fd(fp))
kfree(fp->stream.name);
kfree(fp->owner.name);
@@ -510,6 +537,20 @@ static struct ksmbd_file *__ksmbd_lookup_fd(struct ksmbd_file_table *ft,
static void __put_fd_final(struct ksmbd_work *work, struct ksmbd_file *fp)
{
+ /*
+ * Detached durable fp -- session_fd_check() cleared fp->conn at
+ * preserve, so this fp is no longer tracked by any conn's
+ * stats.open_files_count. This happens when
+ * ksmbd_scavenger_dispose_dh() hands the final close off to an
+ * m_fp_list walker (e.g. ksmbd_lookup_fd_inode()) whose work->conn
+ * is unrelated to the conn that originally opened the handle; close
+ * via the NULL-ft path so we do not underflow that unrelated
+ * counter.
+ */
+ if (!fp->conn) {
+ __ksmbd_close_fd(NULL, fp);
+ return;
+ }
__ksmbd_close_fd(&work->sess->file_table, fp);
atomic_dec(&work->conn->stats.open_files_count);
}
@@ -752,7 +793,14 @@ struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp)
atomic_set(&fp->refcount, 1);
fp->filp = filp;
- fp->conn = work->conn;
+ /*
+ * fp owns a strong reference on fp->conn for as long as fp->conn is
+ * non-NULL, so session_fd_check() and __ksmbd_close_fd() never
+ * dereference a dangling pointer. Paired with ksmbd_conn_put() in
+ * session_fd_check() (durable preserve), in __ksmbd_close_fd()
+ * (final close), and on the error paths below.
+ */
+ fp->conn = ksmbd_conn_get(work->conn);
fp->tcon = work->tcon;
fp->volatile_id = KSMBD_NO_FID;
fp->persistent_id = KSMBD_NO_FID;
@@ -774,19 +822,64 @@ struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp)
return fp;
err_out:
+ /* fp->conn was set and refcounted before every branch here. */
+ ksmbd_conn_put(fp->conn);
kmem_cache_free(filp_cache, fp);
return ERR_PTR(ret);
}
-void ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp,
- unsigned int state)
+/**
+ * ksmbd_update_fstate() - update an fp state under the file-table lock
+ * @ft: file table that publishes @fp's volatile id
+ * @fp: file pointer to update
+ * @state: new state
+ *
+ * Return: 0 on success. The FP_NEW -> FP_INITED transition is special:
+ * -ENOENT if teardown already unpublished @fp by advancing the state or
+ * clearing the volatile id. Other state updates preserve the historical
+ * fire-and-forget behavior.
+ */
+int ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp,
+ unsigned int state)
{
+ int ret;
+
if (!fp)
- return;
+ return -ENOENT;
write_lock(&ft->lock);
- fp->f_state = state;
+ if (state == FP_INITED &&
+ (fp->f_state != FP_NEW || !has_file_id(fp->volatile_id))) {
+ ret = -ENOENT;
+ } else {
+ fp->f_state = state;
+ ret = 0;
+ }
write_unlock(&ft->lock);
+
+ return ret;
+}
+
+/*
+ * ksmbd_mark_fp_closed() - mark fp closed under ft->lock and return how many
+ * refs the teardown path owns.
+ *
+ * FP_INITED has a normal idr-owned reference, so teardown owns both that
+ * reference and the transient lookup reference. FP_NEW is still owned by the
+ * in-flight opener/reopener, which will drop the original reference after
+ * ksmbd_update_fstate(..., FP_INITED) observes the cleared volatile id.
+ * FP_CLOSED on entry means an earlier ksmbd_close_fd() already consumed the
+ * idr-owned ref.
+ */
+static int ksmbd_mark_fp_closed(struct ksmbd_file *fp)
+{
+ if (fp->f_state == FP_INITED) {
+ set_close_state_blocked_works(fp);
+ fp->f_state = FP_CLOSED;
+ return 2;
+ }
+
+ return 1;
}
static int
@@ -794,7 +887,8 @@ __close_file_table_ids(struct ksmbd_session *sess,
struct ksmbd_tree_connect *tcon,
bool (*skip)(struct ksmbd_tree_connect *tcon,
struct ksmbd_file *fp,
- struct ksmbd_user *user))
+ struct ksmbd_user *user),
+ bool skip_preserves_fp)
{
struct ksmbd_file_table *ft = &sess->file_table;
struct ksmbd_file *fp;
@@ -802,32 +896,120 @@ __close_file_table_ids(struct ksmbd_session *sess,
int num = 0;
while (1) {
+ int n_to_drop;
+
write_lock(&ft->lock);
fp = idr_get_next(ft->idr, &id);
if (!fp) {
write_unlock(&ft->lock);
break;
}
-
- if (skip(tcon, fp, sess->user) ||
- !atomic_dec_and_test(&fp->refcount)) {
+ if (!atomic_inc_not_zero(&fp->refcount)) {
id++;
write_unlock(&ft->lock);
continue;
}
- set_close_state_blocked_works(fp);
- idr_remove(ft->idr, fp->volatile_id);
- fp->volatile_id = KSMBD_NO_FID;
- write_unlock(&ft->lock);
+ if (skip_preserves_fp) {
+ /*
+ * Session teardown: skip() is session_fd_check(),
+ * which may sleep and mutates fp->conn / fp->tcon /
+ * fp->volatile_id when it chooses to preserve fp
+ * for durable reconnect. Unpublish fp from the
+ * session idr here, under ft->lock, so that
+ * __ksmbd_lookup_fd() through this session cannot
+ * grant a new ksmbd_fp_get() reference to an fp
+ * whose fields are about to be rewritten outside
+ * the lock. Durable reconnect still reaches fp via
+ * global_ft.
+ */
+ idr_remove(ft->idr, id);
+ fp->volatile_id = KSMBD_NO_FID;
+ write_unlock(&ft->lock);
+ if (skip(tcon, fp, sess->user)) {
+ /*
+ * session_fd_check() has converted fp to
+ * durable-preserve state and cleared its
+ * per-conn fields. fp is already unpublished
+ * above; the original idr-owned ref keeps it
+ * alive for the durable scavenger. Drop only
+ * the transient ref. atomic_dec() is safe --
+ * atomic_inc_not_zero() succeeded on a
+ * positive value and we added one more, so
+ * refcount cannot be zero here.
+ */
+ atomic_dec(&fp->refcount);
+ id++;
+ continue;
+ }
+
+ /*
+ * Keep the close-state decision under the same lock
+ * observed by ksmbd_update_fstate(), which is how an
+ * in-flight FP_NEW opener learns that teardown has
+ * cleared its volatile id.
+ */
+ write_lock(&ft->lock);
+ n_to_drop = ksmbd_mark_fp_closed(fp);
+ write_unlock(&ft->lock);
+ } else {
+ /*
+ * Tree teardown: skip() is tree_conn_fd_check(), a
+ * cheap pointer compare that doesn't sleep and has
+ * no side effects, so keep the skip decision plus
+ * the unpublish-and-mark-closed sequence atomic
+ * under ft->lock. fps belonging to other tree
+ * connects (skip() == true) stay fully published in
+ * the session idr with no lock window.
+ */
+ if (skip(tcon, fp, sess->user)) {
+ atomic_dec(&fp->refcount);
+ write_unlock(&ft->lock);
+ id++;
+ continue;
+ }
+ idr_remove(ft->idr, id);
+ fp->volatile_id = KSMBD_NO_FID;
+ n_to_drop = ksmbd_mark_fp_closed(fp);
+ write_unlock(&ft->lock);
+ }
+
+ /*
+ * fp->volatile_id is already cleared to prevent stale idr
+ * removal from a deferred final close. Remove fp from
+ * m_fp_list here because __ksmbd_remove_fd() will skip the
+ * list unlink when volatile_id is KSMBD_NO_FID.
+ */
down_write(&fp->f_ci->m_lock);
list_del_init(&fp->node);
up_write(&fp->f_ci->m_lock);
- __ksmbd_close_fd(ft, fp);
-
- num++;
+ /*
+ * Drop the references this iteration owns:
+ *
+ * n_to_drop == 2: we observed FP_INITED and committed
+ * the FP_CLOSED transition ourselves, so we own the
+ * transient (+1) and the still-intact idr-owned ref.
+ *
+ * n_to_drop == 1: either a prior ksmbd_close_fd()
+ * already consumed the idr-owned ref, or fp was still
+ * FP_NEW and the in-flight opener/reopener must keep
+ * the original reference until ksmbd_update_fstate()
+ * observes the cleared volatile id.
+ *
+ * If we end up as the final putter, finalize fp and
+ * account the open_files_count decrement via the caller's
+ * atomic_sub(num, ...). Otherwise the remaining user's
+ * ksmbd_fd_put() reaches __put_fd_final(), which does its
+ * own atomic_dec(&open_files_count), so we must not count
+ * this fp here -- doing so would double-decrement the
+ * connection-wide counter.
+ */
+ if (atomic_sub_and_test(n_to_drop, &fp->refcount)) {
+ __ksmbd_close_fd(NULL, fp);
+ num++;
+ }
id++;
}
@@ -881,24 +1063,37 @@ static bool ksmbd_durable_scavenger_alive(void)
return true;
}
-static void ksmbd_scavenger_dispose_dh(struct list_head *head)
+static void ksmbd_scavenger_dispose_dh(struct ksmbd_file *fp)
{
- while (!list_empty(head)) {
- struct ksmbd_file *fp;
+ /*
+ * Durable-preserved fp can remain linked on f_ci->m_fp_list for
+ * share-mode checks. Unlink it before final close; fp->node is not
+ * available as a scavenger-private list node because re-adding it to
+ * another list corrupts m_fp_list.
+ */
+ down_write(&fp->f_ci->m_lock);
+ list_del_init(&fp->node);
+ up_write(&fp->f_ci->m_lock);
- fp = list_first_entry(head, struct ksmbd_file, node);
- list_del_init(&fp->node);
+ /*
+ * Drop both the durable lifetime reference and the transient reference
+ * taken by the scavenger under global_ft.lock. If a concurrent
+ * ksmbd_lookup_fd_inode() (or any other m_fp_list walker) snatched fp
+ * before the unlink above, that holder owns the final close via
+ * ksmbd_fd_put() -> __ksmbd_close_fd(). Otherwise the scavenger is
+ * the last putter and finalises fp here.
+ */
+ if (atomic_sub_and_test(2, &fp->refcount))
__ksmbd_close_fd(NULL, fp);
- }
}
static int ksmbd_durable_scavenger(void *dummy)
{
struct ksmbd_file *fp = NULL;
+ struct ksmbd_file *expired_fp;
unsigned int id;
unsigned int min_timeout = 1;
bool found_fp_timeout;
- LIST_HEAD(scavenger_list);
unsigned long remaining_jiffies;
__module_get(THIS_MODULE);
@@ -908,8 +1103,6 @@ static int ksmbd_durable_scavenger(void *dummy)
if (try_to_freeze())
continue;
- found_fp_timeout = false;
-
remaining_jiffies = wait_event_timeout(dh_wq,
ksmbd_durable_scavenger_alive() == false,
__msecs_to_jiffies(min_timeout));
@@ -918,23 +1111,39 @@ static int ksmbd_durable_scavenger(void *dummy)
else
min_timeout = DURABLE_HANDLE_MAX_TIMEOUT;
- write_lock(&global_ft.lock);
- idr_for_each_entry(global_ft.idr, fp, id) {
- if (!fp->durable_timeout)
- continue;
-
- if (atomic_read(&fp->refcount) > 1 ||
- fp->conn)
- continue;
+ do {
+ expired_fp = NULL;
+ found_fp_timeout = false;
- found_fp_timeout = true;
- if (fp->durable_scavenger_timeout <=
- jiffies_to_msecs(jiffies)) {
- __ksmbd_remove_durable_fd(fp);
- list_add(&fp->node, &scavenger_list);
- } else {
+ write_lock(&global_ft.lock);
+ idr_for_each_entry(global_ft.idr, fp, id) {
unsigned long durable_timeout;
+ if (!fp->durable_timeout)
+ continue;
+
+ if (atomic_read(&fp->refcount) > 1 ||
+ fp->conn)
+ continue;
+
+ found_fp_timeout = true;
+ if (fp->durable_scavenger_timeout <=
+ jiffies_to_msecs(jiffies)) {
+ __ksmbd_remove_durable_fd(fp);
+ /*
+ * Take a transient reference so fp
+ * cannot be freed by an in-flight
+ * ksmbd_lookup_fd_inode() that found
+ * it through f_ci->m_fp_list while we
+ * drop global_ft.lock and reach the
+ * m_fp_list unlink in
+ * ksmbd_scavenger_dispose_dh().
+ */
+ atomic_inc(&fp->refcount);
+ expired_fp = fp;
+ break;
+ }
+
durable_timeout =
fp->durable_scavenger_timeout -
jiffies_to_msecs(jiffies);
@@ -942,10 +1151,11 @@ static int ksmbd_durable_scavenger(void *dummy)
if (min_timeout > durable_timeout)
min_timeout = durable_timeout;
}
- }
- write_unlock(&global_ft.lock);
+ write_unlock(&global_ft.lock);
- ksmbd_scavenger_dispose_dh(&scavenger_list);
+ if (expired_fp)
+ ksmbd_scavenger_dispose_dh(expired_fp);
+ } while (expired_fp);
if (found_fp_timeout == false)
break;
@@ -1062,25 +1272,35 @@ static bool session_fd_check(struct ksmbd_tree_connect *tcon,
if (!is_reconnectable(fp))
return false;
+ if (fp->f_state != FP_INITED)
+ return false;
+
+ if (WARN_ON_ONCE(!fp->conn))
+ return false;
+
if (ksmbd_vfs_copy_durable_owner(fp, user))
return false;
+ /*
+ * fp owns a strong reference on fp->conn (taken in ksmbd_open_fd()
+ * / ksmbd_reopen_durable_fd()), so conn stays valid for the whole
+ * body of this function regardless of any op->conn puts below.
+ */
conn = fp->conn;
ci = fp->f_ci;
down_write(&ci->m_lock);
list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) {
if (op->conn != conn)
continue;
- if (op->conn && atomic_dec_and_test(&op->conn->refcnt))
- kfree(op->conn);
+ ksmbd_conn_put(op->conn);
op->conn = NULL;
}
up_write(&ci->m_lock);
list_for_each_entry_safe(smb_lock, tmp_lock, &fp->lock_list, flist) {
- spin_lock(&fp->conn->llist_lock);
+ spin_lock(&conn->llist_lock);
list_del_init(&smb_lock->clist);
- spin_unlock(&fp->conn->llist_lock);
+ spin_unlock(&conn->llist_lock);
}
fp->conn = NULL;
@@ -1091,6 +1311,8 @@ static bool session_fd_check(struct ksmbd_tree_connect *tcon,
fp->durable_scavenger_timeout =
jiffies_to_msecs(jiffies) + fp->durable_timeout;
+ /* Drop fp's own reference on conn. */
+ ksmbd_conn_put(conn);
return true;
}
@@ -1098,7 +1320,8 @@ void ksmbd_close_tree_conn_fds(struct ksmbd_work *work)
{
int num = __close_file_table_ids(work->sess,
work->tcon,
- tree_conn_fd_check);
+ tree_conn_fd_check,
+ false);
atomic_sub(num, &work->conn->stats.open_files_count);
}
@@ -1107,7 +1330,8 @@ void ksmbd_close_session_fds(struct ksmbd_work *work)
{
int num = __close_file_table_ids(work->sess,
work->tcon,
- session_fd_check);
+ session_fd_check,
+ true);
atomic_sub(num, &work->conn->stats.open_files_count);
}
@@ -1178,15 +1402,27 @@ int ksmbd_reopen_durable_fd(struct ksmbd_work *work, struct ksmbd_file *fp)
old_f_state = fp->f_state;
fp->f_state = FP_NEW;
+
+ /*
+ * Initialize fp's connection binding before publishing fp into the
+ * session's file table. If __open_id() is ordered first, a
+ * concurrent teardown that iterates the table can observe a valid
+ * volatile_id with fp->conn == NULL and preserve a
+ * partially-initialized fp. fp owns a strong reference on the new
+ * conn (see ksmbd_open_fd()); undo it on __open_id() failure.
+ */
+ fp->conn = ksmbd_conn_get(conn);
+ fp->tcon = work->tcon;
+
__open_id(&work->sess->file_table, fp, OPEN_ID_TYPE_VOLATILE_ID);
if (!has_file_id(fp->volatile_id)) {
+ fp->conn = NULL;
+ fp->tcon = NULL;
+ ksmbd_conn_put(conn);
fp->f_state = old_f_state;
return -EBADF;
}
- fp->conn = conn;
- fp->tcon = work->tcon;
-
list_for_each_entry(smb_lock, &fp->lock_list, flist) {
spin_lock(&conn->llist_lock);
list_add_tail(&smb_lock->clist, &conn->lock_list);
@@ -1198,8 +1434,7 @@ int ksmbd_reopen_durable_fd(struct ksmbd_work *work, struct ksmbd_file *fp)
list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) {
if (op->conn)
continue;
- op->conn = fp->conn;
- atomic_inc(&op->conn->refcnt);
+ op->conn = ksmbd_conn_get(fp->conn);
}
up_write(&ci->m_lock);
@@ -1228,7 +1463,7 @@ void ksmbd_destroy_file_table(struct ksmbd_session *sess)
if (!ft->idr)
return;
- __close_file_table_ids(sess, NULL, session_fd_check);
+ __close_file_table_ids(sess, NULL, session_fd_check, true);
idr_destroy(ft->idr);
kfree(ft->idr);
ft->idr = NULL;
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index 866f32c10d4dd..e6871266a94ba 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -172,8 +172,8 @@ int ksmbd_close_inode_fds(struct ksmbd_work *work, struct inode *inode);
int ksmbd_init_global_file_table(void);
void ksmbd_free_global_file_table(void);
void ksmbd_set_fd_limit(unsigned long limit);
-void ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp,
- unsigned int state);
+int ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp,
+ unsigned int state);
bool ksmbd_vfs_compare_durable_owner(struct ksmbd_file *fp,
struct ksmbd_user *user);
diff --git a/fs/smb/smbdirect/accept.c b/fs/smb/smbdirect/accept.c
index 704b271af3a8c..5297400058385 100644
--- a/fs/smb/smbdirect/accept.c
+++ b/fs/smb/smbdirect/accept.c
@@ -854,4 +854,4 @@ struct smbdirect_socket *smbdirect_socket_accept(struct smbdirect_socket *lsc,
return nsc;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_accept);
+EXPORT_SYMBOL_GPL(smbdirect_socket_accept);
diff --git a/fs/smb/smbdirect/connect.c b/fs/smb/smbdirect/connect.c
index 8addee43a3811..cd726b399afec 100644
--- a/fs/smb/smbdirect/connect.c
+++ b/fs/smb/smbdirect/connect.c
@@ -60,7 +60,7 @@ int smbdirect_connect(struct smbdirect_socket *sc, const struct sockaddr *dst)
*/
return 0;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connect);
+EXPORT_SYMBOL_GPL(smbdirect_connect);
static int smbdirect_connect_setup_connection(struct smbdirect_socket *sc)
{
@@ -922,4 +922,4 @@ int smbdirect_connect_sync(struct smbdirect_socket *sc,
return 0;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connect_sync);
+EXPORT_SYMBOL_GPL(smbdirect_connect_sync);
diff --git a/fs/smb/smbdirect/connection.c b/fs/smb/smbdirect/connection.c
index 822366718d457..8adf580975344 100644
--- a/fs/smb/smbdirect/connection.c
+++ b/fs/smb/smbdirect/connection.c
@@ -706,7 +706,7 @@ bool smbdirect_connection_is_connected(struct smbdirect_socket *sc)
return false;
return true;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_is_connected);
+EXPORT_SYMBOL_GPL(smbdirect_connection_is_connected);
int smbdirect_connection_wait_for_connected(struct smbdirect_socket *sc)
{
@@ -779,7 +779,7 @@ int smbdirect_connection_wait_for_connected(struct smbdirect_socket *sc)
return 0;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_wait_for_connected);
+EXPORT_SYMBOL_GPL(smbdirect_connection_wait_for_connected);
void smbdirect_connection_idle_timer_work(struct work_struct *work)
{
@@ -958,7 +958,7 @@ release_credit:
return ret;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_batch_flush);
+EXPORT_SYMBOL_GPL(smbdirect_connection_send_batch_flush);
struct smbdirect_send_batch *
smbdirect_init_send_batch_storage(struct smbdirect_send_batch_storage *storage,
@@ -976,7 +976,7 @@ smbdirect_init_send_batch_storage(struct smbdirect_send_batch_storage *storage,
return batch;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_init_send_batch_storage);
+EXPORT_SYMBOL_GPL(smbdirect_init_send_batch_storage);
static int smbdirect_connection_wait_for_send_bcredit(struct smbdirect_socket *sc,
struct smbdirect_send_batch *batch)
@@ -1263,7 +1263,7 @@ lcredit_failed:
bcredit_failed:
return ret;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_single_iter);
+EXPORT_SYMBOL_GPL(smbdirect_connection_send_single_iter);
int smbdirect_connection_send_wait_zero_pending(struct smbdirect_socket *sc)
{
@@ -1288,7 +1288,7 @@ int smbdirect_connection_send_wait_zero_pending(struct smbdirect_socket *sc)
return 0;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_wait_zero_pending);
+EXPORT_SYMBOL_GPL(smbdirect_connection_send_wait_zero_pending);
int smbdirect_connection_send_iter(struct smbdirect_socket *sc,
struct iov_iter *iter,
@@ -1373,7 +1373,7 @@ int smbdirect_connection_send_iter(struct smbdirect_socket *sc,
return total_count;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_iter);
+EXPORT_SYMBOL_GPL(smbdirect_connection_send_iter);
static void smbdirect_connection_send_io_done(struct ib_cq *cq, struct ib_wc *wc)
{
@@ -1937,7 +1937,7 @@ read_rfc1002_done:
goto again;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_recvmsg);
+EXPORT_SYMBOL_GPL(smbdirect_connection_recvmsg);
static bool smbdirect_map_sges_single_page(struct smbdirect_map_sges *state,
struct page *page, size_t off, size_t len)
@@ -2168,7 +2168,7 @@ static ssize_t smbdirect_map_sges_from_iter(struct iov_iter *iter, size_t len,
if (ret < 0) {
while (state->num_sge > before) {
- struct ib_sge *sge = &state->sge[state->num_sge--];
+ struct ib_sge *sge = &state->sge[--state->num_sge];
ib_dma_unmap_page(state->device,
sge->addr,
diff --git a/fs/smb/smbdirect/debug.c b/fs/smb/smbdirect/debug.c
index a66a19d4a4634..3445843445bff 100644
--- a/fs/smb/smbdirect/debug.c
+++ b/fs/smb/smbdirect/debug.c
@@ -40,7 +40,7 @@ void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc,
seq_puts(m, "\n");
seq_printf(m, "Conn keep_alive_interval: %u ",
- sp->keepalive_interval_msec * 1000);
+ sp->keepalive_interval_msec / 1000);
seq_printf(m, "max_readwrite_size: %u rdma_readwrite_threshold: %u",
sp->max_read_write_size,
rdma_readwrite_threshold);
@@ -85,4 +85,4 @@ void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc,
atomic_read(&sc->mr_io.ready.count),
atomic_read(&sc->mr_io.used.count));
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_legacy_debug_proc_show);
+EXPORT_SYMBOL_GPL(smbdirect_connection_legacy_debug_proc_show);
diff --git a/fs/smb/smbdirect/devices.c b/fs/smb/smbdirect/devices.c
index 44962f221c352..7adacbdfe12e7 100644
--- a/fs/smb/smbdirect/devices.c
+++ b/fs/smb/smbdirect/devices.c
@@ -238,7 +238,7 @@ u8 smbdirect_netdev_rdma_capable_node_type(struct net_device *netdev)
return RDMA_NODE_UNSPECIFIED;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_netdev_rdma_capable_node_type);
+EXPORT_SYMBOL_GPL(smbdirect_netdev_rdma_capable_node_type);
__init int smbdirect_devices_init(void)
{
diff --git a/fs/smb/smbdirect/internal.h b/fs/smb/smbdirect/internal.h
index 2d5acf2c21bc5..e9959e6dc13ae 100644
--- a/fs/smb/smbdirect/internal.h
+++ b/fs/smb/smbdirect/internal.h
@@ -6,11 +6,11 @@
#ifndef __FS_SMB_COMMON_SMBDIRECT_INTERNAL_H__
#define __FS_SMB_COMMON_SMBDIRECT_INTERNAL_H__
+#define DEFAULT_SYMBOL_NAMESPACE "SMBDIRECT"
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include "smbdirect.h"
+#include <linux/smbdirect.h>
#include "pdu.h"
-#include "public.h"
#include <linux/mutex.h>
diff --git a/fs/smb/smbdirect/listen.c b/fs/smb/smbdirect/listen.c
index 143a7618d95f3..2f78bcaedbf82 100644
--- a/fs/smb/smbdirect/listen.c
+++ b/fs/smb/smbdirect/listen.c
@@ -90,7 +90,7 @@ int smbdirect_socket_listen(struct smbdirect_socket *sc, int backlog)
*/
return 0;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_listen);
+EXPORT_SYMBOL_GPL(smbdirect_socket_listen);
static int smbdirect_new_rdma_event_handler(struct rdma_cm_id *new_id,
struct rdma_cm_event *event)
diff --git a/fs/smb/smbdirect/mr.c b/fs/smb/smbdirect/mr.c
index 5228e699cd5d4..15c6363a2f97a 100644
--- a/fs/smb/smbdirect/mr.c
+++ b/fs/smb/smbdirect/mr.c
@@ -269,7 +269,7 @@ smbdirect_connection_register_mr_io(struct smbdirect_socket *sc,
{
const struct smbdirect_socket_parameters *sp = &sc->parameters;
struct smbdirect_mr_io *mr;
- int ret, num_pages;
+ int ret, num_pages, num_mapped;
struct ib_reg_wr *reg_wr;
num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1);
@@ -300,19 +300,22 @@ smbdirect_connection_register_mr_io(struct smbdirect_socket *sc,
num_pages, iov_iter_count(iter), sp->max_frmr_depth);
smbdirect_iter_to_sgt(iter, &mr->sgt, sp->max_frmr_depth);
- ret = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
- if (!ret) {
+ num_mapped = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
+ if (!num_mapped) {
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
- "ib_dma_map_sg num_pages=%u dir=%x ret=%d (%1pe)\n",
- num_pages, mr->dir, ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
+ "ib_dma_map_sg num_pages=%u dir=%x num_mapped=%d\n",
+ num_pages, mr->dir, num_mapped);
+ ret = -EIO;
goto dma_map_error;
}
- ret = ib_map_mr_sg(mr->mr, mr->sgt.sgl, mr->sgt.nents, NULL, PAGE_SIZE);
- if (ret != mr->sgt.nents) {
+ ret = ib_map_mr_sg(mr->mr, mr->sgt.sgl, num_mapped, NULL, PAGE_SIZE);
+ if (ret != num_mapped) {
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
- "ib_map_mr_sg failed ret = %d nents = %u\n",
- ret, mr->sgt.nents);
+ "ib_map_mr_sg failed ret = %d num_mapped = %u\n",
+ ret, num_mapped);
+ if (ret >= 0)
+ ret = -EIO;
goto map_mr_error;
}
@@ -380,7 +383,7 @@ dma_map_error:
mutex_unlock(&mr->mutex);
return NULL;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_register_mr_io);
+EXPORT_SYMBOL_GPL(smbdirect_connection_register_mr_io);
void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
struct smbdirect_buffer_descriptor_v1 *v1)
@@ -397,7 +400,7 @@ void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
}
mutex_unlock(&mr->mutex);
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_mr_io_fill_buffer_descriptor);
+EXPORT_SYMBOL_GPL(smbdirect_mr_io_fill_buffer_descriptor);
/*
* Deregister a MR after I/O is done
@@ -490,4 +493,4 @@ put_kref:
if (!kref_put(&mr->kref, smbdirect_mr_io_free_locked))
mutex_unlock(&mr->mutex);
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_deregister_mr_io);
+EXPORT_SYMBOL_GPL(smbdirect_connection_deregister_mr_io);
diff --git a/fs/smb/smbdirect/public.h b/fs/smb/smbdirect/public.h
deleted file mode 100644
index 50088155e7c37..0000000000000
--- a/fs/smb/smbdirect/public.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Copyright (C) 2025, Stefan Metzmacher
- */
-
-#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__
-#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__
-
-struct smbdirect_buffer_descriptor_v1;
-struct smbdirect_socket_parameters;
-
-struct smbdirect_socket;
-struct smbdirect_send_batch;
-struct smbdirect_mr_io;
-
-#define __SMBDIRECT_EXPORT_SYMBOL__(__sym) EXPORT_SYMBOL_FOR_MODULES(__sym, "cifs,ksmbd")
-
-#include <rdma/rw.h>
-
-u8 smbdirect_netdev_rdma_capable_node_type(struct net_device *netdev);
-
-bool smbdirect_frwr_is_supported(const struct ib_device_attr *attrs);
-
-int smbdirect_socket_create_kern(struct net *net, struct smbdirect_socket **_sc);
-
-int smbdirect_socket_create_accepting(struct rdma_cm_id *id, struct smbdirect_socket **_sc);
-
-int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc,
- const struct smbdirect_socket_parameters *sp);
-
-const struct smbdirect_socket_parameters *
-smbdirect_socket_get_current_parameters(struct smbdirect_socket *sc);
-
-int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc,
- enum ib_poll_context poll_ctx,
- gfp_t gfp_mask);
-
-#define SMBDIRECT_LOG_ERR 0x0
-#define SMBDIRECT_LOG_INFO 0x1
-
-#define SMBDIRECT_LOG_OUTGOING 0x1
-#define SMBDIRECT_LOG_INCOMING 0x2
-#define SMBDIRECT_LOG_READ 0x4
-#define SMBDIRECT_LOG_WRITE 0x8
-#define SMBDIRECT_LOG_RDMA_SEND 0x10
-#define SMBDIRECT_LOG_RDMA_RECV 0x20
-#define SMBDIRECT_LOG_KEEP_ALIVE 0x40
-#define SMBDIRECT_LOG_RDMA_EVENT 0x80
-#define SMBDIRECT_LOG_RDMA_MR 0x100
-#define SMBDIRECT_LOG_RDMA_RW 0x200
-#define SMBDIRECT_LOG_NEGOTIATE 0x400
-void smbdirect_socket_set_logging(struct smbdirect_socket *sc,
- void *private_ptr,
- bool (*needed)(struct smbdirect_socket *sc,
- void *private_ptr,
- unsigned int lvl,
- unsigned int cls),
- void (*vaprintf)(struct smbdirect_socket *sc,
- const char *func,
- unsigned int line,
- void *private_ptr,
- unsigned int lvl,
- unsigned int cls,
- struct va_format *vaf));
-
-bool smbdirect_connection_is_connected(struct smbdirect_socket *sc);
-
-int smbdirect_connection_wait_for_connected(struct smbdirect_socket *sc);
-
-int smbdirect_socket_bind(struct smbdirect_socket *sc, struct sockaddr *addr);
-
-void smbdirect_socket_shutdown(struct smbdirect_socket *sc);
-
-void smbdirect_socket_release(struct smbdirect_socket *sc);
-
-int smbdirect_connection_send_batch_flush(struct smbdirect_socket *sc,
- struct smbdirect_send_batch *batch,
- bool is_last);
-
-/*
- * This is only temporary and only needed
- * as long as the client still requires
- * to use smbdirect_connection_send_single_iter()
- */
-struct smbdirect_send_batch_storage {
- union {
- struct list_head __msg_list;
- __aligned_u64 __space[5];
- };
-};
-
-struct smbdirect_send_batch *
-smbdirect_init_send_batch_storage(struct smbdirect_send_batch_storage *storage,
- bool need_invalidate_rkey,
- unsigned int remote_key);
-
-int smbdirect_connection_send_single_iter(struct smbdirect_socket *sc,
- struct smbdirect_send_batch *batch,
- struct iov_iter *iter,
- unsigned int flags,
- u32 remaining_data_length);
-
-int smbdirect_connection_send_wait_zero_pending(struct smbdirect_socket *sc);
-
-int smbdirect_connection_send_iter(struct smbdirect_socket *sc,
- struct iov_iter *iter,
- unsigned int flags,
- bool need_invalidate,
- unsigned int remote_key);
-
-int smbdirect_connection_recvmsg(struct smbdirect_socket *sc,
- struct msghdr *msg,
- unsigned int flags);
-
-int smbdirect_connect(struct smbdirect_socket *sc,
- const struct sockaddr *dst);
-
-int smbdirect_connect_sync(struct smbdirect_socket *sc,
- const struct sockaddr *dst);
-
-int smbdirect_socket_listen(struct smbdirect_socket *sc, int backlog);
-
-struct smbdirect_socket *smbdirect_socket_accept(struct smbdirect_socket *lsc,
- long timeo,
- struct proto_accept_arg *arg);
-
-int smbdirect_connection_rdma_xmit(struct smbdirect_socket *sc,
- void *buf, size_t buf_len,
- struct smbdirect_buffer_descriptor_v1 *desc,
- size_t desc_len,
- bool is_read);
-
-struct smbdirect_mr_io *
-smbdirect_connection_register_mr_io(struct smbdirect_socket *sc,
- struct iov_iter *iter,
- bool writing,
- bool need_invalidate);
-
-void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
- struct smbdirect_buffer_descriptor_v1 *v1);
-
-void smbdirect_connection_deregister_mr_io(struct smbdirect_mr_io *mr);
-
-void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc,
- unsigned int rdma_readwrite_threshold,
- struct seq_file *m);
-
-#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__ */
diff --git a/fs/smb/smbdirect/rw.c b/fs/smb/smbdirect/rw.c
index c2f46b17731ec..6fe38042cfb96 100644
--- a/fs/smb/smbdirect/rw.c
+++ b/fs/smb/smbdirect/rw.c
@@ -252,4 +252,4 @@ free_msg:
kfree(msg);
goto out;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_rdma_xmit);
+EXPORT_SYMBOL_GPL(smbdirect_connection_rdma_xmit);
diff --git a/fs/smb/smbdirect/smbdirect.h b/fs/smb/smbdirect/smbdirect.h
deleted file mode 100644
index bbab5f7f7cc9b..0000000000000
--- a/fs/smb/smbdirect/smbdirect.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Copyright (C) 2025 Stefan Metzmacher
- */
-
-#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__
-#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__
-
-#include <linux/types.h>
-
-/* SMB-DIRECT buffer descriptor V1 structure [MS-SMBD] 2.2.3.1 */
-struct smbdirect_buffer_descriptor_v1 {
- __le64 offset;
- __le32 token;
- __le32 length;
-} __packed;
-
-/*
- * Connection parameters mostly from [MS-SMBD] 3.1.1.1
- *
- * These are setup and negotiated at the beginning of a
- * connection and remain constant unless explicitly changed.
- *
- * Some values are important for the upper layer.
- */
-struct smbdirect_socket_parameters {
- __u64 flags;
-#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB ((__u64)0x1)
-#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW ((__u64)0x2)
- __u32 resolve_addr_timeout_msec;
- __u32 resolve_route_timeout_msec;
- __u32 rdma_connect_timeout_msec;
- __u32 negotiate_timeout_msec;
- __u16 initiator_depth; /* limited to U8_MAX */
- __u16 responder_resources; /* limited to U8_MAX */
- __u16 recv_credit_max;
- __u16 send_credit_target;
- __u32 max_send_size;
- __u32 max_fragmented_send_size;
- __u32 max_recv_size;
- __u32 max_fragmented_recv_size;
- __u32 max_read_write_size;
- __u32 max_frmr_depth;
- __u32 keepalive_interval_msec;
- __u32 keepalive_timeout_msec;
-} __packed;
-
-#define SMBDIRECT_FLAG_PORT_RANGE_MASK ( \
- SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB | \
- SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW)
-
-#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ */
diff --git a/fs/smb/smbdirect/socket.c b/fs/smb/smbdirect/socket.c
index 1b4ab01b745e6..39cca7219c4df 100644
--- a/fs/smb/smbdirect/socket.c
+++ b/fs/smb/smbdirect/socket.c
@@ -20,7 +20,7 @@ bool smbdirect_frwr_is_supported(const struct ib_device_attr *attrs)
return false;
return true;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_frwr_is_supported);
+EXPORT_SYMBOL_GPL(smbdirect_frwr_is_supported);
static void smbdirect_socket_cleanup_work(struct work_struct *work);
@@ -107,7 +107,7 @@ init_failed:
alloc_failed:
return ret;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_create_kern);
+EXPORT_SYMBOL_GPL(smbdirect_socket_create_kern);
int smbdirect_socket_init_accepting(struct rdma_cm_id *id, struct smbdirect_socket *sc)
{
@@ -148,7 +148,7 @@ init_failed:
alloc_failed:
return ret;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_create_accepting);
+EXPORT_SYMBOL_GPL(smbdirect_socket_create_accepting);
int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc,
const struct smbdirect_socket_parameters *sp)
@@ -189,14 +189,14 @@ int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc,
return 0;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_initial_parameters);
+EXPORT_SYMBOL_GPL(smbdirect_socket_set_initial_parameters);
const struct smbdirect_socket_parameters *
smbdirect_socket_get_current_parameters(struct smbdirect_socket *sc)
{
return &sc->parameters;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_get_current_parameters);
+EXPORT_SYMBOL_GPL(smbdirect_socket_get_current_parameters);
int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc,
enum ib_poll_context poll_ctx,
@@ -220,7 +220,7 @@ int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc,
return 0;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_kernel_settings);
+EXPORT_SYMBOL_GPL(smbdirect_socket_set_kernel_settings);
void smbdirect_socket_set_logging(struct smbdirect_socket *sc,
void *private_ptr,
@@ -240,7 +240,7 @@ void smbdirect_socket_set_logging(struct smbdirect_socket *sc,
sc->logging.needed = needed;
sc->logging.vaprintf = vaprintf;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_logging);
+EXPORT_SYMBOL_GPL(smbdirect_socket_set_logging);
static void smbdirect_socket_wake_up_all(struct smbdirect_socket *sc)
{
@@ -663,13 +663,13 @@ int smbdirect_socket_bind(struct smbdirect_socket *sc, struct sockaddr *addr)
return 0;
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_bind);
+EXPORT_SYMBOL_GPL(smbdirect_socket_bind);
void smbdirect_socket_shutdown(struct smbdirect_socket *sc)
{
smbdirect_socket_schedule_cleanup_lvl(sc, SMBDIRECT_LOG_INFO, -ESHUTDOWN);
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_shutdown);
+EXPORT_SYMBOL_GPL(smbdirect_socket_shutdown);
static void smbdirect_socket_release_disconnect(struct kref *kref)
{
@@ -712,7 +712,7 @@ void smbdirect_socket_release(struct smbdirect_socket *sc)
*/
kref_put(&sc->refs.destroy, smbdirect_socket_release_destroy);
}
-__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_release);
+EXPORT_SYMBOL_GPL(smbdirect_socket_release);
int smbdirect_socket_wait_for_credits(struct smbdirect_socket *sc,
enum smbdirect_socket_status expected_status,
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 182e54e575ee9..4e1e4f18a1669 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -188,7 +188,7 @@ static int internal_create_group(struct kobject *kobj, int update,
kernfs_get(kn);
error = create_files(kn, kobj, uid, gid, grp, update);
if (error) {
- if (grp->name)
+ if (grp->name && !update)
kernfs_remove(kn);
}
kernfs_put(kn);
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 0788593b6a1d8..6928e378fbbdc 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -230,8 +230,12 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
}
/* Verify the descriptor CRC */
- if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize ||
- le16_to_cpu(tag_p->descCRC) == crc_itu_t(0,
+ if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize) {
+ udf_err(sb, "block %u: CRC length %u exceeds block size\n",
+ block, le16_to_cpu(tag_p->descCRCLength));
+ goto error_out;
+ }
+ if (le16_to_cpu(tag_p->descCRC) == crc_itu_t(0,
bh->b_data + sizeof(struct tag),
le16_to_cpu(tag_p->descCRCLength)))
return bh;
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 80ba94f51e5c7..aecbab61014c7 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -382,6 +382,7 @@ xfs_dir3_data_write_verify(
struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ struct xfs_dir3_data_hdr *datahdr3 = bp->b_addr;
xfs_failaddr_t fa;
fa = xfs_dir3_data_verify(bp);
@@ -396,6 +397,11 @@ xfs_dir3_data_write_verify(
if (bip)
hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ /*
+ * Zero padding that may be stale from old kernels.
+ */
+ datahdr3->pad = 0;
+
xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
}
@@ -728,7 +734,6 @@ xfs_dir3_data_init(
struct xfs_dir2_data_unused *dup;
struct xfs_dir2_data_free *bf;
int error;
- int i;
/*
* Get the buffer set up for the block.
@@ -741,13 +746,16 @@ xfs_dir3_data_init(
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF);
/*
- * Initialize the header.
+ * Initialize the whole directory header region to zero
+ * so that all padding, bestfree entries, and any
+ * future header fields are clean.
*/
hdr = bp->b_addr;
+ memset(hdr, 0, geo->data_entry_offset);
+
if (xfs_has_crc(mp)) {
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
- memset(hdr3, 0, sizeof(*hdr3));
hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
hdr3->owner = cpu_to_be64(args->owner);
@@ -759,10 +767,6 @@ xfs_dir3_data_init(
bf = xfs_dir2_data_bestfree_p(mp, hdr);
bf[0].offset = cpu_to_be16(geo->data_entry_offset);
bf[0].length = cpu_to_be16(geo->blksize - geo->data_entry_offset);
- for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
- bf[i].length = 0;
- bf[i].offset = 0;
- }
/*
* Set up an unused entry for the block's body.
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 40c7f0ff6cf3a..0ec6ccd8b4dcf 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1414,8 +1414,7 @@ xfs_refcount_finish_one(
if (rcur == NULL) {
struct xfs_perag *pag = to_perag(ri->ri_group);
- error = xfs_alloc_read_agf(pag, tp,
- XFS_ALLOC_FLAG_FREEING, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
return error;
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 20e63069088b3..3d40cb0b2496c 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -251,6 +251,17 @@ xchk_ino_set_preen(
trace_xchk_ino_preen(sc, ino, __return_address);
}
+/* Record a block indexed by a file fork that could be optimized. */
+void
+xchk_fblock_set_preen(
+ struct xfs_scrub *sc,
+ int whichfork,
+ xfs_fileoff_t offset)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
+ trace_xchk_fblock_preen(sc, whichfork, offset, __return_address);
+}
+
/* Record something being wrong with the filesystem primary superblock. */
void
xchk_set_corrupt(
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index f2ecc68538f0c..b494d747c0084 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -25,6 +25,8 @@ bool xchk_fblock_xref_process_error(struct xfs_scrub *sc,
void xchk_block_set_preen(struct xfs_scrub *sc,
struct xfs_buf *bp);
void xchk_ino_set_preen(struct xfs_scrub *sc, xfs_ino_t ino);
+void xchk_fblock_set_preen(struct xfs_scrub *sc,
+ int whichfork, xfs_fileoff_t offset);
void xchk_set_corrupt(struct xfs_scrub *sc);
void xchk_block_set_corrupt(struct xfs_scrub *sc,
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 1a71d36898b1d..c2d6ad59d03ef 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -454,7 +454,12 @@ xchk_da_btree_block(
}
}
- /* XXX: Check hdr3.pad32 once we know how to fix it. */
+ if (xfs_has_crc(ip->i_mount)) {
+ struct xfs_da3_node_hdr *nodehdr3 = blk->bp->b_addr;
+
+ if (nodehdr3->__pad32)
+ xchk_da_set_preen(ds, level);
+ }
break;
default:
xchk_da_set_corrupt(ds, level);
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index e09724cd37255..09715a4aa154b 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -492,7 +492,12 @@ xchk_directory_data_bestfree(
goto out;
xchk_buffer_recheck(sc, bp);
- /* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */
+ if (xfs_has_crc(sc->mp)) {
+ struct xfs_dir3_data_hdr *hdr3 = bp->b_addr;
+
+ if (hdr3->pad)
+ xchk_fblock_set_preen(sc, XFS_DATA_FORK, lblk);
+ }
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
goto out_buf;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 580d40a5ee579..0cea458f13536 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -472,6 +472,7 @@ xfs_buf_find_insert(
/* The new buffer keeps the perag reference until it is freed. */
new_bp->b_pag = pag;
+retry:
rcu_read_lock();
bp = rhashtable_lookup_get_insert_fast(&btp->bt_hash,
&new_bp->b_rhash_head, xfs_buf_hash_params);
@@ -480,8 +481,16 @@ xfs_buf_find_insert(
error = PTR_ERR(bp);
goto out_free_buf;
}
- if (bp && lockref_get_not_dead(&bp->b_lockref)) {
- /* found an existing buffer */
+ if (bp) {
+ /*
+ * If there is an existing buffer with a dead lockref, retry
+ * until the new buffer is added, or a usable buffer is found.
+ */
+ if (!lockref_get_not_dead(&bp->b_lockref)) {
+ rcu_read_unlock();
+ cpu_relax();
+ goto retry;
+ }
rcu_read_unlock();
error = xfs_buf_find_lock(bp, flags);
if (error)
@@ -820,15 +829,20 @@ xfs_buf_destroy(
ASSERT(__lockref_is_dead(&bp->b_lockref));
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+ if (bp->b_pag)
+ xfs_perag_put(bp->b_pag);
+ xfs_buf_free(bp);
+}
+
+static inline void
+xfs_buf_kill(
+ struct xfs_buf *bp)
+{
+ lockref_mark_dead(&bp->b_lockref);
if (!xfs_buf_is_uncached(bp)) {
rhashtable_remove_fast(&bp->b_target->bt_hash,
&bp->b_rhash_head, xfs_buf_hash_params);
-
- if (bp->b_pag)
- xfs_perag_put(bp->b_pag);
}
-
- xfs_buf_free(bp);
}
/*
@@ -851,7 +865,7 @@ xfs_buf_rele(
return;
kill:
- lockref_mark_dead(&bp->b_lockref);
+ xfs_buf_kill(bp);
list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
spin_unlock(&bp->b_lockref.lock);
@@ -1433,7 +1447,7 @@ xfs_buftarg_drain_rele(
return LRU_SKIP;
}
- lockref_mark_dead(&bp->b_lockref);
+ xfs_buf_kill(bp);
list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lockref.lock);
return LRU_REMOVED;
@@ -1545,7 +1559,7 @@ xfs_buftarg_isolate(
return LRU_ROTATE;
}
- lockref_mark_dead(&bp->b_lockref);
+ xfs_buf_kill(bp);
list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lockref.lock);
return LRU_REMOVED;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index beaa26ec62da4..9978ac1422fc4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -699,12 +699,6 @@ xfs_create(
*/
error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
&tp);
- if (error == -ENOSPC) {
- /* flush outstanding delalloc blocks and retry */
- xfs_flush_inodes(mp);
- error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
- resblks, &tp);
- }
if (error)
goto out_parent;
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index 64c8afb935c26..b994ff15d5e45 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -350,7 +350,7 @@ xfs_dax_notify_dev_failure(
/*
* Shutdown fs from a force umount in pre-remove case which won't fail,
* so errors can be ignored. Otherwise, shutdown the filesystem with
- * CORRUPT flag if error occured or notify.want_shutdown was set during
+ * CORRUPT flag if error occurred or notify.want_shutdown was set during
* RMAP querying.
*/
if (mf_flags & MF_MEM_PRE_REMOVE)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index bcc470f56e466..148cc32449c1f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1199,10 +1199,21 @@ xfs_trans_alloc_icreate(
{
struct xfs_trans *tp;
bool retried = false;
+ bool flushed = false;
int error;
retry:
error = xfs_trans_alloc(mp, resv, dblocks, 0, 0, &tp);
+ if (error == -ENOSPC && !flushed) {
+ /*
+ * Flush all delalloc blocks to reclaim space from speculative
+ * preallocation. This is similar to the quota retry below
+ * but targets FS-wide ENOSPC.
+ */
+ xfs_flush_inodes(mp);
+ flushed = true;
+ goto retry;
+ }
if (error)
return error;
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index a851b98143c0b..5e297b75a85f6 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -1170,7 +1170,7 @@ xfs_calc_open_zones(
if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) {
mp->m_max_open_zones = bdev_open_zones;
- xfs_info(mp, "limiting open zones to %u due to hardware limit.\n",
+ xfs_info(mp, "limiting open zones to %u due to hardware limit.",
bdev_open_zones);
}
@@ -1217,7 +1217,7 @@ xfs_alloc_zone_info(
return zi;
out_free_bitmaps:
- while (--i > 0)
+ while (--i >= 0)
kvfree(zi->zi_used_bucket_bitmap[i]);
kfree(zi);
return NULL;
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index fedcc47048aff..c8a1d5c0332c5 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -1221,7 +1221,7 @@ out_put_oz:
if (data->oz)
xfs_open_zone_put(data->oz);
out_free_gc_data:
- kfree(data);
+ xfs_zone_gc_data_free(data);
return error;
}
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 9b646cb5335d0..ff43d6d1ea30a 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -610,10 +610,14 @@ static long zonefs_fname_to_fno(const struct qstr *fname)
return c - '0';
for (i = 0, rname = name + len - 1; i < len; i++, rname--) {
+ long digit;
+
c = *rname;
if (!isdigit(c))
return -ENOENT;
- fno += (c - '0') * shift;
+ digit = (c - '0') * shift;
+ if (check_add_overflow(fno, digit, &fno))
+ return -ENOENT;
shift *= 10;
}