Skip to content

Commit ec0fa0b

Browse files
dhowellstorvalds
authored andcommitted
afs: Fix deadlock between writeback and truncate
The afs filesystem has a lock[*] that it uses to serialise I/O operations going to the server (vnode->io_lock), as the server will only perform one modification operation at a time on any given file or directory. This prevents the the filesystem from filling up all the call slots to a server with calls that aren't going to be executed in parallel anyway, thereby allowing operations on other files to obtain slots. [*] Note that is probably redundant for directories at least since i_rwsem is used to serialise directory modifications and lookup/reading vs modification. The server does allow parallel non-modification ops, however. When a file truncation op completes, we truncate the in-memory copy of the file to match - but we do it whilst still holding the io_lock, the idea being to prevent races with other operations. However, if writeback starts in a worker thread simultaneously with truncation (whilst notify_change() is called with i_rwsem locked, writeback pays it no heed), it may manage to set PG_writeback bits on the pages that will get truncated before afs_setattr_success() manages to call truncate_pagecache(). Truncate will then wait for those pages - whilst still inside io_lock: # cat /proc/8837/stack [<0>] wait_on_page_bit_common+0x184/0x1e7 [<0>] truncate_inode_pages_range+0x37f/0x3eb [<0>] truncate_pagecache+0x3c/0x53 [<0>] afs_setattr_success+0x4d/0x6e [<0>] afs_wait_for_operation+0xd8/0x169 [<0>] afs_do_sync_operation+0x16/0x1f [<0>] afs_setattr+0x1fb/0x25d [<0>] notify_change+0x2cf/0x3c4 [<0>] do_truncate+0x7f/0xb2 [<0>] do_sys_ftruncate+0xd1/0x104 [<0>] do_syscall_64+0x2d/0x3a [<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 The writeback operation, however, stalls indefinitely because it needs to get the io_lock to proceed: # cat /proc/5940/stack [<0>] afs_get_io_locks+0x58/0x1ae [<0>] afs_begin_vnode_operation+0xc7/0xd1 [<0>] afs_store_data+0x1b2/0x2a3 [<0>] afs_write_back_from_locked_page+0x418/0x57c [<0>] afs_writepages_region+0x196/0x224 [<0>] afs_writepages+0x74/0x156 [<0>] do_writepages+0x2d/0x56 [<0>] __writeback_single_inode+0x84/0x207 [<0>] writeback_sb_inodes+0x238/0x3cf [<0>] __writeback_inodes_wb+0x68/0x9f [<0>] wb_writeback+0x145/0x26c [<0>] wb_do_writeback+0x16a/0x194 [<0>] wb_workfn+0x74/0x177 [<0>] process_one_work+0x174/0x264 [<0>] worker_thread+0x117/0x1b9 [<0>] kthread+0xec/0xf1 [<0>] ret_from_fork+0x1f/0x30 and thus deadlock has occurred. Note that whilst afs_setattr() calls filemap_write_and_wait(), the fact that the caller is holding i_rwsem doesn't preclude more pages being dirtied through an mmap'd region. Fix this by: (1) Use the vnode validate_lock to mediate access between afs_setattr() and afs_writepages(): (a) Exclusively lock validate_lock in afs_setattr() around the whole RPC operation. (b) If WB_SYNC_ALL isn't set on entry to afs_writepages(), trying to shared-lock validate_lock and returning immediately if we couldn't get it. (c) If WB_SYNC_ALL is set, wait for the lock. The validate_lock is also used to validate a file and to zap its cache if the file was altered by a third party, so it's probably a good fit for this. (2) Move the truncation outside of the io_lock in setattr, using the same hook as is used for local directory editing. This requires the old i_size to be retained in the operation record as we commit the revised status to the inode members inside the io_lock still, but we still need to know if we reduced the file size. Fixes: d2ddc77 ("afs: Overhaul volume and server record caching and fileserver rotation") Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent f3c64ed commit ec0fa0b

3 files changed

Lines changed: 50 additions & 9 deletions

File tree

fs/afs/inode.c

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -810,21 +810,40 @@ void afs_evict_inode(struct inode *inode)
810810

811811
static void afs_setattr_success(struct afs_operation *op)
812812
{
813-
struct inode *inode = &op->file[0].vnode->vfs_inode;
813+
struct afs_vnode_param *vp = &op->file[0];
814+
struct inode *inode = &vp->vnode->vfs_inode;
815+
loff_t old_i_size = i_size_read(inode);
816+
817+
op->setattr.old_i_size = old_i_size;
818+
afs_vnode_commit_status(op, vp);
819+
/* inode->i_size has now been changed. */
820+
821+
if (op->setattr.attr->ia_valid & ATTR_SIZE) {
822+
loff_t size = op->setattr.attr->ia_size;
823+
if (size > old_i_size)
824+
pagecache_isize_extended(inode, old_i_size, size);
825+
}
826+
}
827+
828+
static void afs_setattr_edit_file(struct afs_operation *op)
829+
{
830+
struct afs_vnode_param *vp = &op->file[0];
831+
struct inode *inode = &vp->vnode->vfs_inode;
814832

815-
afs_vnode_commit_status(op, &op->file[0]);
816833
if (op->setattr.attr->ia_valid & ATTR_SIZE) {
817-
loff_t i_size = inode->i_size, size = op->setattr.attr->ia_size;
818-
if (size > i_size)
819-
pagecache_isize_extended(inode, i_size, size);
820-
truncate_pagecache(inode, size);
834+
loff_t size = op->setattr.attr->ia_size;
835+
loff_t i_size = op->setattr.old_i_size;
836+
837+
if (size < i_size)
838+
truncate_pagecache(inode, size);
821839
}
822840
}
823841

824842
static const struct afs_operation_ops afs_setattr_operation = {
825843
.issue_afs_rpc = afs_fs_setattr,
826844
.issue_yfs_rpc = yfs_fs_setattr,
827845
.success = afs_setattr_success,
846+
.edit_dir = afs_setattr_edit_file,
828847
};
829848

830849
/*
@@ -863,11 +882,16 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
863882
if (S_ISREG(vnode->vfs_inode.i_mode))
864883
filemap_write_and_wait(vnode->vfs_inode.i_mapping);
865884

885+
/* Prevent any new writebacks from starting whilst we do this. */
886+
down_write(&vnode->validate_lock);
887+
866888
op = afs_alloc_operation(((attr->ia_valid & ATTR_FILE) ?
867889
afs_file_key(attr->ia_file) : NULL),
868890
vnode->volume);
869-
if (IS_ERR(op))
870-
return PTR_ERR(op);
891+
if (IS_ERR(op)) {
892+
ret = PTR_ERR(op);
893+
goto out_unlock;
894+
}
871895

872896
afs_op_set_vnode(op, 0, vnode);
873897
op->setattr.attr = attr;
@@ -880,5 +904,10 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
880904
op->file[0].update_ctime = 1;
881905

882906
op->ops = &afs_setattr_operation;
883-
return afs_do_sync_operation(op);
907+
ret = afs_do_sync_operation(op);
908+
909+
out_unlock:
910+
up_write(&vnode->validate_lock);
911+
_leave(" = %d", ret);
912+
return ret;
884913
}

fs/afs/internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -812,6 +812,7 @@ struct afs_operation {
812812
} store;
813813
struct {
814814
struct iattr *attr;
815+
loff_t old_i_size;
815816
} setattr;
816817
struct afs_acl *acl;
817818
struct yfs_acl *yacl;

fs/afs/write.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -738,11 +738,21 @@ static int afs_writepages_region(struct address_space *mapping,
738738
int afs_writepages(struct address_space *mapping,
739739
struct writeback_control *wbc)
740740
{
741+
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
741742
pgoff_t start, end, next;
742743
int ret;
743744

744745
_enter("");
745746

747+
/* We have to be careful as we can end up racing with setattr()
748+
* truncating the pagecache since the caller doesn't take a lock here
749+
* to prevent it.
750+
*/
751+
if (wbc->sync_mode == WB_SYNC_ALL)
752+
down_read(&vnode->validate_lock);
753+
else if (!down_read_trylock(&vnode->validate_lock))
754+
return 0;
755+
746756
if (wbc->range_cyclic) {
747757
start = mapping->writeback_index;
748758
end = -1;
@@ -762,6 +772,7 @@ int afs_writepages(struct address_space *mapping,
762772
ret = afs_writepages_region(mapping, wbc, start, end, &next);
763773
}
764774

775+
up_read(&vnode->validate_lock);
765776
_leave(" = %d", ret);
766777
return ret;
767778
}

0 commit comments

Comments
 (0)