From: Suparna Bhattacharya <suparna@in.ibm.com>

I picked up Badari's forward port of Stephen Tweedie's
DIO fixes from 2.4, and have updated it with the following:

- Drop i_sem after i/o submission for DIO reads/writes.
- Release i_alloc_sem on DIO completion.
- Modify other write paths i.e. generic_file_aio_write, 
  generic_file_writev and xfs write to handle acquistion of 
  i_alloc_sem and align with i_sem and i_alloc_sem being released 
  directly by the DIO code, rather than by the caller.

I had to debate a bit about how the _write_nolock() cases
should be handled. The assumption I made is that for a regular
file, the caller would have already acquired i_alloc_sem and
i_sem. When its not a regular file the DIO code doesn't touch
i_sem/i_alloc_sem anyway.

Let me know how this looks to you.

I'm not quite sure how to test these changes to be sure
it all works. I have run aio-stress for O_DIRECT reads
and writes. I still need to play with running fsx (there
seem to be some problems there as I'm seeing short writes
- need to investigate whether the problem exists with mm5
as such or if its due to this patch).

DESC
O_DIRECT race fixes comments
EDESC
From: Suparna Bhattacharya <suparna@in.ibm.com>


DESC
O_DRIECT race fixes fix fix fix
EDESC

Fix various deadlocks due to not releasing various locks on various error
paths.

NFS direct-IO is totally broken.
DESC
DIO locking rework
EDESC
From: Badari Pulavarty <pbadari@us.ibm.com>

1) I noticed that i_alloc_sem and i_sem locking ordering is not=20
   consistent.  So, I changed the locking order to i_sem (first) and
   i_alloc_sem (next).  I hope this is okay.  This allowed me to push the
   locking down to blockdev_direct_IO.

2) Pushed locking down to blockdev_direct_IO() to allow NFS
   direct-io.

Here is the locking:

(i) generic_file_*_write() holds i_sem (as before) and calls
    blockdev_direct_IO().  We get i_alloc_sem and call direct_io_worker().

(ii) generic_file_*_read() does not hold any locks. 
     blockdev_direct_IO() gets i_sem and then i_alloc_sem and calls
     direct_io_worker() to do the work

(iii) direct_io_worker() does the work and drops i_sem after
      submitting IOs and drops i_alloc_sem after completing IOs.

DESC
O_DIRECT XFS fix
EDESC

XFS deadlocks, and doesn't need the fancy locking anyway.

So provide a separate direct-IO path for XFS, which avoids all the extra
locking.



 fs/direct-io.c          |   90 +++++++++++++++++++++++++++++++++++++++++-------
 fs/inode.c              |    1 
 fs/open.c               |    2 +
 fs/xfs/linux/xfs_aops.c |    3 +
 include/linux/fs.h      |   31 ++++++++++++++--
 mm/filemap.c            |   53 +++++++++++++++++++++-------
 6 files changed, 151 insertions(+), 29 deletions(-)

diff -puN fs/direct-io.c~O_DIRECT-race-fixes-rollup fs/direct-io.c
--- 25/fs/direct-io.c~O_DIRECT-race-fixes-rollup	2003-11-11 10:08:40.000000000 -0800
+++ 25-akpm/fs/direct-io.c	2003-11-11 10:08:54.000000000 -0800
@@ -52,6 +52,10 @@
  *
  * If blkfactor is zero then the user's request was aligned to the filesystem's
  * blocksize.
+ *
+ * needs_locking is set for regular files on direct-IO-naive filesystems.  It
+ * determines whether we need to do the fancy locking which prevents direct-IO
+ * from being able to read uninitialised disk blocks.
  */
 
 struct dio {
@@ -59,6 +63,7 @@ struct dio {
 	struct bio *bio;		/* bio under assembly */
 	struct inode *inode;
 	int rw;
+	int needs_locking;		/* doesn't change */
 	unsigned blkbits;		/* doesn't change */
 	unsigned blkfactor;		/* When we're using an alignment which
 					   is finer than the filesystem's soft
@@ -206,6 +211,8 @@ static void dio_complete(struct dio *dio
 {
 	if (dio->end_io)
 		dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private);
+	if (dio->needs_locking)
+		up_read(&dio->inode->i_alloc_sem);
 }
 
 /*
@@ -449,6 +456,7 @@ static int get_more_blocks(struct dio *d
 	unsigned long fs_count;	/* Number of filesystem-sized blocks */
 	unsigned long dio_count;/* Number of dio_block-sized blocks */
 	unsigned long blkmask;
+	int beyond_eof = 0;
 
 	/*
 	 * If there was a memory error and we've overwritten all the
@@ -466,8 +474,19 @@ static int get_more_blocks(struct dio *d
 		if (dio_count & blkmask)	
 			fs_count++;
 
+		if (dio->needs_locking) {
+			if (dio->block_in_file >= (i_size_read(dio->inode) >>
+							dio->blkbits))
+				beyond_eof = 1;
+		}
+		/*
+		 * For writes inside i_size we forbid block creations: only
+		 * overwrites are permitted.  We fall back to buffered writes
+		 * at a higher level for inside-i_size block-instantiating
+		 * writes.
+		 */
 		ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count,
-				map_bh, dio->rw == WRITE);
+				map_bh, (dio->rw == WRITE) && beyond_eof);
 	}
 	return ret;
 }
@@ -774,6 +793,10 @@ do_holes:
 			if (!buffer_mapped(map_bh)) {
 				char *kaddr;
 
+				/* AKPM: eargh, -ENOTBLK is a hack */
+				if (dio->rw == WRITE)
+					return -ENOTBLK;
+
 				if (dio->block_in_file >=
 					i_size_read(dio->inode)>>blkbits) {
 					/* We hit eof */
@@ -839,21 +862,21 @@ out:
 	return ret;
 }
 
+/*
+ * Releases both i_sem and i_alloc_sem
+ */
 static int
 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
 	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
-	unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io)
+	unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io,
+	struct dio *dio)
 {
 	unsigned long user_addr; 
 	int seg;
 	int ret = 0;
 	int ret2;
-	struct dio *dio;
 	size_t bytes;
 
-	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
-	if (!dio)
-		return -ENOMEM;
 	dio->is_async = !is_sync_kiocb(iocb);
 
 	dio->bio = NULL;
@@ -864,7 +887,6 @@ direct_io_worker(int rw, struct kiocb *i
 	dio->start_zero_done = 0;
 	dio->block_in_file = offset >> blkbits;
 	dio->blocks_available = 0;
-
 	dio->cur_page = NULL;
 
 	dio->boundary = 0;
@@ -947,6 +969,13 @@ direct_io_worker(int rw, struct kiocb *i
 		dio_bio_submit(dio);
 
 	/*
+	 * All new block allocations have been performed.  We can let i_sem
+	 * go now.
+	 */
+	if (dio->needs_locking)
+		up(&dio->inode->i_sem);
+
+	/*
 	 * OK, all BIOs are submitted, so we can decrement bio_count to truly
 	 * reflect the number of to-be-processed BIOs.
 	 */
@@ -981,11 +1010,17 @@ direct_io_worker(int rw, struct kiocb *i
 
 /*
  * This is a library function for use by filesystem drivers.
+ *
+ * For writes to S_ISREG files, we are called under i_sem and return with i_sem
+ * held, even though it is internally dropped.
+ *
+ * For writes to S_ISBLK files, i_sem is not held on entry; it is never taken.
  */
 int
-blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
-	unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io)
+	unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io,
+	int needs_special_locking)
 {
 	int seg;
 	size_t size;
@@ -994,6 +1029,8 @@ blockdev_direct_IO(int rw, struct kiocb 
 	unsigned bdev_blkbits = 0;
 	unsigned blocksize_mask = (1 << blkbits) - 1;
 	ssize_t retval = -EINVAL;
+	struct dio *dio;
+	int needs_locking;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
@@ -1019,10 +1056,37 @@ blockdev_direct_IO(int rw, struct kiocb 
 		}
 	}
 
-	retval = direct_io_worker(rw, iocb, inode, iov, offset, 
-				nr_segs, blkbits, get_blocks, end_io);
+	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
+	retval = -ENOMEM;
+	if (!dio)
+		goto out;
+
+	/*
+	 * For regular files,
+	 *	readers need to grab i_sem and i_alloc_sem
+	 *	writers need to grab i_alloc_sem only (i_sem is already held)
+	 */
+	needs_locking = 0;
+	if (S_ISREG(inode->i_mode) && needs_special_locking) {
+		needs_locking = 1;
+		if (rw == READ) {
+			down(&inode->i_sem);
+			retval = filemap_write_and_wait(inode->i_mapping);
+			if (retval) {
+				up(&inode->i_sem);
+				kfree(dio);
+				goto out;
+			}
+		}
+		down_read(&inode->i_alloc_sem);
+	}
+	dio->needs_locking = needs_locking;
+
+	retval = direct_io_worker(rw, iocb, inode, iov, offset,
+				nr_segs, blkbits, get_blocks, end_io, dio);
+	if (needs_locking && rw == WRITE)
+		down(&inode->i_sem);
 out:
 	return retval;
 }
-
-EXPORT_SYMBOL(blockdev_direct_IO);
+EXPORT_SYMBOL(__blockdev_direct_IO);
diff -puN fs/inode.c~O_DIRECT-race-fixes-rollup fs/inode.c
--- 25/fs/inode.c~O_DIRECT-race-fixes-rollup	2003-11-11 10:08:40.000000000 -0800
+++ 25-akpm/fs/inode.c	2003-11-11 10:08:40.000000000 -0800
@@ -183,6 +183,7 @@ void inode_init_once(struct inode *inode
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
 	sema_init(&inode->i_sem, 1);
+	init_rwsem(&inode->i_alloc_sem);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.page_lock);
 	init_MUTEX(&inode->i_data.i_shared_sem);
diff -puN fs/open.c~O_DIRECT-race-fixes-rollup fs/open.c
--- 25/fs/open.c~O_DIRECT-race-fixes-rollup	2003-11-11 10:08:40.000000000 -0800
+++ 25-akpm/fs/open.c	2003-11-11 10:08:40.000000000 -0800
@@ -192,7 +192,9 @@ int do_truncate(struct dentry *dentry, l
 	newattrs.ia_size = length;
 	newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
 	down(&dentry->d_inode->i_sem);
+	down_write(&dentry->d_inode->i_alloc_sem);
 	err = notify_change(dentry, &newattrs);
+	up_write(&dentry->d_inode->i_alloc_sem);
 	up(&dentry->d_inode->i_sem);
 	return err;
 }
diff -puN include/linux/fs.h~O_DIRECT-race-fixes-rollup include/linux/fs.h
--- 25/include/linux/fs.h~O_DIRECT-race-fixes-rollup	2003-11-11 10:08:40.000000000 -0800
+++ 25-akpm/include/linux/fs.h	2003-11-11 12:39:22.000000000 -0800
@@ -389,6 +389,7 @@ struct inode {
 	unsigned short          i_bytes;
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	struct semaphore	i_sem;
+	struct rw_semaphore	i_alloc_sem;
 	struct inode_operations	*i_op;
 	struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
 	struct super_block	*i_sb;
@@ -1205,6 +1206,7 @@ extern void write_inode_now(struct inode
 extern int filemap_fdatawrite(struct address_space *);
 extern int filemap_flush(struct address_space *);
 extern int filemap_fdatawait(struct address_space *);
+extern int filemap_write_and_wait(struct address_space *mapping);
 extern void sync_supers(void);
 extern void sync_filesystems(int wait);
 extern void emergency_sync(void);
@@ -1316,9 +1318,6 @@ extern void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 extern ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb,
 	const struct iovec *iov, loff_t offset, unsigned long nr_segs);
-extern int blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 
-	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
-	unsigned long nr_segs, get_blocks_t *get_blocks, dio_iodone_t *end_io);
 extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, 
 	unsigned long nr_segs, loff_t *ppos);
 ssize_t generic_file_writev(struct file *filp, const struct iovec *iov, 
@@ -1340,6 +1339,32 @@ static inline void do_generic_file_read(
 				actor);
 }
 
+int __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+	struct block_device *bdev, const struct iovec *iov, loff_t offset,
+	unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io,
+	int needs_special_locking);
+
+/*
+ * For filesystems which need locking between buffered and direct access
+ */
+static inline int blockdev_direct_IO(int rw, struct kiocb *iocb,
+	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
+	loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks,
+	dio_iodone_t end_io)
+{
+	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
+				nr_segs, get_blocks, end_io, 1);
+}
+
+static inline int blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
+	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
+	loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks,
+	dio_iodone_t end_io)
+{
+	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
+				nr_segs, get_blocks, end_io, 0);
+}
+
 extern struct file_operations generic_ro_fops;
 
 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
diff -puN mm/filemap.c~O_DIRECT-race-fixes-rollup mm/filemap.c
--- 25/mm/filemap.c~O_DIRECT-race-fixes-rollup	2003-11-11 10:08:40.000000000 -0800
+++ 25-akpm/mm/filemap.c	2003-11-11 12:39:24.000000000 -0800
@@ -73,6 +73,9 @@
  *  ->mmap_sem
  *    ->i_sem			(msync)
  *
+ *  ->i_sem
+ *    ->i_alloc_sem             (various)
+ *
  *  ->inode_lock
  *    ->sb_lock			(fs/fs-writeback.c)
  *    ->mapping->page_lock	(__sync_single_inode)
@@ -226,6 +229,18 @@ restart:
 
 EXPORT_SYMBOL(filemap_fdatawait);
 
+int filemap_write_and_wait(struct address_space *mapping)
+{
+	int retval = 0;
+
+	if (mapping->nrpages) {
+		retval = filemap_fdatawrite(mapping);
+		if (retval == 0)
+			retval = filemap_fdatawait(mapping);
+	}
+	return retval;
+}
+
 /*
  * This adds a page to the page cache, starting out as locked, unreferenced,
  * not uptodate and with no errors.
@@ -1687,6 +1702,7 @@ EXPORT_SYMBOL(generic_write_checks);
 
 /*
  * Write to a file through the page cache. 
+ * Called under i_sem for S_ISREG files.
  *
  * We put everything into the page cache prior to writing it. This is not a
  * problem when writing full pages. With partial pages, however, we first have
@@ -1775,12 +1791,19 @@ generic_file_aio_write_nolock(struct kio
 		/*
 		 * Sync the fs metadata but not the minor inode changes and
 		 * of course not the data as we did direct DMA for the IO.
+		 * i_sem is held, which protects generic_osync_inode() from
+		 * livelocking.
 		 */
 		if (written >= 0 && file->f_flags & O_SYNC)
 			status = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 		if (written >= 0 && !is_sync_kiocb(iocb))
 			written = -EIOCBQUEUED;
-		goto out_status;
+		if (written != -ENOTBLK)
+			goto out_status;
+		/*
+		 * direct-io write to a hole: fall through to buffered I/O
+		 */
+		written = 0;
 	}
 
 	buf = iov->iov_base;
@@ -1869,6 +1892,14 @@ generic_file_aio_write_nolock(struct kio
 					OSYNC_METADATA|OSYNC_DATA);
 	}
 	
+	/*
+	 * If we get here for O_DIRECT writes then we must have fallen through
+	 * to buffered writes (block instantiation inside i_size).  So we sync
+	 * the file data here, to try to honour O_DIRECT expectations.
+	 */
+	if (unlikely(file->f_flags & O_DIRECT) && written)
+		status = filemap_write_and_wait(mapping);
+
 out_status:	
 	err = written ? written : status;
 out:
@@ -1960,6 +1991,9 @@ ssize_t generic_file_writev(struct file 
 
 EXPORT_SYMBOL(generic_file_writev);
 
+/*
+ * Called under i_sem for writes to S_ISREG files
+ */
 ssize_t
 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	loff_t offset, unsigned long nr_segs)
@@ -1968,18 +2002,13 @@ generic_file_direct_IO(int rw, struct ki
 	struct address_space *mapping = file->f_mapping;
 	ssize_t retval;
 
-	if (mapping->nrpages) {
-		retval = filemap_fdatawrite(mapping);
-		if (retval == 0)
-			retval = filemap_fdatawait(mapping);
-		if (retval)
-			goto out;
+	retval = filemap_write_and_wait(mapping);
+	if (retval == 0) {
+		retval = mapping->a_ops->direct_IO(rw, iocb, iov,
+						offset, nr_segs);
+		if (rw == WRITE && mapping->nrpages)
+			invalidate_inode_pages2(mapping);
 	}
-
-	retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
-	if (rw == WRITE && mapping->nrpages)
-		invalidate_inode_pages2(mapping);
-out:
 	return retval;
 }
 
diff -puN fs/xfs/linux/xfs_aops.c~O_DIRECT-race-fixes-rollup fs/xfs/linux/xfs_aops.c
--- 25/fs/xfs/linux/xfs_aops.c~O_DIRECT-race-fixes-rollup	2003-11-11 10:08:44.000000000 -0800
+++ 25-akpm/fs/xfs/linux/xfs_aops.c	2003-11-11 10:08:44.000000000 -0800
@@ -984,7 +984,8 @@ linvfs_direct_IO(
 	if (error)
 		return -error;
 
-        return blockdev_direct_IO(rw, iocb, inode, pbmap.pbm_target->pbr_bdev,
+        return blockdev_direct_IO_no_locking(rw, iocb, inode,
+		pbmap.pbm_target->pbr_bdev,
 		iov, offset, nr_segs,
 		linvfs_get_blocks_direct,
 		linvfs_unwritten_convert_direct);

_