From: Jens Axboe <axboe@suse.de>

> I cannot think of a way of doing this without adding extra locking.
> Probably can use task_lock() - take that in exit_io_context(), around
> the current local_irq_save/restore.

Agree, that's the best solution. Isn't that then needed in
set_task_ioprio() as well? I've added it there as well.

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 Documentation/block/ioprio.txt |  176 +++++++++++++++++++++++++++++++++++++++++
 drivers/block/ll_rw_blk.c      |    2 
 fs/ioprio.c                    |   19 ++++
 include/linux/ioprio.h         |    6 +
 4 files changed, 202 insertions(+), 1 deletion(-)

diff -puN /dev/null Documentation/block/ioprio.txt
--- /dev/null	2003-09-15 06:40:47.000000000 -0700
+++ 25-akpm/Documentation/block/ioprio.txt	2005-05-03 20:53:49.000000000 -0700
@@ -0,0 +1,176 @@
+Block io priorities
+===================
+
+
+Intro
+-----
+
+With the introduction of cfq v3 (aka cfq-ts or time sliced cfq), basic io
+priorities is supported for reads on files. This enables users to io nice
+processes or process groups, similar to what has been possible to cpu
+scheduling for ages. This document mainly details the current possibilites
+with cfq, other io schedulers do not support io priorities so far.
+
+Scheduling classes
+------------------
+
+CFQ implements three generic scheduling classes that determine how io is
+served for a process.
+
+IOPRIO_CLASS_RT: This is the realtime io class. This scheduling class is given
+higher priority than any other in the system, processes from this class are
+given first access to the disk every time. Thus it needs to be used with some
+care, one io RT process can starve the entire system. Within the RT class,
+there are 8 levels of class data that determine exactly how much time this
+process needs the disk for on each service. In the future this might change
+to be more directly mappable to performance, by passing in a wanted data
+rate instead.
+
+IOPRIO_CLASS_BE: This is the best-effort scheduling class, which is the default
+for any process that hasn't set a specific io priority. The class data
+determines how much io bandwidth the process will get, it's directly mappable
+to the cpu nice levels just more coarsely implemented. 0 is the highest
+BE prio level, 7 is the lowest. The mapping between cpu nice level and io
+nice level is determined as: io_nice = (cpu_nice + 20) / 5.
+
+IOPRIO_CLASS_IDLE: This is the idle scheduling class, processes running at this
+level only get io time when no one else needs the disk. The idle class has no
+class data, since it doesn't really apply here.
+
+Tools
+-----
+
+See below for a sample ionice tool. Usage:
+
+# ionice -c<class> -n<level> -p<pid>
+
+If pid isn't given, the current process is assumed. IO priority settings
+are inherited on fork, so you can use ionice to start the process at a given
+level:
+
+# ionice -c2 -n0 /bin/ls
+
+will run ls at the best-effort scheduling class at the highest priority.
+For a running process, you can give the pid instead:
+
+# ionice -c1 -n2 -p100
+
+will change pid 100 to run at the realtime scheduling class, at priority 2.
+
+---> snip ionice.c tool <---
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <sys/ptrace.h>
+#include <asm/unistd.h>
+
+extern int sys_ioprio_set(int, int, int);
+extern int sys_ioprio_get(int, int);
+
+#if defined(__i386__)
+#define __NR_ioprio_set		289
+#define __NR_ioprio_get		290
+#elif defined(__ppc__)
+#define __NR_ioprio_set		273
+#define __NR_ioprio_get		274
+#elif defined(__x86_64__)
+#define __NR_ioprio_set		251
+#define __NR_ioprio_get		252
+#elif defined(__ia64__)
+#define __NR_ioprio_set		1274
+#define __NR_ioprio_get		1275
+#else
+#error "Unsupported arch"
+#endif
+
+_syscall3(int, ioprio_set, int, which, int, who, int, ioprio);
+_syscall2(int, ioprio_get, int, which, int, who);
+
+enum {
+	IOPRIO_CLASS_NONE,
+	IOPRIO_CLASS_RT,
+	IOPRIO_CLASS_BE,
+	IOPRIO_CLASS_IDLE,
+};
+
+enum {
+	IOPRIO_WHO_PROCESS = 1,
+	IOPRIO_WHO_PGRP,
+	IOPRIO_WHO_USER,
+};
+
+#define IOPRIO_CLASS_SHIFT	13
+
+const char *to_prio[] = { "none", "realtime", "best-effort", "idle", };
+
+int main(int argc, char *argv[])
+{
+	int ioprio = 4, set = 0, ioprio_class = IOPRIO_CLASS_BE;
+	int c, pid = 0;
+
+	while ((c = getopt(argc, argv, "+n:c:p:")) != EOF) {
+		switch (c) {
+		case 'n':
+			ioprio = strtol(optarg, NULL, 10);
+			set = 1;
+			break;
+		case 'c':
+			ioprio_class = strtol(optarg, NULL, 10);
+			set = 1;
+			break;
+		case 'p':
+			pid = strtol(optarg, NULL, 10);
+			break;
+		}
+	}
+
+	switch (ioprio_class) {
+		case IOPRIO_CLASS_NONE:
+			ioprio_class = IOPRIO_CLASS_BE;
+			break;
+		case IOPRIO_CLASS_RT:
+		case IOPRIO_CLASS_BE:
+			break;
+		case IOPRIO_CLASS_IDLE:
+			ioprio = 7;
+			break;
+		default:
+			printf("bad prio class %d\n", ioprio_class);
+			return 1;
+	}
+
+	if (!set) {
+		if (!pid && argv[optind])
+			pid = strtol(argv[optind], NULL, 10);
+
+		ioprio = ioprio_get(IOPRIO_WHO_PROCESS, pid);
+
+		printf("pid=%d, %d\n", pid, ioprio);
+
+		if (ioprio == -1)
+			perror("ioprio_get");
+		else {
+			ioprio_class = ioprio >> IOPRIO_CLASS_SHIFT;
+			ioprio = ioprio & 0xff;
+			printf("%s: prio %d\n", to_prio[ioprio_class], ioprio);
+		}
+	} else {
+		if (ioprio_set(IOPRIO_WHO_PROCESS, pid, ioprio | ioprio_class << IOPRIO_CLASS_SHIFT) == -1) {
+			perror("ioprio_set");
+			return 1;
+		}
+
+		if (argv[optind])
+			execvp(argv[optind], &argv[optind]);
+	}
+
+	return 0;
+}
+
+---> snip ionice.c tool <---
+
+
+March 11 2005, Jens Axboe <axboe@suse.de>
diff -puN drivers/block/ll_rw_blk.c~cfq-iosched-update-to-time-sliced-design-fix drivers/block/ll_rw_blk.c
--- 25/drivers/block/ll_rw_blk.c~cfq-iosched-update-to-time-sliced-design-fix	2005-05-03 20:53:49.000000000 -0700
+++ 25-akpm/drivers/block/ll_rw_blk.c	2005-05-03 20:53:49.000000000 -0700
@@ -3331,9 +3331,11 @@ void exit_io_context(void)
 	struct io_context *ioc;
 
 	local_irq_save(flags);
+	task_lock(current);
 	ioc = current->io_context;
 	current->io_context = NULL;
 	ioc->task = NULL;
+	task_unlock(current);
 	local_irq_restore(flags);
 
 	if (ioc->aic && ioc->aic->exit)
diff -puN fs/ioprio.c~cfq-iosched-update-to-time-sliced-design-fix fs/ioprio.c
--- 25/fs/ioprio.c~cfq-iosched-update-to-time-sliced-design-fix	2005-05-03 20:53:49.000000000 -0700
+++ 25-akpm/fs/ioprio.c	2005-05-03 20:53:49.000000000 -0700
@@ -3,7 +3,21 @@
  *
  * Copyright (C) 2004 Jens Axboe <axboe@suse.de>
  *
- * Helper functions for setting/querying io priorities of processes
+ * Helper functions for setting/querying io priorities of processes. The
+ * system calls closely mimmick getpriority/setpriority, see the man page for
+ * those. The prio argument is a composite of prio class and prio data, where
+ * the data argument has meaning within that class. The standard scheduling
+ * classes have 8 distinct prio levels, with 0 being the highest prio and 7
+ * being the lowest.
+ *
+ * IOW, setting BE scheduling class with prio 2 is done ala:
+ *
+ * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
+ *
+ * ioprio_set(PRIO_PROCESS, pid, prio);
+ *
+ * See also Documentation/block/ioprio.txt
+ *
  */
 #include <linux/kernel.h>
 #include <linux/ioprio.h>
@@ -17,12 +31,15 @@ static int set_task_ioprio(struct task_s
 	    task->uid != current->uid && !capable(CAP_SYS_NICE))
 		return -EPERM;
 
+	task_lock(task);
+
 	task->ioprio = ioprio;
 
 	ioc = task->io_context;
 	if (ioc && ioc->set_ioprio)
 		ioc->set_ioprio(ioc, ioprio);
 
+	task_unlock(task);
 	return 0;
 }
 
diff -puN include/linux/ioprio.h~cfq-iosched-update-to-time-sliced-design-fix include/linux/ioprio.h
--- 25/include/linux/ioprio.h~cfq-iosched-update-to-time-sliced-design-fix	2005-05-03 20:53:49.000000000 -0700
+++ 25-akpm/include/linux/ioprio.h	2005-05-03 20:53:49.000000000 -0700
@@ -15,6 +15,12 @@
 
 #define ioprio_valid(mask)	(IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE)
 
+/*
+ * These are the io priority groups as implemented by CFQ. RT is the realtime
+ * class, it always gets premium service. BE is the best-effort scheduling
+ * class, the default for any process. IDLE is the idle scheduling class, it
+ * is only served when no one else is using the disk.
+ */
 enum {
 	IOPRIO_CLASS_NONE,
 	IOPRIO_CLASS_RT,
_