From: Andrew Theurer <habanero@us.ibm.com>

This patch addresses some problems with wake_idle().  Currently wake_idle()
will wake a task on an alternate cpu if:

1) task->cpu is not idle
2) an idle cpu can be found

However the span of cpus to look for is very limited (only the task->cpu's
sibling).  The scheduler should find the closest idle cpu, starting with
the lowest level domain, then going to higher level domains if allowed
(domain has flag SD_WAKE_IDLE).  This patch does this.

This and the other two patches (also to be submitted) combined have
provided as much as 5% improvement on that "online transaction DB workload"
and 2% on the industry standard J2EE workload.

I asked Martin Bligh to test these for regression, and he did not find any.
I would like to submit for inclusion to -mm and barring any problems
eventually to mainline.

Signed-off-by: <habanero@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/include/asm-i386/topology.h   |    1 +
 25-akpm/include/asm-ia64/topology.h   |    1 +
 25-akpm/include/asm-ppc64/topology.h  |    1 +
 25-akpm/include/asm-x86_64/topology.h |    1 +
 25-akpm/include/linux/topology.h      |    1 +
 25-akpm/kernel/sched.c                |   30 +++++++++++++++---------------
 6 files changed, 20 insertions(+), 15 deletions(-)

diff -puN include/asm-i386/topology.h~sched-more-agressive-wake_idle include/asm-i386/topology.h
--- 25/include/asm-i386/topology.h~sched-more-agressive-wake_idle	2004-11-17 20:46:30.860665984 -0800
+++ 25-akpm/include/asm-i386/topology.h	2004-11-17 20:46:30.870664464 -0800
@@ -83,6 +83,7 @@ static inline cpumask_t pcibus_to_cpumas
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
+				| SD_WAKE_IDLE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff -puN include/asm-ia64/topology.h~sched-more-agressive-wake_idle include/asm-ia64/topology.h
--- 25/include/asm-ia64/topology.h~sched-more-agressive-wake_idle	2004-11-17 20:46:30.862665680 -0800
+++ 25-akpm/include/asm-ia64/topology.h	2004-11-17 20:46:30.871664312 -0800
@@ -56,6 +56,7 @@ void build_cpu_to_node_map(void);
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
+				| SD_WAKE_IDLE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff -puN include/asm-ppc64/topology.h~sched-more-agressive-wake_idle include/asm-ppc64/topology.h
--- 25/include/asm-ppc64/topology.h~sched-more-agressive-wake_idle	2004-11-17 20:46:30.863665528 -0800
+++ 25-akpm/include/asm-ppc64/topology.h	2004-11-17 20:46:30.871664312 -0800
@@ -51,6 +51,7 @@ static inline int node_to_first_cpu(int 
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
+				| SD_WAKE_IDLE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff -puN include/asm-x86_64/topology.h~sched-more-agressive-wake_idle include/asm-x86_64/topology.h
--- 25/include/asm-x86_64/topology.h~sched-more-agressive-wake_idle	2004-11-17 20:46:30.864665376 -0800
+++ 25-akpm/include/asm-x86_64/topology.h	2004-11-17 20:46:30.871664312 -0800
@@ -47,6 +47,7 @@ static inline cpumask_t __pcibus_to_cpum
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
+				| SD_WAKE_IDLE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff -puN include/linux/topology.h~sched-more-agressive-wake_idle include/linux/topology.h
--- 25/include/linux/topology.h~sched-more-agressive-wake_idle	2004-11-17 20:46:30.866665072 -0800
+++ 25-akpm/include/linux/topology.h	2004-11-17 20:46:30.872664160 -0800
@@ -120,6 +120,7 @@ static inline int __next_node_with_cpus(
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
+				| SD_WAKE_IDLE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff -puN kernel/sched.c~sched-more-agressive-wake_idle kernel/sched.c
--- 25/kernel/sched.c~sched-more-agressive-wake_idle	2004-11-17 20:46:30.867664920 -0800
+++ 25-akpm/kernel/sched.c	2004-11-17 20:46:30.875663704 -0800
@@ -935,9 +935,10 @@ static inline unsigned long target_load(
 #endif
 
 /*
- * wake_idle() is useful especially on SMT architectures to wake a
- * task onto an idle sibling if we would otherwise wake it onto a
- * busy sibling.
+ * wake_idle() will wake a task on an idle cpu if task->cpu is
+ * not idle and an idle cpu is available.  The span of cpus to
+ * search starts with cpus closest then further out as needed,
+ * so we always favor a closer, idle cpu.
  *
  * Returns the CPU we should wake onto.
  */
@@ -945,24 +946,23 @@ static inline unsigned long target_load(
 static int wake_idle(int cpu, task_t *p)
 {
 	cpumask_t tmp;
-	runqueue_t *rq = cpu_rq(cpu);
 	struct sched_domain *sd;
 	int i;
 
 	if (idle_cpu(cpu))
 		return cpu;
 
-	sd = rq->sd;
-	if (!(sd->flags & SD_WAKE_IDLE))
-		return cpu;
-
-	cpus_and(tmp, sd->span, p->cpus_allowed);
-
-	for_each_cpu_mask(i, tmp) {
-		if (idle_cpu(i))
-			return i;
+	for_each_domain(cpu, sd) {
+		if (sd->flags & SD_WAKE_IDLE) {
+			cpus_and(tmp, sd->span, cpu_online_map);
+			cpus_and(tmp, tmp, p->cpus_allowed);
+			for_each_cpu_mask(i, tmp) {
+				if (idle_cpu(i))
+					return i;
+			}
+		}
+		else break;
 	}
-
 	return cpu;
 }
 #else
@@ -1074,7 +1074,7 @@ static int try_to_wake_up(task_t * p, un
 out_set_cpu:
 	schedstat_inc(rq, ttwu_attempts);
 	new_cpu = wake_idle(new_cpu, p);
-	if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
+	if (new_cpu != cpu) {
 		schedstat_inc(rq, ttwu_moved);
 		set_task_cpu(p, new_cpu);
 		task_rq_unlock(rq, &flags);
_