From: long <tlnguyen@snoqualmie.dp.intel.com>

Thank you for your feedback on the MSI patches based on kernel
2.6.0-test5. Below are the updated patches, which are based on
kernel 2.6.0-test6 and include minor modifications to reflect
your input.



 25-akpm/Documentation/MSI-HOWTO.txt                 |  321 ++++++
 25-akpm/arch/i386/Kconfig                           |   18 
 25-akpm/arch/i386/kernel/i8259.c                    |    4 
 25-akpm/arch/i386/kernel/io_apic.c                  |  169 ++-
 25-akpm/arch/i386/kernel/mpparse.c                  |   12 
 25-akpm/arch/i386/pci/irq.c                         |    6 
 25-akpm/drivers/acpi/pci_irq.c                      |    2 
 25-akpm/drivers/pci/Makefile                        |    1 
 25-akpm/drivers/pci/msi.c                           | 1061 ++++++++++++++++++++
 25-akpm/drivers/pci/probe.c                         |    1 
 25-akpm/drivers/pci/remove.c                        |    2 
 25-akpm/include/asm-i386/hw_irq.h                   |    1 
 25-akpm/include/asm-i386/io_apic.h                  |   40 
 25-akpm/include/asm-i386/mach-default/irq_vectors.h |   10 
 25-akpm/include/linux/pci.h                         |   17 
 25-akpm/include/linux/pci_msi.h                     |  193 +++
 16 files changed, 1805 insertions(+), 53 deletions(-)

diff -puN arch/i386/Kconfig~ia32-MSI-support arch/i386/Kconfig
--- 25/arch/i386/Kconfig~ia32-MSI-support	Fri Oct  3 15:46:21 2003
+++ 25-akpm/arch/i386/Kconfig	Fri Oct  3 15:46:21 2003
@@ -1030,6 +1030,24 @@ config PCI_DIRECT
  	depends on PCI && ((PCI_GODIRECT || PCI_GOANY) || X86_VISWS)
 	default y
 
+config PCI_USE_VECTOR
+	bool "PCI_USE_VECTOR"
+	depends on X86_LOCAL_APIC
+	default n
+	help
+	   This replaces the existing IRQ-based index interrupt scheme
+	   with the vector-based index scheme. The advantages of vector base
+	   over IRQ base are listed below:
+	   1) Supports MSI implementation.
+	   2) Supports future IOxAPIC hotplug.
+
+	   Note that this enables MSI, Message Signaled Interrupts, on all
+	   MSI-capable device functions detected if users also install the
+	   MSI patch. A Message Signaled Interrupt enables an MSI-capable
+	   hardware device to send an inbound Memory Write on its PCI bus
+	   instead of asserting an IRQ signal on its device IRQ pin.
+
+	   If you don't know what to do here, say N.
+
 source "drivers/pci/Kconfig"
 
 config ISA
diff -puN arch/i386/kernel/i8259.c~ia32-MSI-support arch/i386/kernel/i8259.c
--- 25/arch/i386/kernel/i8259.c~ia32-MSI-support	Fri Oct  3 15:46:21 2003
+++ 25-akpm/arch/i386/kernel/i8259.c	Fri Oct  3 15:46:21 2003
@@ -419,8 +419,10 @@ void __init init_IRQ(void)
 	 * us. (some of these will be overridden and become
 	 * 'special' SMP interrupts)
 	 */
-	for (i = 0; i < NR_IRQS; i++) {
+	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
 		int vector = FIRST_EXTERNAL_VECTOR + i;
+		if (i >= NR_IRQS)
+			break;
 		if (vector != SYSCALL_VECTOR) 
 			set_intr_gate(vector, interrupt[i]);
 	}
diff -puN arch/i386/kernel/io_apic.c~ia32-MSI-support arch/i386/kernel/io_apic.c
--- 25/arch/i386/kernel/io_apic.c~ia32-MSI-support	Fri Oct  3 15:46:21 2003
+++ 25-akpm/arch/i386/kernel/io_apic.c	Fri Oct  3 15:47:56 2003
@@ -76,6 +76,14 @@ static struct irq_pin_list {
 	int apic, pin, next;
 } irq_2_pin[PIN_MAP_SIZE];
 
+#ifdef CONFIG_PCI_USE_VECTOR
+int vector_irq[NR_IRQS] = { [0 ... NR_IRQS -1] = -1};
+#define vector_to_irq(vector) 	\
+	(platform_legacy_irq(vector) ? vector : vector_irq[vector])
+#else
+#define vector_to_irq(vector)	(vector)
+#endif
+
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
  * shared ISA-space IRQs, so we have to support them. We are super
@@ -249,7 +257,7 @@ static void clear_IO_APIC (void)
 			clear_IO_APIC_pin(apic, pin);
 }
 
-static void set_ioapic_affinity(unsigned int irq, cpumask_t cpumask)
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 {
 	unsigned long flags;
 	int pin;
@@ -288,7 +296,7 @@ static void set_ioapic_affinity(unsigned
 
 extern cpumask_t irq_affinity[NR_IRQS];
 
-static cpumask_t __cacheline_aligned pending_irq_balance_cpumask[NR_IRQS];
+cpumask_t __cacheline_aligned pending_irq_balance_cpumask[NR_IRQS];
 
 #define IRQBALANCE_CHECK_ARCH -999
 static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH;
@@ -670,13 +678,13 @@ static int __init irqbalance_disable(cha
 
 __setup("noirqbalance", irqbalance_disable);
 
-static void set_ioapic_affinity(unsigned int irq, cpumask_t mask);
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask);
 
 static inline void move_irq(int irq)
 {
 	/* note - we hold the desc->lock */
 	if (unlikely(!cpus_empty(pending_irq_balance_cpumask[irq]))) {
-		set_ioapic_affinity(irq, pending_irq_balance_cpumask[irq]);
+		set_ioapic_affinity_irq(irq, pending_irq_balance_cpumask[irq]);
 		cpus_clear(pending_irq_balance_cpumask[irq]);
 	}
 }
@@ -853,7 +861,7 @@ void __init setup_ioapic_dest(cpumask_t 
 			if (irq_entry == -1)
 				continue;
 			irq = pin_2_irq(irq_entry, ioapic, pin);
-			set_ioapic_affinity(irq, mask);
+			set_ioapic_affinity_irq(irq, mask);
 		}
 
 	}
@@ -1140,6 +1148,7 @@ static inline int IO_APIC_irq_trigger(in
 
 int irq_vector[NR_IRQS] = { FIRST_DEVICE_VECTOR , 0 };
 
+#ifndef CONFIG_PCI_USE_VECTOR
 static int __init assign_irq_vector(int irq)
 {
 	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
@@ -1157,11 +1166,38 @@ next:
 	}
 
 	IO_APIC_VECTOR(irq) = current_vector;
+
 	return current_vector;
 }
+#else
+extern int assign_irq_vector(int irq);
+#endif
 
-static struct hw_interrupt_type ioapic_level_irq_type;
-static struct hw_interrupt_type ioapic_edge_irq_type;
+static struct hw_interrupt_type ioapic_level_type;
+static struct hw_interrupt_type ioapic_edge_type;
+
+#define IOAPIC_AUTO	-1
+#define IOAPIC_EDGE	0
+#define IOAPIC_LEVEL	1
+
+static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
+{
+	if (use_pci_vector() && !platform_legacy_irq(irq)) {
+		if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+				trigger == IOAPIC_LEVEL)
+			irq_desc[vector].handler = &ioapic_level_type;
+		else
+			irq_desc[vector].handler = &ioapic_edge_type;
+		set_intr_gate(vector, interrupt[vector]);
+	} else	{
+		if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+				trigger == IOAPIC_LEVEL)
+			irq_desc[irq].handler = &ioapic_level_type;
+		else
+			irq_desc[irq].handler = &ioapic_edge_type;
+		set_intr_gate(vector, interrupt[irq]);
+	}
+}
 
 void __init setup_IO_APIC_irqs(void)
 {
@@ -1219,13 +1255,7 @@ void __init setup_IO_APIC_irqs(void)
 		if (IO_APIC_IRQ(irq)) {
 			vector = assign_irq_vector(irq);
 			entry.vector = vector;
-
-			if (IO_APIC_irq_trigger(irq))
-				irq_desc[irq].handler = &ioapic_level_irq_type;
-			else
-				irq_desc[irq].handler = &ioapic_edge_irq_type;
-
-			set_intr_gate(vector, interrupt[irq]);
+			ioapic_register_intr(irq, vector, IOAPIC_AUTO);
 		
 			if (!apic && (irq < 16))
 				disable_8259A_irq(irq);
@@ -1272,7 +1302,7 @@ void __init setup_ExtINT_IRQ0_pin(unsign
 	 * The timer IRQ doesn't have to know that behind the
 	 * scene we have a 8259A-master in AEOI mode ...
 	 */
-	irq_desc[0].handler = &ioapic_edge_irq_type;
+	irq_desc[0].handler = &ioapic_edge_type;
 
 	/*
 	 * Add it to the IO-APIC irq-routing table:
@@ -1762,9 +1792,6 @@ static int __init timer_irq_works(void)
  * that was delayed but this is now handled in the device
  * independent code.
  */
-#define enable_edge_ioapic_irq unmask_IO_APIC_irq
-
-static void disable_edge_ioapic_irq (unsigned int irq) { /* nothing */ }
 
 /*
  * Starting up a edge-triggered IO-APIC interrupt is
@@ -1775,7 +1802,6 @@ static void disable_edge_ioapic_irq (uns
  * This is not complete - we should be able to fake
  * an edge even if it isn't on the 8259A...
  */
-
 static unsigned int startup_edge_ioapic_irq(unsigned int irq)
 {
 	int was_pending = 0;
@@ -1793,8 +1819,6 @@ static unsigned int startup_edge_ioapic_
 	return was_pending;
 }
 
-#define shutdown_edge_ioapic_irq	disable_edge_ioapic_irq
-
 /*
  * Once we have recorded IRQ_PENDING already, we can mask the
  * interrupt for real. This prevents IRQ storms from unhandled
@@ -1809,9 +1833,6 @@ static void ack_edge_ioapic_irq(unsigned
 	ack_APIC_irq();
 }
 
-static void end_edge_ioapic_irq (unsigned int i) { /* nothing */ }
-
-
 /*
  * Level triggered interrupts can just be masked,
  * and shutting down and starting up the interrupt
@@ -1833,10 +1854,6 @@ static unsigned int startup_level_ioapic
 	return 0; /* don't check for pending */
 }
 
-#define shutdown_level_ioapic_irq	mask_IO_APIC_irq
-#define enable_level_ioapic_irq		unmask_IO_APIC_irq
-#define disable_level_ioapic_irq	mask_IO_APIC_irq
-
 static void end_level_ioapic_irq (unsigned int irq)
 {
 	unsigned long v;
@@ -1863,6 +1880,7 @@ static void end_level_ioapic_irq (unsign
  * The idea is from Manfred Spraul.  --macro
  */
 	i = IO_APIC_VECTOR(irq);
+
 	v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
 
 	ack_APIC_irq();
@@ -1897,7 +1915,57 @@ static void end_level_ioapic_irq (unsign
 	}
 }
 
-static void mask_and_ack_level_ioapic_irq (unsigned int irq) { /* nothing */ }
+#ifdef CONFIG_PCI_USE_VECTOR
+static unsigned int startup_edge_ioapic_vector(unsigned int vector)
+{
+	int irq = vector_to_irq(vector);
+
+	return startup_edge_ioapic_irq(irq);
+}
+
+static void ack_edge_ioapic_vector(unsigned int vector)
+{
+	int irq = vector_to_irq(vector);
+
+	ack_edge_ioapic_irq(irq);
+}
+
+static unsigned int startup_level_ioapic_vector (unsigned int vector)
+{
+	int irq = vector_to_irq(vector);
+
+	return startup_level_ioapic_irq (irq);
+}
+
+static void end_level_ioapic_vector (unsigned int vector)
+{
+	int irq = vector_to_irq(vector);
+
+	end_level_ioapic_irq(irq);
+}
+
+static void mask_IO_APIC_vector (unsigned int vector)
+{
+	int irq = vector_to_irq(vector);
+
+	mask_IO_APIC_irq(irq);
+}
+
+static void unmask_IO_APIC_vector (unsigned int vector)
+{
+	int irq = vector_to_irq(vector);
+
+	unmask_IO_APIC_irq(irq);
+}
+
+static void set_ioapic_affinity_vector (unsigned int vector,
+					unsigned long cpu_mask)
+{
+	int irq = vector_to_irq(vector);
+
+	set_ioapic_affinity_irq(irq, cpu_mask);
+}
+#endif
 
 /*
  * Level and edge triggered IO-APIC interrupts need different handling,
@@ -1907,26 +1975,25 @@ static void mask_and_ack_level_ioapic_ir
  * edge-triggered handler, without risking IRQ storms and other ugly
  * races.
  */
-
-static struct hw_interrupt_type ioapic_edge_irq_type = {
+static struct hw_interrupt_type ioapic_edge_type = {
 	.typename 	= "IO-APIC-edge",
-	.startup 	= startup_edge_ioapic_irq,
-	.shutdown 	= shutdown_edge_ioapic_irq,
-	.enable 	= enable_edge_ioapic_irq,
-	.disable 	= disable_edge_ioapic_irq,
-	.ack 		= ack_edge_ioapic_irq,
-	.end 		= end_edge_ioapic_irq,
+	.startup 	= startup_edge_ioapic,
+	.shutdown 	= shutdown_edge_ioapic,
+	.enable 	= enable_edge_ioapic,
+	.disable 	= disable_edge_ioapic,
+	.ack 		= ack_edge_ioapic,
+	.end 		= end_edge_ioapic,
 	.set_affinity 	= set_ioapic_affinity,
 };
 
-static struct hw_interrupt_type ioapic_level_irq_type = {
+static struct hw_interrupt_type ioapic_level_type = {
 	.typename 	= "IO-APIC-level",
-	.startup 	= startup_level_ioapic_irq,
-	.shutdown 	= shutdown_level_ioapic_irq,
-	.enable 	= enable_level_ioapic_irq,
-	.disable 	= disable_level_ioapic_irq,
-	.ack 		= mask_and_ack_level_ioapic_irq,
-	.end 		= end_level_ioapic_irq,
+	.startup 	= startup_level_ioapic,
+	.shutdown 	= shutdown_level_ioapic,
+	.enable 	= enable_level_ioapic,
+	.disable 	= disable_level_ioapic,
+	.ack 		= mask_and_ack_level_ioapic,
+	.end 		= end_level_ioapic,
 	.set_affinity 	= set_ioapic_affinity,
 };
 
@@ -1946,7 +2013,13 @@ static inline void init_IO_APIC_traps(vo
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
 	for (irq = 0; irq < NR_IRQS ; irq++) {
-		if (IO_APIC_IRQ(irq) && !IO_APIC_VECTOR(irq)) {
+		int tmp = irq;
+		if (use_pci_vector()) {
+			if (!platform_legacy_irq(tmp))
+				if ((tmp = vector_to_irq(tmp)) == -1)
+					continue;
+		}
+		if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
 			/*
 			 * Hmm.. We don't have an entry for this,
 			 * so default to an old-fashioned 8259
@@ -2378,10 +2451,12 @@ int io_apic_set_pci_routing (int ioapic,
 		"IRQ %d Mode:%i Active:%i)\n", ioapic, 
 		mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, edge_level, active_high_low);
 
+ 	if (use_pci_vector() && !platform_legacy_irq(irq))
+		irq = IO_APIC_VECTOR(irq);
 	if (edge_level) {
-	irq_desc[irq].handler = &ioapic_level_irq_type;
+		irq_desc[irq].handler = &ioapic_level_type;
 	} else {
-		irq_desc[irq].handler = &ioapic_edge_irq_type;
+		irq_desc[irq].handler = &ioapic_edge_type;
 	}
 
 	set_intr_gate(entry.vector, interrupt[irq]);
diff -puN arch/i386/kernel/mpparse.c~ia32-MSI-support arch/i386/kernel/mpparse.c
--- 25/arch/i386/kernel/mpparse.c~ia32-MSI-support	Fri Oct  3 15:46:21 2003
+++ 25-akpm/arch/i386/kernel/mpparse.c	Fri Oct  3 15:46:21 2003
@@ -1140,15 +1140,19 @@ void __init mp_parse_prt (void)
 		if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
 			printk(KERN_DEBUG "Pin %d-%d already programmed\n",
 				mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-			entry->irq = irq;
+ 			if (use_pci_vector() && !platform_legacy_irq(irq))
+ 				irq = IO_APIC_VECTOR(irq);
+ 			entry->irq = irq;
 			continue;
 		}
 
 		mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
 
-		if (!io_apic_set_pci_routing(ioapic, ioapic_pin, irq, edge_level, active_high_low))
-			entry->irq = irq;
-
+		if (!io_apic_set_pci_routing(ioapic, ioapic_pin, irq, edge_level, active_high_low)) {
+ 			if (use_pci_vector() && !platform_legacy_irq(irq))
+ 				irq = IO_APIC_VECTOR(irq);
+ 			entry->irq = irq;
+ 		}
 		printk(KERN_DEBUG "%02x:%02x:%02x[%c] -> %d-%d -> IRQ %d\n",
 			entry->id.segment, entry->id.bus, 
 			entry->id.device, ('A' + entry->pin), 
diff -puN arch/i386/pci/irq.c~ia32-MSI-support arch/i386/pci/irq.c
--- 25/arch/i386/pci/irq.c~ia32-MSI-support	Fri Oct  3 15:46:21 2003
+++ 25-akpm/arch/i386/pci/irq.c	Fri Oct  3 15:46:21 2003
@@ -680,8 +680,10 @@ static int pcibios_lookup_irq(struct pci
 		    	if ( dev2->irq && dev2->irq != irq && \
 			(!(pci_probe & PCI_USE_PIRQ_MASK) || \
 			((1 << dev2->irq) & mask)) ) {
+#ifndef CONFIG_PCI_USE_VECTOR
 		    		printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
 				       pci_name(dev2), dev2->irq, irq);
+#endif
 		    		continue;
 		    	}
 			dev2->irq = irq;
@@ -745,6 +747,10 @@ static void __init pcibios_fixup_irqs(vo
 							bridge->bus->number, PCI_SLOT(bridge->devfn), pin, irq);
 				}
 				if (irq >= 0) {
+					if (use_pci_vector() &&
+						!platform_legacy_irq(irq))
+						irq = IO_APIC_VECTOR(irq);
+
 					printk(KERN_INFO "PCI->APIC IRQ transform: (B%d,I%d,P%d) -> %d\n",
 						dev->bus->number, PCI_SLOT(dev->devfn), pin, irq);
 					dev->irq = irq;
diff -puN /dev/null Documentation/MSI-HOWTO.txt
--- /dev/null	Thu Apr 11 07:25:15 2002
+++ 25-akpm/Documentation/MSI-HOWTO.txt	Fri Oct  3 15:46:21 2003
@@ -0,0 +1,321 @@
+		The MSI Driver Guide HOWTO
+	Tom L Nguyen tom.l.nguyen@intel.com
+			10/03/2003
+
+1. About this guide
+
+This guide describes the basics of Message Signaled Interrupts (MSI),
+the advantages of using MSI over traditional interrupt mechanisms, and
+how to enable your driver to use MSI or MSI-X. Also included is a list
+of Frequently Asked Questions.
+
+2. Copyright 2003 Intel Corporation
+
+3. What is MSI/MSI-X?
+
+Message Signaled Interrupt (MSI), as described in the PCI Local Bus
+Specification Revision 2.3 or later, is an optional feature for PCI
+devices and a required feature for PCI Express devices. MSI enables a
+device function to request service by sending an Inbound Memory Write
+on its PCI bus to the FSB as a Message Signaled Interrupt transaction.
+Because MSI is generated in the form of a Memory Write, all transaction
+conditions, such as a Retry, Master-Abort, Target-Abort or normal
+completion, are supported.
+
+A PCI device that supports MSI must also support the pin IRQ assertion
+interrupt mechanism to provide backward compatibility for systems that
+do not support MSI. In systems that support MSI, the bus driver is
+responsible for initializing the message address and message data of
+the device function's MSI/MSI-X capability structure during initial
+device configuration.
+
+An MSI capable device function indicates MSI support by implementing
+the MSI/MSI-X capability structure in its PCI capability list. The
+device function may implement both the MSI capability structure and
+the MSI-X capability structure; however, the bus driver should not
+enable both, but instead enable only the MSI-X capability structure.
+
+The MSI capability structure contains the Message Control register,
+the Message Address register and the Message Data register. These
+registers provide the bus driver control over MSI. The Message Control
+register indicates the MSI capability supported by the device. The
+Message Address register specifies the target address and the Message
+Data register specifies the characteristics of the message. To request
+service, the device function writes the content of the Message Data
+register to the target address. The device and its software driver
+are prohibited from writing to these registers.
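+
+As an illustration only, the capability layout (register offsets per
+the PCI 2.3 specification; the C structure and field names below are
+hypothetical and not part of this patch) may be sketched as:
+
+	struct msi_capability {		/* hypothetical sketch */
+		u8  cap_id;		/* 0x05 (PCI_CAP_ID_MSI) */
+		u8  next_ptr;		/* next capability in the list */
+		u16 msg_control;	/* Message Control register */
+		u32 msg_address_lo;	/* Message Address register */
+		u32 msg_address_hi;	/* present only if 64-bit capable */
+		u16 msg_data;		/* Message Data register */
+	};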
+
+The MSI-X capability structure is an optional extension to MSI. It
+uses an independent and separate capability structure. There are
+some key advantages to implementing the MSI-X capability structure
+over the MSI capability structure as described below.
+
+	- Support a larger maximum number of vectors per function.
+
+	- Provide the ability for system software to configure
+	each vector with an independent message address and message
+	data, specified by a table that resides in Memory Space.
+
+	- MSI and MSI-X both support per-vector masking. Per-vector
+	masking is an optional extension of MSI but a required
+	feature for MSI-X. Per-vector masking provides the kernel
+	the ability to mask/unmask an MSI when servicing its
+	interrupt service routine. If per-vector masking is not
+	supported, then the device driver should provide the
+	hardware/software synchronization to ensure that the device
+	generates MSI only when the driver wants it to do so.
+
+4. Why use MSI?
+
+As a benefit of simplified board design, MSI allows board designers
+to remove out-of-band interrupt routing. MSI is another step towards
+a legacy-free environment.
+
+Due to increasing pressure on chipset and processor packages to
+reduce pin count, the need for interrupt pins is expected to
+diminish over time. Devices, due to pin constraints, may implement
+messages to increase performance.
+
+PCI Express endpoints use INTx emulation (in-band messages) instead
+of IRQ pin assertion. Using INTx emulation requires interrupt
+sharing among devices connected to the same node (PCI bridge), while
+MSI is unique (non-shared) and does not require BIOS configuration
+support. As a result, the PCI Express technology requires MSI
+support for better interrupt performance.
+
+Using MSI enables the device functions to support two or more
+vectors, which can be configured to target different CPUs to
+increase scalability.
+
+5. Configuring a driver to use MSI/MSI-X
+
+Even once the patch is installed, the kernel will not, by default,
+enable MSI/MSI-X on devices that support this capability. A kernel
+configuration option must be selected to enable MSI/MSI-X support.
+
+5.1 Including MSI support into the kernel
+
+Including MSI support in the kernel requires users to apply the
+VECTOR-base patch first and then the MSI patch, because the MSI
+support needs the VECTOR-based scheme. Once these patches are
+installed, setting CONFIG_PCI_USE_VECTOR enables the VECTOR-based
+scheme and the option for MSI-capable device drivers to selectively
+enable MSI (using pci_enable_msi as described below).
+
+Since the target of the inbound message is the local APIC, the
+availability of CONFIG_PCI_USE_VECTOR depends on whether
+CONFIG_X86_LOCAL_APIC is enabled.
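+
+A minimal .config fragment satisfying these constraints (shown for
+illustration only; other options are unaffected) looks like:
+
+	CONFIG_X86_LOCAL_APIC=y
+	CONFIG_PCI_USE_VECTOR=y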
+
+int pci_enable_msi(struct pci_dev *)
+
+With this new API, any existing device driver that would like MSI
+enabled on its device function must call this API explicitly. A
+successful call will initialize the MSI/MSI-X capability structure
+with ONE vector, regardless of whether the device function is
+capable of supporting multiple messages. This vector replaces the
+pre-assigned dev->irq with a new MSI vector. To avoid a conflict
+between the newly assigned vector and the existing pre-assigned
+vector, the device driver must call this API before calling
+request_irq(...).
+
+The diagram below shows the events that switch the interrupt mode
+on an MSI-capable device function between MSI mode and PIN-IRQ
+assertion mode.
+
+	 ------------   pci_enable_msi 	 ------------------------
+	|	     | <===============	| 			 |
+	| MSI MODE   |	  	     	| PIN-IRQ ASSERTION MODE |
+	| 	     | ===============>	|			 |
+ 	 ------------	free_irq      	 ------------------------
+
+5.2 Configuring for MSI support
+
+Due to the non-contiguous fashion of vector assignment in the
+existing Linux kernel, this patch does not support multiple
+messages, regardless of whether the device function is capable of
+supporting more than one vector. The bus driver initializes only
+entry 0 of this capability if pci_enable_msi(...) is called
+successfully by the device driver.
+
+5.3 Configuring for MSI-X support
+
+Both the MSI capability structure and the MSI-X capability structure
+share the same semantics described above; however, because the
+system software can configure each vector of the MSI-X capability
+structure with an independent message address and message data, the
+non-contiguous fashion of vector assignment in the existing Linux
+kernel has no impact on supporting multiple messages on MSI-X
+capable device functions. By default, as mentioned above, ONE vector
+is always allocated to the MSI-X capability structure at entry 0.
+The bus driver does not initialize the other entries of the MSI-X
+table.
+
+Note that the PCI subsystem should have full control of an MSI-X
+table that resides in Memory Space. The software device driver
+should not access this table.
+
+To request additional vectors, the software device driver should
+call the function msi_alloc_vectors(). It is recommended that the
+software driver call this function once, during the initialization
+phase of the device driver.
+
+The function msi_alloc_vectors(), once invoked, enables either
+all or nothing, depending on the current availability of vector
+resources. If no vector resources are available, the device function
+still works with ONE vector. If the vector resources are available
+for the number of vectors requested by the driver, this function
+will reconfigure the MSI-X capability structure of the device with
+additional messages, starting from entry 1. For example, a device
+may be capable of supporting a maximum of 32 vectors while its
+software driver requests only 4 vectors.
+
+For each vector, after this successful call, the device driver is
+responsible for calling other functions like request_irq(),
+enable_irq(), etc. to enable this vector with its corresponding
+interrupt service handler. It is the device driver's choice whether
+to have all vectors share the same interrupt service handler or to
+give each vector a unique interrupt service handler.
+
+In addition to the function msi_alloc_vectors(), another function
+msi_free_vectors() is provided to allow the software driver to
+release a number of vectors back to the vector resources. Once
+invoked, the PCI subsystem disables (masks) each vector released.
+These vectors are no longer valid for the hardware device and its
+software driver to use. As with free_irq, it is recommended that
+the device driver call msi_free_vectors to release all additional
+vectors previously requested.
+
+int msi_alloc_vectors(struct pci_dev *dev, int *vector, int nvec)
+
+This API enables the software driver to request the PCI subsystem
+for additional messages. Depending on the number of vectors
+available, the PCI subsystem enables either all or nothing.
+
+Argument dev points to the device (pci_dev) structure.
+Argument vector is a pointer of integer type. The number of
+elements is indicated in argument nvec.
+Argument nvec is an integer indicating the number of messages
+requested.
+A return of zero indicates that the requested number of vectors was
+successfully allocated. Otherwise, it indicates that the resources
+are not available.
+
+int msi_free_vectors(struct pci_dev* dev, int *vector, int nvec)
+
+This API enables the software driver to inform the PCI subsystem
+that it is willing to release a number of vectors back to the
+MSI resource pool. Once invoked, the PCI subsystem disables each
+MSI-X entry associated with each vector stored in argument vector.
+These vectors are no longer valid for the hardware device and
+its software driver to use.
+
+Argument dev points to the device (pci_dev) structure.
+Argument vector is a pointer of integer type. The number of
+elements is indicated in argument nvec.
+Argument nvec is an integer indicating the number of messages
+released.
+A return of zero indicates that the requested number of vectors
+was successfully released. Otherwise, it indicates a failure.
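+
+A minimal usage sketch combining the two APIs above (the array size
+and handler name are hypothetical; per the description above,
+msi_free_vectors is called before free_irq):
+
+	int i, vectors[4];
+
+	if (!msi_alloc_vectors(dev, vectors, 4)) {
+		for (i = 0; i < 4; i++)
+			request_irq(vectors[i], my_handler, 0,
+				    "my_device", dev);
+		/* ... device operates with entry 0 plus 4 vectors ... */
+		msi_free_vectors(dev, vectors, 4);
+		for (i = 0; i < 4; i++)
+			free_irq(vectors[i], dev);
+	}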
+
+5.4 Hardware requirements for MSI support
+
+MSI support requires support from both system hardware and
+individual hardware device functions.
+
+5.4.1 System hardware support
+
+Since the target of the MSI address is the local APIC CPU, enabling
+MSI support in the Linux kernel depends on whether the existing
+system hardware supports a local APIC. Users should verify that
+their system runs when CONFIG_X86_LOCAL_APIC=y.
+
+In an SMP environment, CONFIG_X86_LOCAL_APIC is automatically set;
+however, in a UP environment, users must manually set
+CONFIG_X86_LOCAL_APIC. Once CONFIG_X86_LOCAL_APIC=y, setting
+CONFIG_PCI_USE_VECTOR enables the VECTOR-based scheme and
+the option for MSI-capable device drivers to selectively enable
+MSI (using pci_enable_msi as described in section 5.1).
+
+Note that the CONFIG_X86_IO_APIC setting is irrelevant because the
+MSI vector is newly allocated at runtime and MSI support does not
+depend on BIOS support. This key independence enables MSI support
+on future IOxAPIC-free platforms.
+
+5.4.2 Device hardware support
+
+The hardware device function supports MSI by indicating the
+MSI/MSI-X capability structure in its PCI capability list. By
+default, this capability structure will not be initialized by
+the kernel to enable MSI during the system boot. In other words,
+the device function is running in its default pin-assertion mode.
+Note that in many cases hardware supporting MSI has bugs, which
+may result in a system hang. The software driver of a specific
+piece of MSI-capable hardware is responsible for deciding whether
+to call pci_enable_msi or not. A return of zero indicates that the
+kernel successfully initialized the MSI/MSI-X capability structure
+of the device function. The device function is now running in MSI
+mode.
+
+5.5 How to tell whether MSI is enabled on a device function
+
+At the driver level, a return of zero from pci_enable_msi(...)
+indicates to the device driver that its device function is
+initialized successfully and ready to run in MSI mode.
+
+At the user level, users can use the command 'cat /proc/interrupts'
+to display the vectors allocated to each device and their interrupt
+modes, as shown below.
+
+           CPU0       CPU1
+  0:     324639          0    IO-APIC-edge  timer
+  1:       1186          0    IO-APIC-edge  i8042
+  2:          0          0          XT-PIC  cascade
+ 12:       2797          0    IO-APIC-edge  i8042
+ 14:       6543          0    IO-APIC-edge  ide0
+ 15:          1          0    IO-APIC-edge  ide1
+169:          0          0   IO-APIC-level  uhci-hcd
+185:          0          0   IO-APIC-level  uhci-hcd
+193:        138         10         PCI MSI  aic79xx
+201:         30          0         PCI MSI  aic79xx
+225:         30          0   IO-APIC-level  aic7xxx
+233:         30          0   IO-APIC-level  aic7xxx
+NMI:          0          0
+LOC:     324553     325068
+ERR:          0
+MIS:          0
+
+6. FAQ
+
+Q1. Are there any limitations on using MSI?
+
+A1. If the PCI device supports MSI and conforms to the
+specification and the platform supports the APIC local bus,
+then using MSI should work.
+
+Q2. Will it work on all Pentium processors (P3, P4, Xeon,
+AMD processors)? In the P3, IPIs are transmitted on the APIC local
+bus, and in the P4 and Xeon they are transmitted on the system
+bus. Are there any implications with this?
+
+A2. MSI support enables a PCI device to send an inbound
+memory write (0xfeexxxxx as the target address) on its PCI bus
+directly to the FSB. Since the message address has the
+redirection hint bit cleared, it should work.
+
+Q3. The target address 0xfeexxxxx will be translated by the
+Host Bridge into an interrupt message. Are there any
+limitations on the chipsets such as Intel 8xx, Intel e7xxx,
+or VIA?
+
+A3. If these chipsets support an inbound memory write with the
+target address set as 0xfeexxxxx, conforming to the PCI
+specification 2.3 or later, then it should work.
+
+Q4. From the driver's point of view, if the MSI is lost because
+of errors that occur during the inbound memory write, then it may
+wait forever. Is there a mechanism for it to recover?
+
+A4. Since the target of the transaction is an inbound memory
+write, all transaction termination conditions (Retry,
+Master-Abort, Target-Abort, or normal completion) are
+supported. A device sending an MSI must abide by all the PCI
+rules and conditions regarding that inbound memory write. So,
+if a retry is signaled it must retry, etc... We believe that
+the recommendation for an Abort is also a retry (refer to the PCI
+specification 2.3 or later).
diff -puN drivers/acpi/pci_irq.c~ia32-MSI-support drivers/acpi/pci_irq.c
--- 25/drivers/acpi/pci_irq.c~ia32-MSI-support	Fri Oct  3 15:46:21 2003
+++ 25-akpm/drivers/acpi/pci_irq.c	Fri Oct  3 15:46:21 2003
@@ -237,7 +237,7 @@ acpi_pci_irq_add_prt (
                           PCI Interrupt Routing Support
    -------------------------------------------------------------------------- */
 
-static int
+int
 acpi_pci_irq_lookup (struct pci_bus *bus, int device, int pin)
 {
 	struct acpi_prt_entry	*entry = NULL;
diff -puN drivers/pci/Makefile~ia32-MSI-support drivers/pci/Makefile
--- 25/drivers/pci/Makefile~ia32-MSI-support	Fri Oct  3 15:46:21 2003
+++ 25-akpm/drivers/pci/Makefile	Fri Oct  3 15:46:21 2003
@@ -27,6 +27,7 @@ obj-$(CONFIG_PPC64) += setup-bus.o
 obj-$(CONFIG_SGI_IP27) += setup-irq.o
 obj-$(CONFIG_SGI_IP32) += setup-irq.o
 obj-$(CONFIG_X86_VISWS) += setup-irq.o
+obj-$(CONFIG_PCI_USE_VECTOR) += msi.o
 
 # Cardbus & CompactPCI use setup-bus
 obj-$(CONFIG_HOTPLUG) += setup-bus.o
diff -puN /dev/null drivers/pci/msi.c
--- /dev/null	Thu Apr 11 07:25:15 2002
+++ 25-akpm/drivers/pci/msi.c	Fri Oct  3 15:46:21 2003
@@ -0,0 +1,1061 @@
+/*
+ * linux/drivers/pci/msi.c
+ */
+
+#include <linux/mm.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/config.h>
+#include <linux/ioport.h>
+#include <linux/smp_lock.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/acpi.h>
+
+#include <asm/errno.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/desc.h>
+#include <mach_apic.h>
+
+#include <linux/pci_msi.h>
+
+_DEFINE_DBG_BUFFER
+
+static spinlock_t msi_lock = SPIN_LOCK_UNLOCKED;
+static struct msi_desc* msi_desc[NR_IRQS] = { [0 ... NR_IRQS-1] = NULL };
+static kmem_cache_t* msi_cachep;
+
+static int pci_msi_enable = 1;
+static int nr_alloc_vectors = 0;
+static int nr_released_vectors = 0;
+static int nr_reserved_vectors = NR_HP_RESERVED_VECTORS;
+static int nr_msix_devices = 0;
+
+#ifndef CONFIG_X86_IO_APIC
+int vector_irq[NR_IRQS] = { [0 ... NR_IRQS -1] = -1};
+int irq_vector[NR_IRQS] = { FIRST_DEVICE_VECTOR , 0 };
+#endif
+
+static void msi_cache_ctor(void *p, kmem_cache_t *cache, unsigned long flags)
+{
+	memset(p, 0, NR_IRQS * sizeof(struct msi_desc));
+}
+
+static int msi_cache_init(void)
+{
+	msi_cachep = kmem_cache_create("msi_cache",
+			NR_IRQS * sizeof(struct msi_desc),
+		       	0, SLAB_HWCACHE_ALIGN, msi_cache_ctor, NULL);
+	if (!msi_cachep)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void msi_set_mask_bit(unsigned int vector, int flag)
+{
+	struct msi_desc *entry;
+
+	entry = (struct msi_desc *)msi_desc[vector];
+	if (!entry || !entry->dev || !entry->mask_base)
+		return;
+	switch (entry->msi_attrib.type) {
+	case PCI_CAP_ID_MSI:
+	{
+		int		pos;
+		unsigned int	mask_bits;
+
+		pos = entry->mask_base;
+	        entry->dev->bus->ops->read(entry->dev->bus, entry->dev->devfn,
+				pos, 4, &mask_bits);
+		mask_bits &= ~(1);
+		mask_bits |= flag;
+	        entry->dev->bus->ops->write(entry->dev->bus, entry->dev->devfn,
+				pos, 4, mask_bits);
+		break;
+	}
+	case PCI_CAP_ID_MSIX:
+	{
+		int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+			PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+		writel(flag, entry->mask_base + offset);
+		break;
+	}
+	default:
+		break;
+	}
+}
+
+#ifdef CONFIG_SMP
+static void set_msi_affinity(unsigned int vector, unsigned long cpu_mask)
+{
+	struct msi_desc *entry;
+	struct msg_address address;
+	unsigned int dest_id;
+
+	entry = (struct msi_desc *)msi_desc[vector];
+	if (!entry || !entry->dev)
+		return;
+
+	switch (entry->msi_attrib.type) {
+	case PCI_CAP_ID_MSI:
+	{
+		int pos;
+
+   		if (!(pos = pci_find_capability(entry->dev, PCI_CAP_ID_MSI)))
+			return;
+
+	        entry->dev->bus->ops->read(entry->dev->bus, entry->dev->devfn,
+			msi_lower_address_reg(pos), 4,
+			&address.lo_address.value);
+		dest_id = (address.lo_address.u.dest_id &
+			MSI_ADDRESS_HEADER_MASK) |
+			(cpu_mask_to_apicid(cpu_mask) << MSI_TARGET_CPU_SHIFT);
+		address.lo_address.u.dest_id = dest_id;
+		entry->msi_attrib.current_cpu = cpu_mask_to_apicid(cpu_mask);
+		entry->dev->bus->ops->write(entry->dev->bus, entry->dev->devfn,
+			msi_lower_address_reg(pos), 4,
+			address.lo_address.value);
+		break;
+	}
+	case PCI_CAP_ID_MSIX:
+	{
+		int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+			PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET;
+
+		address.lo_address.value = readl(entry->mask_base + offset);
+		dest_id = (address.lo_address.u.dest_id &
+			MSI_ADDRESS_HEADER_MASK) |
+			(cpu_mask_to_apicid(cpu_mask) << MSI_TARGET_CPU_SHIFT);
+		address.lo_address.u.dest_id = dest_id;
+		entry->msi_attrib.current_cpu = cpu_mask_to_apicid(cpu_mask);
+		writel(address.lo_address.value, entry->mask_base + offset);
+		break;
+	}
+	default:
+		break;
+	}
+}
+
+static inline void move_msi(int vector)
+{
+	if (unlikely(pending_irq_balance_cpumask[vector])) {
+		set_msi_affinity(vector, pending_irq_balance_cpumask[vector]);
+		pending_irq_balance_cpumask[vector] = 0;
+	}
+}
+#else
+/* UP build: no affinity support and no pending balance work to move */
+#define set_msi_affinity	NULL
+#define move_msi(vector)	do { } while (0)
+#endif
+
+static void mask_MSI_irq(unsigned int vector)
+{
+	msi_set_mask_bit(vector, 1);
+}
+
+static void unmask_MSI_irq(unsigned int vector)
+{
+	msi_set_mask_bit(vector, 0);
+}
+
+static unsigned int startup_msi_irq_wo_maskbit(unsigned int vector)
+{
+	return 0;	/* never anything pending */
+}
+
+static void pci_disable_msi(unsigned int vector);
+static void shutdown_msi_irq(unsigned int vector)
+{
+	pci_disable_msi(vector);
+}
+
+#define shutdown_msi_irq_wo_maskbit	shutdown_msi_irq
+static void enable_msi_irq_wo_maskbit(unsigned int vector) {}
+static void disable_msi_irq_wo_maskbit(unsigned int vector) {}
+static void ack_msi_irq_wo_maskbit(unsigned int vector) {}
+static void end_msi_irq_wo_maskbit(unsigned int vector)
+{
+	move_msi(vector);
+	ack_APIC_irq();
+}
+
+static unsigned int startup_msi_irq_w_maskbit(unsigned int vector)
+{
+	unmask_MSI_irq(vector);
+	return 0;	/* never anything pending */
+}
+
+#define shutdown_msi_irq_w_maskbit	shutdown_msi_irq
+#define enable_msi_irq_w_maskbit	unmask_MSI_irq
+#define disable_msi_irq_w_maskbit	mask_MSI_irq
+#define ack_msi_irq_w_maskbit		mask_MSI_irq
+
+static void end_msi_irq_w_maskbit(unsigned int vector)
+{
+	move_msi(vector);
+	unmask_MSI_irq(vector);
+	ack_APIC_irq();
+}
+
+/*
+ * Interrupt Type for MSI-X PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI-X Capability Structure.
+ */
+static struct hw_interrupt_type msix_irq_type = {
+	.typename	= "PCI MSI-X",
+	.startup	= startup_msi_irq_w_maskbit,
+	.shutdown	= shutdown_msi_irq_w_maskbit,
+	.enable		= enable_msi_irq_w_maskbit,
+	.disable	= disable_msi_irq_w_maskbit,
+	.ack		= ack_msi_irq_w_maskbit,
+	.end		= end_msi_irq_w_maskbit,
+	.set_affinity	= set_msi_affinity
+};
+
+/*
+ * Interrupt Type for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI Capability Structure with
+ * Mask-and-Pending Bits.
+ */
+static struct hw_interrupt_type msi_irq_w_maskbit_type = {
+	.typename	= "PCI MSI",
+	.startup	= startup_msi_irq_w_maskbit,
+	.shutdown	= shutdown_msi_irq_w_maskbit,
+	.enable		= enable_msi_irq_w_maskbit,
+	.disable	= disable_msi_irq_w_maskbit,
+	.ack		= ack_msi_irq_w_maskbit,
+	.end		= end_msi_irq_w_maskbit,
+	.set_affinity	= set_msi_affinity
+};
+
+/*
+ * Interrupt Type for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI Capability Structure without
+ * Mask-and-Pending Bits.
+ */
+static struct hw_interrupt_type msi_irq_wo_maskbit_type = {
+	.typename	= "PCI MSI",
+	.startup	= startup_msi_irq_wo_maskbit,
+	.shutdown	= shutdown_msi_irq_wo_maskbit,
+	.enable		= enable_msi_irq_wo_maskbit,
+	.disable	= disable_msi_irq_wo_maskbit,
+	.ack		= ack_msi_irq_wo_maskbit,
+	.end		= end_msi_irq_wo_maskbit,
+	.set_affinity	= set_msi_affinity
+};
+
+static void msi_data_init(struct msg_data *msi_data,
+			  unsigned int vector)
+{
+	memset(msi_data, 0, sizeof(struct msg_data));
+	msi_data->vector = (u8)vector;
+	msi_data->delivery_mode = MSI_DELIVERY_MODE;
+	msi_data->level = MSI_LEVEL_MODE;
+	msi_data->trigger = MSI_TRIGGER_MODE;
+}
+
+static void msi_address_init(struct msg_address *msi_address)
+{
+	unsigned int	dest_id;
+
+	memset(msi_address, 0, sizeof(struct msg_address));
+	msi_address->hi_address = (u32)0;
+	dest_id = (MSI_ADDRESS_HEADER << MSI_ADDRESS_HEADER_SHIFT) |
+		 (MSI_TARGET_CPU << MSI_TARGET_CPU_SHIFT);
+	msi_address->lo_address.u.dest_mode = MSI_LOGICAL_MODE;
+	msi_address->lo_address.u.redirection_hint = MSI_REDIRECTION_HINT_MODE;
+	msi_address->lo_address.u.dest_id = dest_id;
+}
+
+static int pci_vector_resources(void)
+{
+	static int res = -EINVAL;
+	int nr_free_vectors;
+
+	if (res == -EINVAL) {
+		int i, repeat;
+		for (i = NR_REPEATS; i > 0; i--) {
+			if ((FIRST_DEVICE_VECTOR + i * 8) > FIRST_SYSTEM_VECTOR)
+				continue;
+			break;
+		}
+		i++;
+		repeat = (FIRST_SYSTEM_VECTOR - FIRST_DEVICE_VECTOR)/i;
+		res = i * repeat - NR_RESERVED_VECTORS + 1;
+	}
+
+	nr_free_vectors = res + nr_released_vectors - nr_alloc_vectors;
+
+	return nr_free_vectors;
+}
+
+int assign_irq_vector(int irq)
+{
+	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
+
+	if (irq != MSI_AUTO && IO_APIC_VECTOR(irq) > 0)
+		return IO_APIC_VECTOR(irq);
+next:
+	current_vector += 8;
+	if (current_vector == SYSCALL_VECTOR)
+		goto next;
+
+	if (current_vector > FIRST_SYSTEM_VECTOR) {
+		offset++;
+		current_vector = FIRST_DEVICE_VECTOR + offset;
+	}
+
+	if (current_vector == FIRST_SYSTEM_VECTOR)
+		return -ENOSPC;
+
+	vector_irq[current_vector] = irq;
+	if (irq != MSI_AUTO)
+		IO_APIC_VECTOR(irq) = current_vector;
+
+	nr_alloc_vectors++;
+
+	return current_vector;
+}
+
+static int assign_msi_vector(void)
+{
+	static int new_vector_avail = 1;
+	int vector;
+	unsigned long flags;
+
+	/*
+	 * msi_lock is provided to ensure that each successfully allocated
+	 * MSI vector is unique among drivers.
+	 */
+	spin_lock_irqsave(&msi_lock, flags);
+	if (!(pci_vector_resources() > 0)) {
+		spin_unlock_irqrestore(&msi_lock, flags);
+		return -EBUSY;
+	}
+
+	if (!new_vector_avail) {
+		/*
+	 	 * vector_irq[] = -1 indicates that this specific vector is:
+	 	 * - assigned for MSI (since MSI have no associated IRQ) or
+	 	 * - assigned for legacy if less than 16, or
+	 	 * - having no corresponding 1:1 vector-to-IOxAPIC IRQ mapping
+	 	 * vector_irq[] = 0 indicates that this vector, previously
+		 * assigned for MSI, was freed by a hotplug remove operation.
+		 * This vector will be reused by any subsequent hotplug add
+		 * operation.
+	 	 * vector_irq[] > 0 indicates that this vector is assigned for
+		 * IOxAPIC IRQs. This vector and its value provides a 1-to-1
+		 * vector-to-IOxAPIC IRQ mapping.
+	 	 */
+		for (vector = FIRST_DEVICE_VECTOR; vector < NR_IRQS; vector++) {
+			if (vector_irq[vector] != 0)
+				continue;
+			vector_irq[vector] = -1;
+			nr_released_vectors--;
+			spin_unlock_irqrestore(&msi_lock, flags);
+			return vector;
+		}
+		spin_unlock_irqrestore(&msi_lock, flags);
+		return -EBUSY;
+	}
+
+	vector = assign_irq_vector(MSI_AUTO);
+	if (vector  == (FIRST_SYSTEM_VECTOR - 8))
+		new_vector_avail = 0;
+
+	spin_unlock_irqrestore(&msi_lock, flags);
+	return vector;
+}
+
+static int get_new_vector(void)
+{
+	int vector;
+
+	if ((vector = assign_msi_vector()) > 0)
+		set_intr_gate(vector, interrupt[vector]);
+
+	return vector;
+}
+
+static int msi_init(void)
+{
+	static int status = -ENOMEM;
+
+	if (!status)
+		return status;
+
+	if ((status = msi_cache_init()) < 0) {
+		pci_msi_enable = 0;
+		printk(KERN_INFO "WARNING: MSI INIT FAILURE\n");
+		return status;
+	}
+	printk(KERN_INFO "MSI INIT SUCCESS\n");
+
+	return status;
+}
+
+static int get_msi_vector(struct pci_dev *dev)
+{
+	return get_new_vector();
+}
+
+static struct msi_desc* alloc_msi_entry(void)
+{
+	struct msi_desc *entry;
+
+	entry = (struct msi_desc*) kmem_cache_alloc(msi_cachep, SLAB_KERNEL);
+	if (!entry)
+		return NULL;
+
+	memset(entry, 0, sizeof(struct msi_desc));
+	entry->link.tail = entry->link.head = 0;	/* single message */
+	entry->dev = NULL;
+
+	return entry;
+}
+
+static void attach_msi_entry(struct msi_desc *entry, int vector)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&msi_lock, flags);
+	msi_desc[vector] = entry;
+	spin_unlock_irqrestore(&msi_lock, flags);
+}
+
+static void irq_handler_init(int cap_id, int pos, int mask)
+{
+	spin_lock(&irq_desc[pos].lock);
+	if (cap_id == PCI_CAP_ID_MSIX)
+		irq_desc[pos].handler = &msix_irq_type;
+	else {
+		if (!mask)
+			irq_desc[pos].handler = &msi_irq_wo_maskbit_type;
+		else
+			irq_desc[pos].handler = &msi_irq_w_maskbit_type;
+	}
+	spin_unlock(&irq_desc[pos].lock);
+}
+
+static void enable_msi_mode(struct pci_dev *dev, int pos, int type)
+{
+	u32 control;
+
+	dev->bus->ops->read(dev->bus, dev->devfn,
+		msi_control_reg(pos), 2, &control);
+	if (type == PCI_CAP_ID_MSI) {
+		/* Set enabled bits to single MSI & enable MSI_enable bit */
+		msi_enable(control, 1);
+	        dev->bus->ops->write(dev->bus, dev->devfn,
+			msi_control_reg(pos), 2, control);
+	} else {
+		msix_enable(control);
+	        dev->bus->ops->write(dev->bus, dev->devfn,
+			msi_control_reg(pos), 2, control);
+	}
+    	if (pci_find_capability(dev, PCI_CAP_ID_EXP)) {
+		/* PCI Express Endpoint device detected */
+		u32 cmd;
+	        dev->bus->ops->read(dev->bus, dev->devfn, PCI_COMMAND, 2, &cmd);
+		cmd |= PCI_COMMAND_INTX_DISABLE;
+	        dev->bus->ops->write(dev->bus, dev->devfn, PCI_COMMAND, 2, cmd);
+	}
+}
+
+static void disable_msi_mode(struct pci_dev *dev, int pos, int type)
+{
+	u32 control;
+
+	dev->bus->ops->read(dev->bus, dev->devfn,
+		msi_control_reg(pos), 2, &control);
+	if (type == PCI_CAP_ID_MSI) {
+		/* Set enabled bits to single MSI & enable MSI_enable bit */
+		msi_disable(control);
+	        dev->bus->ops->write(dev->bus, dev->devfn,
+			msi_control_reg(pos), 2, control);
+	} else {
+		msix_disable(control);
+	        dev->bus->ops->write(dev->bus, dev->devfn,
+			msi_control_reg(pos), 2, control);
+	}
+    	if (pci_find_capability(dev, PCI_CAP_ID_EXP)) {
+		/* PCI Express Endpoint device detected */
+		u32 cmd;
+	        dev->bus->ops->read(dev->bus, dev->devfn, PCI_COMMAND, 2, &cmd);
+		cmd &= ~PCI_COMMAND_INTX_DISABLE;
+	        dev->bus->ops->write(dev->bus, dev->devfn, PCI_COMMAND, 2, cmd);
+	}
+}
+
+static int msi_lookup_vector(struct pci_dev *dev)
+{
+	int vector;
+	unsigned long flags;
+
+	spin_lock_irqsave(&msi_lock, flags);
+	for (vector = FIRST_DEVICE_VECTOR; vector < NR_IRQS; vector++) {
+		if (!msi_desc[vector] || msi_desc[vector]->dev != dev ||
+			msi_desc[vector]->msi_attrib.entry_nr ||
+			msi_desc[vector]->msi_attrib.default_vector != dev->irq)
+			continue;	/* not entry 0, skip */
+		spin_unlock_irqrestore(&msi_lock, flags);
+		/* This pre-assigned entry-0 MSI vector for this device
+		   already exists. Override dev->irq with this vector */
+		dev->irq = vector;
+		return 0;
+	}
+	spin_unlock_irqrestore(&msi_lock, flags);
+
+	return -EACCES;
+}
+
+void pci_scan_msi_device(struct pci_dev *dev)
+{
+	if (!dev)
+		return;
+
+   	if (pci_find_capability(dev, PCI_CAP_ID_MSIX) > 0) {
+		nr_reserved_vectors++;
+		nr_msix_devices++;
+	} else if (pci_find_capability(dev, PCI_CAP_ID_MSI) > 0)
+		nr_reserved_vectors++;
+}
+
+/**
+ * msi_capability_init - configure device's MSI capability structure
+ * @dev: pointer to the pci_dev data structure of MSI device function
+ *
+ * Setup the MSI capability structure of the device function with a
+ * single MSI vector, regardless of whether the device function is
+ * capable of handling multiple messages. A return of zero indicates
+ * the successful setup of entry zero with the new MSI vector, or
+ * non-zero otherwise.
+ **/
+static int msi_capability_init(struct pci_dev *dev)
+{
+	struct msi_desc *entry;
+	struct msg_address address;
+	struct msg_data data;
+	int pos, vector;
+	u32 control;
+
+   	pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
+	if (!pos)
+		return -EINVAL;
+
+	dev->bus->ops->read(dev->bus, dev->devfn, msi_control_reg(pos),
+		2, &control);
+	if (control & PCI_MSI_FLAGS_ENABLE)
+		return 0;
+
+	vector = dev->irq;
+	if (vector > 0 && !msi_lookup_vector(dev)) {
+		/* Lookup Success */
+		enable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
+		return 0;
+	}
+	/* MSI Entry Initialization */
+	if (!(entry = alloc_msi_entry()))
+		return -ENOMEM;
+
+	if ((vector = get_msi_vector(dev)) < 0) {
+		kmem_cache_free(msi_cachep, entry);
+		return -EBUSY;
+	}
+	entry->msi_attrib.type = PCI_CAP_ID_MSI;
+	entry->msi_attrib.entry_nr = 0;
+	entry->msi_attrib.maskbit = is_mask_bit_support(control);
+	entry->msi_attrib.default_vector = dev->irq;
+	dev->irq = vector;	/* replace dev->irq with the new MSI vector */
+	entry->dev = dev;
+	if (is_mask_bit_support(control)) {
+		entry->mask_base = msi_mask_bits_reg(pos,
+				is_64bit_address(control));
+	}
+	/* Replace with MSI handler */
+	irq_handler_init(PCI_CAP_ID_MSI, vector, entry->msi_attrib.maskbit);
+	/* Configure MSI capability structure */
+	msi_address_init(&address);
+	msi_data_init(&data, vector);
+	entry->msi_attrib.current_cpu = ((address.lo_address.u.dest_id >>
+				MSI_TARGET_CPU_SHIFT) & MSI_TARGET_CPU_MASK);
+	dev->bus->ops->write(dev->bus, dev->devfn, msi_lower_address_reg(pos),
+				4, address.lo_address.value);
+	if (is_64bit_address(control)) {
+		dev->bus->ops->write(dev->bus, dev->devfn,
+			msi_upper_address_reg(pos), 4, address.hi_address);
+		dev->bus->ops->write(dev->bus, dev->devfn,
+			msi_data_reg(pos, 1), 2, *((u32*)&data));
+	} else
+		dev->bus->ops->write(dev->bus, dev->devfn,
+			msi_data_reg(pos, 0), 2, *((u32*)&data));
+	if (entry->msi_attrib.maskbit) {
+		unsigned int maskbits, temp;
+		/* All MSIs are unmasked by default, Mask them all */
+	        dev->bus->ops->read(dev->bus, dev->devfn,
+			msi_mask_bits_reg(pos, is_64bit_address(control)), 4,
+			&maskbits);
+		temp = (1 << multi_msi_capable(control));
+		temp = ((temp - 1) & ~temp);
+		maskbits |= temp;
+		dev->bus->ops->write(dev->bus, dev->devfn,
+			msi_mask_bits_reg(pos, is_64bit_address(control)), 4,
+			maskbits);
+	}
+	attach_msi_entry(entry, vector);
+	/* Set MSI enabled bits	 */
+	enable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
+
+	return 0;
+}
+
+/**
+ * msix_capability_init - configure device's MSI-X capability
+ * @dev: pointer to the pci_dev data structure of MSI-X device function
+ *
+ * Setup the MSI-X capability structure of the device function with a
+ * single MSI-X vector. A return of zero indicates the successful setup
+ * of entry zero with the new MSI-X vector, or non-zero otherwise.
+ * To request additional MSI-X vectors, the device drivers are
+ * required to utilize the following supported APIs:
+ * 1) msi_alloc_vectors(...) for requesting one or more MSI-X vectors
+ * 2) msi_free_vectors(...) for releasing one or more MSI-X vectors
+ *    back to PCI subsystem before calling free_irq(...)
+ **/
+static int msix_capability_init(struct pci_dev	*dev)
+{
+	struct msi_desc *entry;
+	struct msg_address address;
+	struct msg_data data;
+	int vector, pos, dev_msi_cap;
+	u32 phys_addr, table_offset;
+	u32 control;
+	u8 bir;
+	void *base;
+
+   	pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+	if (!pos)
+		return -EINVAL;
+
+	/* Request & Map MSI-X table region */
+	dev->bus->ops->read(dev->bus, dev->devfn, msi_control_reg(pos), 2,
+		&control);
+	if (control & PCI_MSIX_FLAGS_ENABLE)
+		return 0;
+
+	vector = dev->irq;
+	if (vector > 0 && !msi_lookup_vector(dev)) {
+		/* Lookup Success */
+		enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
+		return 0;
+	}
+
+	dev_msi_cap = multi_msix_capable(control);
+	dev->bus->ops->read(dev->bus, dev->devfn,
+		msix_table_offset_reg(pos), 4, &table_offset);
+	bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
+	phys_addr = pci_resource_start (dev, bir);
+	phys_addr += (u32)(table_offset & ~PCI_MSIX_FLAGS_BIRMASK);
+	if (!request_mem_region(phys_addr,
+		dev_msi_cap * PCI_MSIX_ENTRY_SIZE,
+		"MSI-X iomap Failure"))
+		return -ENOMEM;
+	base = ioremap_nocache(phys_addr, dev_msi_cap * PCI_MSIX_ENTRY_SIZE);
+	if (base == NULL)
+		goto free_region;
+	/* MSI Entry Initialization */
+	entry = alloc_msi_entry();
+	if (!entry)
+		goto free_iomap;
+	if ((vector = get_msi_vector(dev)) < 0)
+		goto free_entry;
+
+	entry->msi_attrib.type = PCI_CAP_ID_MSIX;
+	entry->msi_attrib.entry_nr = 0;
+	entry->msi_attrib.maskbit = 1;
+	entry->msi_attrib.default_vector = dev->irq;
+	dev->irq = vector;	/* replace dev->irq with the new MSI-X vector */
+	entry->dev = dev;
+	entry->mask_base = (unsigned long)base;
+	/* Replace with MSI handler */
+	irq_handler_init(PCI_CAP_ID_MSIX, vector, 1);
+	/* Configure MSI-X capability structure */
+	msi_address_init(&address);
+	msi_data_init(&data, vector);
+	entry->msi_attrib.current_cpu = ((address.lo_address.u.dest_id >>
+				MSI_TARGET_CPU_SHIFT) & MSI_TARGET_CPU_MASK);
+	writel(address.lo_address.value, base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+	writel(address.hi_address, base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+	writel(*(u32*)&data, base + PCI_MSIX_ENTRY_DATA_OFFSET);
+	/* Zero-initialize the remaining entries (1 up to dev_msi_cap - 1) */
+	for (pos = 1; pos < dev_msi_cap; pos++) {
+		writel(0, base + pos * PCI_MSIX_ENTRY_SIZE +
+			PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+		writel(0, base + pos * PCI_MSIX_ENTRY_SIZE +
+			PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+		writel(0, base + pos * PCI_MSIX_ENTRY_SIZE +
+			PCI_MSIX_ENTRY_DATA_OFFSET);
+	}
+	attach_msi_entry(entry, vector);
+	/* Set MSI enabled bits	 */
+	enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
+
+	return 0;
+
+free_entry:
+	kmem_cache_free(msi_cachep, entry);
+free_iomap:
+	iounmap(base);
+free_region:
+	release_mem_region(phys_addr, dev_msi_cap * PCI_MSIX_ENTRY_SIZE);
+
+	return ((vector < 0) ? -EBUSY : -ENOMEM);
+}
+
+/**
+ * pci_enable_msi - configure device's MSI(X) capability structure
+ * @dev: pointer to the pci_dev data structure of MSI(X) device function
+ *
+ * Setup the MSI/MSI-X capability structure of the device function with
+ * a single MSI(X) vector when its software driver calls to request
+ * MSI(X) mode enabled on its hardware device function. A return of zero
+ * indicates the successful setup of entry zero with the new MSI(X)
+ * vector, or non-zero otherwise.
+ **/
+int pci_enable_msi(struct pci_dev* dev)
+{
+	int status = -EINVAL;
+
+	if (!pci_msi_enable || !dev)
+ 		return status;
+
+	if (msi_init() < 0)
+		return -ENOMEM;
+
+	if ((status = msix_capability_init(dev)) == -EINVAL)
+		status = msi_capability_init(dev);
+	if (!status)
+		nr_reserved_vectors--;
+
+	return status;
+}
+
+static int msi_free_vector(struct pci_dev* dev, int vector);
+static void pci_disable_msi(unsigned int vector)
+{
+	int head, tail, type, default_vector;
+	struct msi_desc *entry;
+	struct pci_dev *dev;
+	unsigned long flags;
+
+	spin_lock_irqsave(&msi_lock, flags);
+	entry = msi_desc[vector];
+	if (!entry || !entry->dev) {
+		spin_unlock_irqrestore(&msi_lock, flags);
+		return;
+	}
+	dev = entry->dev;
+	type = entry->msi_attrib.type;
+	head = entry->link.head;
+	tail = entry->link.tail;
+	default_vector = entry->msi_attrib.default_vector;
+	spin_unlock_irqrestore(&msi_lock, flags);
+
+	disable_msi_mode(dev, pci_find_capability(dev, type), type);
+	/* Restore dev->irq to its default pin-assertion vector */
+	dev->irq = default_vector;
+	if (type == PCI_CAP_ID_MSIX && head != tail) {
+		/* Bad driver, which did not call msi_free_vectors before
+		   exit. We must do a cleanup here. */
+		while (1) {
+			spin_lock_irqsave(&msi_lock, flags);
+			entry = msi_desc[vector];
+			head = entry->link.head;
+			tail = entry->link.tail;
+			spin_unlock_irqrestore(&msi_lock, flags);
+			if (tail == head)
+				break;
+			if (msi_free_vector(dev, entry->link.tail))
+				break;
+		}
+	}
+}
+
+static int msi_alloc_vector(struct pci_dev* dev, int head)
+{
+	struct msi_desc *entry;
+	struct msg_address address;
+	struct msg_data data;
+	int i, offset, pos, dev_msi_cap, vector;
+	u32 low_address, control;
+	unsigned long base = 0L;
+	unsigned long flags;
+
+	spin_lock_irqsave(&msi_lock, flags);
+	entry = msi_desc[dev->irq];
+	if (!entry) {
+		spin_unlock_irqrestore(&msi_lock, flags);
+		return -EINVAL;
+	}
+	base = entry->mask_base;
+	spin_unlock_irqrestore(&msi_lock, flags);
+
+   	pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+	dev->bus->ops->read(dev->bus, dev->devfn, msi_control_reg(pos),
+		2, &control);
+	dev_msi_cap = multi_msix_capable(control);
+	for (i = 1; i < dev_msi_cap; i++) {
+		if (!(low_address = readl(base + i * PCI_MSIX_ENTRY_SIZE)))
+			 break;
+	}
+	if (i >= dev_msi_cap)
+		return -EINVAL;
+
+	/* MSI Entry Initialization */
+	if (!(entry = alloc_msi_entry()))
+		return -ENOMEM;
+
+	if ((vector = get_new_vector()) < 0) {
+		kmem_cache_free(msi_cachep, entry);
+		return vector;
+	}
+	entry->msi_attrib.type = PCI_CAP_ID_MSIX;
+	entry->msi_attrib.entry_nr = i;
+	entry->msi_attrib.maskbit = 1;
+	entry->dev = dev;
+	entry->link.head = head;
+	entry->mask_base = base;
+	irq_handler_init(PCI_CAP_ID_MSIX, vector, 1);
+	/* Configure MSI-X capability structure */
+	msi_address_init(&address);
+	msi_data_init(&data, vector);
+	entry->msi_attrib.current_cpu = ((address.lo_address.u.dest_id >>
+				MSI_TARGET_CPU_SHIFT) & MSI_TARGET_CPU_MASK);
+	offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+	writel(address.lo_address.value, base + offset +
+		PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+	writel(address.hi_address, base + offset +
+		PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+	writel(*(u32*)&data, base + offset + PCI_MSIX_ENTRY_DATA_OFFSET);
+	writel(1, base + offset + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+	attach_msi_entry(entry, vector);
+
+	return vector;
+}
+
+static int msi_free_vector(struct pci_dev* dev, int vector)
+{
+	struct msi_desc *entry;
+	int entry_nr, type;
+	unsigned long base = 0L;
+	unsigned long flags;
+
+	spin_lock_irqsave(&msi_lock, flags);
+	entry = msi_desc[vector];
+	if (!entry || entry->dev != dev) {
+		spin_unlock_irqrestore(&msi_lock, flags);
+		return -EINVAL;
+	}
+	type = entry->msi_attrib.type;
+	entry_nr = entry->msi_attrib.entry_nr;
+	base = entry->mask_base;
+	if (entry->link.tail != entry->link.head) {
+		msi_desc[entry->link.head]->link.tail = entry->link.tail;
+		if (entry->link.tail)
+			msi_desc[entry->link.tail]->link.head = entry->link.head;
+	}
+	entry->dev = NULL;
+	vector_irq[vector] = 0;
+	nr_released_vectors++;
+	msi_desc[vector] = NULL;
+	spin_unlock_irqrestore(&msi_lock, flags);
+
+	kmem_cache_free(msi_cachep, entry);
+	if (type == PCI_CAP_ID_MSIX) {
+		int offset;
+
+		offset = entry_nr * PCI_MSIX_ENTRY_SIZE;
+		writel(1, base + offset + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+		writel(0, base + offset + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+	}
+
+	return 0;
+}
+
+/**
+ * msi_alloc_vectors - allocate additional MSI-X vectors
+ * @dev: pointer to the pci_dev data structure of MSI-X device function
+ * @vector: pointer to an array of new allocated MSI-X vectors
+ * @nvec: number of MSI-X vectors requested for allocation by device driver
+ *
+ * Allocate additional MSI-X vectors requested by the device driver. A
+ * return of zero indicates the successful setup of the MSI-X capability
+ * structure with the newly allocated MSI-X vectors, or non-zero otherwise.
+ **/
+int msi_alloc_vectors(struct pci_dev *dev, int *vector, int nvec)
+{
+	struct msi_desc *entry;
+	int i, head, pos, vec, free_vectors, alloc_vectors;
+	u32 control;
+	unsigned long flags;
+
+	if (!pci_msi_enable || !dev)
+		return -EINVAL;
+
+	if (!(pos = pci_find_capability(dev, PCI_CAP_ID_MSIX)))
+		return -EINVAL;
+
+	dev->bus->ops->read(dev->bus, dev->devfn, msi_control_reg(pos),
+		2, &control);
+	if (nvec > multi_msix_capable(control))
+		return -EINVAL;
+
+	spin_lock_irqsave(&msi_lock, flags);
+	entry = msi_desc[dev->irq];
+	if (!entry || entry->dev != dev ||		/* legal call */
+	   entry->msi_attrib.type != PCI_CAP_ID_MSIX || /* must be MSI-X */
+	   entry->link.head != entry->link.tail) {	/* already multi */
+		spin_unlock_irqrestore(&msi_lock, flags);
+		return -EINVAL;
+	}
+	/*
+	 * msi_lock is taken to ensure that enough vector resources are
+	 * still available before granting the request.
+	 */
+	free_vectors = pci_vector_resources();
+	/* Ensure that each MSI/MSI-X device has one vector reserved by
+	   default, to prevent any single MSI-X driver from taking all
+	   available resources */
+	free_vectors -= nr_reserved_vectors;
+	/* Divide the remaining free vectors evenly among the MSI-X devices */
+	if (nr_msix_devices > 0)
+		free_vectors /= nr_msix_devices;
+	spin_unlock_irqrestore(&msi_lock, flags);
+
+	if (nvec > free_vectors)
+		return -EBUSY;
+
+	alloc_vectors = 0;
+	head = dev->irq;
+	for (i = 0; i < nvec; i++) {
+		if ((vec = msi_alloc_vector(dev, head)) < 0)
+			break;
+		vector[i] = vec;
+		head = vec;
+		alloc_vectors++;
+	}
+	if (alloc_vectors != nvec) {
+		for (i = 0; i < alloc_vectors; i++) {
+			vec = vector[i];
+			msi_free_vector(dev, vec);
+		}
+		spin_lock_irqsave(&msi_lock, flags);
+		msi_desc[dev->irq]->link.tail = msi_desc[dev->irq]->link.head;
+		spin_unlock_irqrestore(&msi_lock, flags);
+		return -EBUSY;
+	}
+	if (nr_msix_devices > 0)
+		nr_msix_devices--;
+
+	return 0;
+}
+
+/**
+ * msi_free_vectors - reclaim MSI-X vectors to unused state
+ * @dev: pointer to the pci_dev data structure of MSI-X device function
+ * @vector: pointer to an array of released MSI-X vectors
+ * @nvec: number of MSI-X vectors requested for release by device driver
+ *
+ * Reclaim the MSI-X vectors released by the device driver to the unused
+ * state, so that they may be reassigned later on. A return of zero
+ * indicates success; any other value indicates failure. The device
+ * driver should call this before calling free_irq().
+ **/
+int msi_free_vectors(struct pci_dev *dev, int *vector, int nvec)
+{
+	struct msi_desc *entry;
+	int i;
+	unsigned long flags;
+
+	if (!pci_msi_enable)
+		return -EINVAL;
+
+	spin_lock_irqsave(&msi_lock, flags);
+	entry = msi_desc[dev->irq];
+	if (!entry || entry->dev != dev ||
+	   	entry->msi_attrib.type != PCI_CAP_ID_MSIX ||
+		entry->link.head == entry->link.tail) {	/* Nothing to free */
+		spin_unlock_irqrestore(&msi_lock, flags);
+		return -EINVAL;
+	}
+	spin_unlock_irqrestore(&msi_lock, flags);
+
+	for (i = 0; i < nvec; i++) {
+		if (vector[i] == dev->irq)
+			continue;	/* don't free entry 0, even if the driver passes it in */
+		msi_free_vector(dev, vector[i]);
+	}
+
+	return 0;
+}
+
+/**
+ * msi_remove_pci_irq_vectors - reclaim MSI(X) vectors to unused state
+ * @dev: pointer to the pci_dev data structure of MSI(X) device function
+ *
+ * Called during a hotplug remove, when the device function is being
+ * hot-removed. All MSI/MSI-X vectors previously assigned to this
+ * device function are reclaimed to the unused state, so that they may
+ * be reassigned later on.
+ **/
+void msi_remove_pci_irq_vectors(struct pci_dev *dev)
+{
+	struct msi_desc *entry;
+	int type;
+	unsigned long base = 0L;
+	unsigned long flags;
+
+	if (!pci_msi_enable)
+		return;
+
+	spin_lock_irqsave(&msi_lock, flags);
+	entry = msi_desc[dev->irq];
+	if (!entry || entry->dev != dev) {
+		spin_unlock_irqrestore(&msi_lock, flags);
+		return;
+	}
+	type = entry->msi_attrib.type;
+	/* Save mask_base now: msi_free_vector() below frees the entry. */
+	base = entry->mask_base;
+	spin_unlock_irqrestore(&msi_lock, flags);
+
+	msi_free_vector(dev, dev->irq);
+	if (type == PCI_CAP_ID_MSIX) {
+		int i, pos, dev_msi_cap;
+		u32 phys_addr, table_offset;
+		u32 control;
+		u8 bir;
+
+		pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+		dev->bus->ops->read(dev->bus, dev->devfn, msi_control_reg(pos),
+			2, &control);
+		dev_msi_cap = multi_msix_capable(control);
+		dev->bus->ops->read(dev->bus, dev->devfn,
+			msix_table_offset_reg(pos), 4, &table_offset);
+		bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
+		phys_addr = pci_resource_start (dev, bir);
+		phys_addr += (u32)(table_offset & ~PCI_MSIX_FLAGS_BIRMASK);
+		for (i = FIRST_DEVICE_VECTOR; i < NR_IRQS; i++) {
+			spin_lock_irqsave(&msi_lock, flags);
+			if (!msi_desc[i] || msi_desc[i]->dev != dev) {
+				spin_unlock_irqrestore(&msi_lock, flags);
+				continue;
+			}
+			spin_unlock_irqrestore(&msi_lock, flags);
+			msi_free_vector(dev, i);
+		}
+		writel(1, base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+		iounmap((void*)base);
+		release_mem_region(phys_addr, dev_msi_cap * PCI_MSIX_ENTRY_SIZE);
+	}
+	nr_reserved_vectors++;
+}
+
+EXPORT_SYMBOL(pci_enable_msi);
+EXPORT_SYMBOL(msi_alloc_vectors);
+EXPORT_SYMBOL(msi_free_vectors);
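
For reference, below is a minimal sketch of the driver-side call sequence
these exports imply. The foo_* names are made up; the point is the
ordering: pci_enable_msi() first (dev->irq then carries vector 0),
msi_alloc_vectors() for the extra MSI-X messages, and msi_free_vectors()
before free_irq() on teardown, as the kernel-doc above requires.

	#include <linux/pci.h>
	#include <linux/interrupt.h>

	/* Hypothetical driver fragment -- foo_intr and "foo" are made up. */
	static irqreturn_t foo_intr(int irq, void *dev_id, struct pt_regs *regs)
	{
		return IRQ_HANDLED;
	}

	static int foo_request_vectors(struct pci_dev *dev, int *vec, int nvec)
	{
		int i, err;

		if (pci_enable_msi(dev))	/* dev->irq now holds vector 0 */
			return -EIO;
		err = msi_alloc_vectors(dev, vec, nvec);
		if (err)
			return err;	/* driver may still run with dev->irq alone */
		for (i = 0; i < nvec; i++) {
			err = request_irq(vec[i], foo_intr, 0, "foo", dev);
			if (err)
				goto undo;
		}
		return 0;
	undo:
		msi_free_vectors(dev, vec, nvec);	/* before free_irq() */
		while (--i >= 0)
			free_irq(vec[i], dev);
		return err;
	}
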
diff -puN drivers/pci/probe.c~ia32-MSI-support drivers/pci/probe.c
--- 25/drivers/pci/probe.c~ia32-MSI-support	Fri Oct  3 15:46:21 2003
+++ 25-akpm/drivers/pci/probe.c	Fri Oct  3 15:46:21 2003
@@ -552,6 +552,7 @@ int __devinit pci_scan_slot(struct pci_b
 		struct pci_dev *dev;
 
 		dev = pci_scan_device(bus, devfn);
+		pci_scan_msi_device(dev);
 		if (func == 0) {
 			if (!dev)
 				break;
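
Note that pci_scan_device() can return NULL for an empty slot, and the
pci_scan_msi_device() call above runs before the NULL check, so the msi.c
implementation is expected to bail out on a NULL argument, roughly as
sketched here (the real body is in drivers/pci/msi.c above):

	void pci_scan_msi_device(struct pci_dev *dev)
	{
		if (!dev)
			return;
		/* ... record the MSI/MSI-X capability of this function ... */
	}
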
diff -puN drivers/pci/remove.c~ia32-MSI-support drivers/pci/remove.c
--- 25/drivers/pci/remove.c~ia32-MSI-support	Fri Oct  3 15:46:21 2003
+++ 25-akpm/drivers/pci/remove.c	Fri Oct  3 15:46:21 2003
@@ -14,6 +14,8 @@ static void pci_free_resources(struct pc
 {
 	int i;
 
+	msi_remove_pci_irq_vectors(dev);
+
 	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
 		struct resource *res = dev->resource + i;
 		if (res->parent)
diff -puN include/asm-i386/hw_irq.h~ia32-MSI-support include/asm-i386/hw_irq.h
--- 25/include/asm-i386/hw_irq.h~ia32-MSI-support	Fri Oct  3 15:46:21 2003
+++ 25-akpm/include/asm-i386/hw_irq.h	Fri Oct  3 15:46:21 2003
@@ -41,6 +41,7 @@ asmlinkage void apic_timer_interrupt(voi
 asmlinkage void error_interrupt(void);
 asmlinkage void spurious_interrupt(void);
 asmlinkage void thermal_interrupt(struct pt_regs);
+#define platform_legacy_irq(irq)	((irq) < 16)
 #endif
 
 void mask_irq(unsigned int irq);
diff -puN include/asm-i386/io_apic.h~ia32-MSI-support include/asm-i386/io_apic.h
--- 25/include/asm-i386/io_apic.h~ia32-MSI-support	Fri Oct  3 15:46:21 2003
+++ 25-akpm/include/asm-i386/io_apic.h	Fri Oct  3 15:46:21 2003
@@ -13,6 +13,46 @@
 
 #ifdef CONFIG_X86_IO_APIC
 
+#ifdef CONFIG_PCI_USE_VECTOR
+static inline int use_pci_vector(void)	{return 1;}
+static inline void disable_edge_ioapic_vector(unsigned int vector) { }
+static inline void mask_and_ack_level_ioapic_vector(unsigned int vector) { }
+static inline void end_edge_ioapic_vector (unsigned int vector) { }
+#define startup_level_ioapic	startup_level_ioapic_vector
+#define shutdown_level_ioapic	mask_IO_APIC_vector
+#define enable_level_ioapic	unmask_IO_APIC_vector
+#define disable_level_ioapic	mask_IO_APIC_vector
+#define mask_and_ack_level_ioapic mask_and_ack_level_ioapic_vector
+#define end_level_ioapic	end_level_ioapic_vector
+#define set_ioapic_affinity	set_ioapic_affinity_vector
+
+#define startup_edge_ioapic 	startup_edge_ioapic_vector
+#define shutdown_edge_ioapic 	disable_edge_ioapic_vector
+#define enable_edge_ioapic 	unmask_IO_APIC_vector
+#define disable_edge_ioapic 	disable_edge_ioapic_vector
+#define ack_edge_ioapic 	ack_edge_ioapic_vector
+#define end_edge_ioapic 	end_edge_ioapic_vector
+#else
+static inline int use_pci_vector(void)	{return 0;}
+static inline void disable_edge_ioapic_irq(unsigned int irq) { }
+static inline void mask_and_ack_level_ioapic_irq(unsigned int irq) { }
+static inline void end_edge_ioapic_irq (unsigned int irq) { }
+#define startup_level_ioapic	startup_level_ioapic_irq
+#define shutdown_level_ioapic	mask_IO_APIC_irq
+#define enable_level_ioapic	unmask_IO_APIC_irq
+#define disable_level_ioapic	mask_IO_APIC_irq
+#define mask_and_ack_level_ioapic mask_and_ack_level_ioapic_irq
+#define end_level_ioapic	end_level_ioapic_irq
+#define set_ioapic_affinity	set_ioapic_affinity_irq
+
+#define startup_edge_ioapic 	startup_edge_ioapic_irq
+#define shutdown_edge_ioapic 	disable_edge_ioapic_irq
+#define enable_edge_ioapic 	unmask_IO_APIC_irq
+#define disable_edge_ioapic 	disable_edge_ioapic_irq
+#define ack_edge_ioapic 	ack_edge_ioapic_irq
+#define end_edge_ioapic 	end_edge_ioapic_irq
+#endif
+
 #define APIC_MISMATCH_DEBUG
 
 #define IO_APIC_BASE(idx) \
diff -puN include/asm-i386/mach-default/irq_vectors.h~ia32-MSI-support include/asm-i386/mach-default/irq_vectors.h
--- 25/include/asm-i386/mach-default/irq_vectors.h~ia32-MSI-support	Fri Oct  3 15:46:21 2003
+++ 25-akpm/include/asm-i386/mach-default/irq_vectors.h	Fri Oct  3 15:46:21 2003
@@ -76,11 +76,21 @@
  * Since vectors 0x00-0x1f are used/reserved for the CPU,
  * the usable vector space is 0x20-0xff (224 vectors)
  */
+/*
+ * The maximum number of vectors supported by i386 processors
+ * is limited to 256. For processors other than i386, NR_VECTORS
+ * should be changed accordingly.
+ */
+#define NR_VECTORS 256
+#ifdef CONFIG_PCI_USE_VECTOR
+#define NR_IRQS FIRST_SYSTEM_VECTOR
+#else
 #ifdef CONFIG_X86_IO_APIC
 #define NR_IRQS 224
 #else
 #define NR_IRQS 16
 #endif
+#endif
 
 #define FPU_IRQ			13
 
diff -puN include/linux/pci.h~ia32-MSI-support include/linux/pci.h
--- 25/include/linux/pci.h~ia32-MSI-support	Fri Oct  3 15:46:21 2003
+++ 25-akpm/include/linux/pci.h	Fri Oct  3 15:46:21 2003
@@ -36,6 +36,7 @@
 #define  PCI_COMMAND_WAIT 	0x80	/* Enable address/data stepping */
 #define  PCI_COMMAND_SERR	0x100	/* Enable SERR */
 #define  PCI_COMMAND_FAST_BACK	0x200	/* Enable back-to-back writes */
+#define  PCI_COMMAND_INTX_DISABLE 0x400 /* INTx Emulation Disable */
 
 #define PCI_STATUS		0x06	/* 16 bits */
 #define  PCI_STATUS_CAP_LIST	0x10	/* Support Capability List */
@@ -198,6 +199,8 @@
 #define  PCI_CAP_ID_MSI		0x05	/* Message Signalled Interrupts */
 #define  PCI_CAP_ID_CHSWP	0x06	/* CompactPCI HotSwap */
 #define  PCI_CAP_ID_PCIX	0x07	/* PCI-X */
+#define  PCI_CAP_ID_EXP 	0x10	/* PCI Express */
+#define  PCI_CAP_ID_MSIX	0x11	/* MSI-X */
 #define PCI_CAP_LIST_NEXT	1	/* Next capability in the list */
 #define PCI_CAP_FLAGS		2	/* Capability defined flags (16 bits) */
 #define PCI_CAP_SIZEOF		4
@@ -275,11 +278,13 @@
 #define  PCI_MSI_FLAGS_QSIZE	0x70	/* Message queue size configured */
 #define  PCI_MSI_FLAGS_QMASK	0x0e	/* Maximum queue size available */
 #define  PCI_MSI_FLAGS_ENABLE	0x01	/* MSI feature enabled */
+#define  PCI_MSI_FLAGS_MASKBIT	0x100	/* Per-vector masking capable */
 #define PCI_MSI_RFU		3	/* Rest of capability flags */
 #define PCI_MSI_ADDRESS_LO	4	/* Lower 32 bits */
 #define PCI_MSI_ADDRESS_HI	8	/* Upper 32 bits (if PCI_MSI_FLAGS_64BIT set) */
 #define PCI_MSI_DATA_32		8	/* 16 bits of data for 32-bit devices */
 #define PCI_MSI_DATA_64		12	/* 16 bits of data for 64-bit devices */
+#define PCI_MSI_MASK_BIT	16	/* Mask bits register */
 
 /* CompactPCI Hotswap Register */
 
@@ -695,6 +700,18 @@ void pci_pool_free (struct pci_pool *poo
 extern struct pci_dev *isa_bridge;
 #endif
 
+#ifndef CONFIG_PCI_USE_VECTOR
+static inline void pci_scan_msi_device(struct pci_dev *dev) {}
+static inline int pci_enable_msi(struct pci_dev *dev) {return -1;}
+static inline void msi_remove_pci_irq_vectors(struct pci_dev *dev) {}
+#else
+extern void pci_scan_msi_device(struct pci_dev *dev);
+extern int pci_enable_msi(struct pci_dev *dev);
+extern void msi_remove_pci_irq_vectors(struct pci_dev *dev);
+extern int msi_alloc_vectors(struct pci_dev *dev, int *vector, int nvec);
+extern int msi_free_vectors(struct pci_dev *dev, int *vector, int nvec);
+#endif
+
 #endif /* CONFIG_PCI */
 
 /* Include architecture-dependent settings and functions */
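
Because the !CONFIG_PCI_USE_VECTOR branch above provides inline stubs, a
driver can call pci_enable_msi() unconditionally and fall back to the
INTx pin left in dev->irq when it fails. A hypothetical probe fragment
(foo_* names made up; foo_intr is the handler from the earlier sketch):

	static int foo_probe(struct pci_dev *dev, const struct pci_device_id *id)
	{
		if (pci_enable_device(dev))
			return -EIO;
		/*
		 * The inline stub returns -1 when CONFIG_PCI_USE_VECTOR is
		 * off; on success dev->irq has been rewritten to an MSI vector.
		 */
		if (pci_enable_msi(dev) == 0)
			printk(KERN_INFO "foo: MSI enabled, vector %d\n", dev->irq);
		else
			printk(KERN_INFO "foo: using INTx, irq %d\n", dev->irq);
		return request_irq(dev->irq, foo_intr, SA_SHIRQ, "foo", dev);
	}
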
diff -puN /dev/null include/linux/pci_msi.h
--- /dev/null	Thu Apr 11 07:25:15 2002
+++ 25-akpm/include/linux/pci_msi.h	Fri Oct  3 15:46:21 2003
@@ -0,0 +1,193 @@
+/*
+ *	include/linux/pci_msi.h
+ *
+ */
+
+#ifndef _ASM_PCI_MSI_H
+#define _ASM_PCI_MSI_H
+
+#include <linux/pci.h>
+
+#define MSI_AUTO -1
+#define NR_REPEATS	23
+#define NR_RESERVED_VECTORS 3 /* FIRST_DEVICE_VECTOR, FIRST_SYSTEM_VECTOR, 0x80 */
+
+/*
+ * Assume the maximum number of hot-plug slots supported by the system is
+ * about ten. The worst case is that each of these slots is hot-added with
+ * a device that has two MSI/MSI-X capable functions. To keep any single
+ * MSI-X driver from requesting all available vectors,
+ * NR_HP_RESERVED_VECTORS is defined below to ensure that at least one
+ * message is assigned to each detected MSI/MSI-X device function.
+ */
+#define NR_HP_RESERVED_VECTORS 	20
+
+extern int vector_irq[NR_IRQS];
+extern cpumask_t pending_irq_balance_cpumask[NR_IRQS];
+extern void (*interrupt[NR_IRQS])(void);
+
+#ifdef CONFIG_SMP
+#define set_msi_irq_affinity	set_msi_affinity
+#else
+#define set_msi_irq_affinity	NULL
+static inline void move_msi(int vector) {}
+#endif
+
+#ifndef CONFIG_X86_IO_APIC
+static inline int get_ioapic_vector(struct pci_dev *dev) { return -1;}
+static inline void restore_ioapic_irq_handler(int irq) {}
+#else
+extern void restore_ioapic_irq_handler(int irq);
+#endif
+
+/*
+ * MSI-X Address Register
+ */
+#define PCI_MSIX_FLAGS_QSIZE		0x7FF
+#define PCI_MSIX_FLAGS_ENABLE		(1 << 15)
+#define PCI_MSIX_FLAGS_BIRMASK		(7 << 0)
+#define PCI_MSIX_FLAGS_BITMASK		(1 << 0)
+
+#define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET	0
+#define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET	4
+#define PCI_MSIX_ENTRY_DATA_OFFSET		8
+#define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET	12
+#define PCI_MSIX_ENTRY_SIZE			16
+
+#define msi_control_reg(base)		(base + PCI_MSI_FLAGS)
+#define msi_lower_address_reg(base)	(base + PCI_MSI_ADDRESS_LO)
+#define msi_upper_address_reg(base)	(base + PCI_MSI_ADDRESS_HI)
+#define msi_data_reg(base, is64bit)	\
+	( (is64bit == 1) ? base+PCI_MSI_DATA_64 : base+PCI_MSI_DATA_32 )
+#define msi_mask_bits_reg(base, is64bit) \
+	( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4)
+#define msi_disable(control)		control &= ~PCI_MSI_FLAGS_ENABLE
+#define multi_msi_capable(control) \
+	(1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
+#define multi_msi_enable(control, num) \
+	control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE);
+#define is_64bit_address(control)	(control & PCI_MSI_FLAGS_64BIT)
+#define is_mask_bit_support(control)	(control & PCI_MSI_FLAGS_MASKBIT)
+#define msi_enable(control, num) multi_msi_enable(control, num); \
+	control |= PCI_MSI_FLAGS_ENABLE
+
+#define msix_control_reg		msi_control_reg
+#define msix_table_offset_reg(base)	(base + 0x04)
+#define msix_pba_offset_reg(base)	(base + 0x08)
+#define msix_enable(control)	 	control |= PCI_MSIX_FLAGS_ENABLE
+#define msix_disable(control)	 	control &= ~PCI_MSIX_FLAGS_ENABLE
+#define msix_table_size(control) 	((control & PCI_MSIX_FLAGS_QSIZE)+1)
+#define multi_msix_capable		msix_table_size
+#define msix_unmask(address)	 	(address & ~PCI_MSIX_FLAGS_BITMASK)
+#define msix_mask(address)		(address | PCI_MSIX_FLAGS_BITMASK)
+#define msix_is_pending(address) 	(address & PCI_MSIX_FLAGS_PENDMASK)
+
+extern char __dbg_str_buf[256];
+#define _DEFINE_DBG_BUFFER	char __dbg_str_buf[256];
+#define _DBG_K_TRACE_ENTRY	((unsigned int)0x00000001)
+#define _DBG_K_TRACE_EXIT	((unsigned int)0x00000002)
+#define _DBG_K_INFO		((unsigned int)0x00000004)
+#define _DBG_K_ERROR		((unsigned int)0x00000008)
+#define _DBG_K_TRACE	(_DBG_K_TRACE_ENTRY | _DBG_K_TRACE_EXIT)
+
+#define _DEBUG_LEVEL	(_DBG_K_INFO | _DBG_K_ERROR | _DBG_K_TRACE)
+#define _DBG_PRINT( dbg_flags, args... )		\
+do {							\
+	if ( _DEBUG_LEVEL & (dbg_flags) )		\
+	{						\
+		int len;				\
+		len = sprintf(__dbg_str_buf, "%s:%d: %s ", \
+			__FILE__, __LINE__, __FUNCTION__ ); \
+		sprintf(__dbg_str_buf + len, args);	\
+		printk(KERN_INFO "%s\n", __dbg_str_buf); \
+	}						\
+} while (0)
+
+#define MSI_FUNCTION_TRACE_ENTER	\
+	_DBG_PRINT (_DBG_K_TRACE_ENTRY, "%s", "[Entry]");
+#define MSI_FUNCTION_TRACE_EXIT		\
+	_DBG_PRINT (_DBG_K_TRACE_EXIT, "%s", "[Exit]");
+
+/*
+ * MSI Defined Data Structures
+ */
+#define MSI_ADDRESS_HEADER		0xfee
+#define MSI_ADDRESS_HEADER_SHIFT	12
+#define MSI_ADDRESS_HEADER_MASK		0xfff000
+#define MSI_TARGET_CPU_SHIFT		4
+#define MSI_TARGET_CPU_MASK		0xff
+#define MSI_DELIVERY_MODE		0
+#define MSI_LEVEL_MODE			1	/* MSI is edge, so always assert */
+#define MSI_TRIGGER_MODE		0	/* MSI is edge sensitive */
+#define MSI_LOGICAL_MODE		1
+#define MSI_REDIRECTION_HINT_MODE	0
+#ifdef CONFIG_SMP
+#define MSI_TARGET_CPU			logical_smp_processor_id()
+#else
+#define MSI_TARGET_CPU			TARGET_CPUS
+#endif
+
+struct msg_data {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u32	vector		:  8;
+	__u32	delivery_mode	:  3;	/* 000b: FIXED | 001b: lowest prior */
+	__u32	reserved_1	:  3;
+	__u32	level		:  1;	/* 0: deassert | 1: assert */
+	__u32	trigger		:  1;	/* 0: edge | 1: level */
+	__u32	reserved_2	: 16;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u32	reserved_2	: 16;
+	__u32	trigger		:  1;	/* 0: edge | 1: level */
+	__u32	level		:  1;	/* 0: deassert | 1: assert */
+	__u32	reserved_1	:  3;
+	__u32	delivery_mode	:  3;	/* 000b: FIXED | 001b: lowest prior */
+	__u32	vector		:  8;
+#else
+#error "Bitfield endianness not defined! Check your byteorder.h"
+#endif
+} __attribute__ ((packed));
+
+struct msg_address {
+	union {
+		struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+			__u32	reserved_1	:  2;
+			__u32	dest_mode	:  1;	/* 0: physical | 1: logical */
+			__u32	redirection_hint:  1;	/* 0: dedicated CPU
+							   1: lowest priority */
+			__u32	reserved_2	:  4;
+			__u32	dest_id		: 24;	/* Destination ID */
+#elif defined(__BIG_ENDIAN_BITFIELD)
+			__u32	dest_id		: 24;	/* Destination ID */
+			__u32	reserved_2	:  4;
+			__u32	redirection_hint:  1;	/* 0: dedicated CPU
+							   1: lowest priority */
+			__u32	dest_mode	:  1;	/* 0: physical | 1: logical */
+			__u32	reserved_1	:  2;
+#else
+#error "Bitfield endianness not defined! Check your byteorder.h"
+#endif
+		} u;
+		__u32	value;
+	} lo_address;
+	__u32 	hi_address;
+} __attribute__ ((packed));
+
+struct msi_desc {
+	struct {
+		__u8	type	: 5; 	/* {0: unused, 5h:MSI, 11h:MSI-X} */
+		__u8	maskbit	: 1; 	/* mask-pending bit supported ?   */
+		__u8	reserved: 2; 	/* reserved			  */
+		__u8	entry_nr;    	/* specific enabled entry 	  */
+		__u8	default_vector; /* default pre-assigned vector    */
+		__u8	current_cpu; 	/* current destination cpu	  */
+	} msi_attrib;
+
+	struct {
+		__u16	head;
+		__u16	tail;
+	} link;
+
+	unsigned long mask_base;
+	struct pci_dev *dev;
+};
+
+#endif /* _ASM_PCI_MSI_H */

_
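
As a sanity check of the msg_address/msg_data layouts above, here is a
small standalone user-space program (plain C, not kernel code; the APIC
ID 5 and vector 0x31 are arbitrary example values) that composes the
architectural MSI address/data pair the same way the bitfields do:

	#include <stdio.h>

	#define MSI_ADDRESS_HEADER		0xfee
	#define MSI_ADDRESS_HEADER_SHIFT	12
	#define MSI_TARGET_CPU_SHIFT		4

	int main(void)
	{
		unsigned int apic_id = 5, vector = 0x31;
		unsigned int dest_id, lo_address, data;

		/* dest_id is the 24-bit field at bits 31:8 of the address */
		dest_id = (MSI_ADDRESS_HEADER << MSI_ADDRESS_HEADER_SHIFT)
			| (apic_id << MSI_TARGET_CPU_SHIFT);
		lo_address = dest_id << 8;	/* dest_mode/redirection_hint = 0 */
		data = vector;			/* fixed delivery, edge, assert */

		/* prints: address 0xfee05000 data 0x00000031 */
		printf("address 0x%08x data 0x%08x\n", lo_address, data);
		return 0;
	}

The 0xfee header thus lands in address bits 31:20 and the target APIC ID
in bits 19:12, which is why msi_alloc_vector() recovers the CPU with
(dest_id >> MSI_TARGET_CPU_SHIFT) & MSI_TARGET_CPU_MASK.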