diff -urN ./linux-2.6.18.1/Documentation/kernel-parameters.txt linux-2.6.18.1-cabi-20070529-RT_HRT/Documentation/kernel-parameters.txt --- ./linux-2.6.18.1/Documentation/kernel-parameters.txt 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/Documentation/kernel-parameters.txt 2007-05-19 23:58:35.000000000 +0900 @@ -1637,6 +1637,12 @@ time Show timing data prefixed to each printk message line + timeout_granularity= + [KNL] + Timeout granularity: process timer wheel timers every + timeout_granularity jiffies. Defaults to 1 (process + timers HZ times per second - most finegrained). + clocksource= [GENERIC_TIME] Override the default clocksource Override the default clocksource and use the clocksource with the name specified. diff -urN ./linux-2.6.18.1/Documentation/rt-mutex-design.txt linux-2.6.18.1-cabi-20070529-RT_HRT/Documentation/rt-mutex-design.txt --- ./linux-2.6.18.1/Documentation/rt-mutex-design.txt 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/Documentation/rt-mutex-design.txt 2007-05-19 23:58:35.000000000 +0900 @@ -333,11 +333,11 @@ unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C) { - unsigned long T = *A; - if (*A == *B) { - *A = *C; - } - return T; + unsigned long T = *A; + if (*A == *B) { + *A = *C; + } + return T; } #define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c) @@ -582,7 +582,7 @@ try_to_take_rt_mutex is used every time the task tries to grab a mutex in the slow path. The first thing that is done here is an atomic setting of the "Has Waiters" flag of the mutex's owner field. Yes, this could really -be false, because if the the mutex has no owner, there are no waiters and +be false, because if the mutex has no owner, there are no waiters and the current task also won't have any waiters. But we don't have the lock yet, so we assume we are going to be a waiter. The reason for this is to play nice for those architectures that do have CMPXCHG. By setting this flag @@ -735,7 +735,7 @@ in the slow path too. If a waiter of a mutex woke up because of a signal or timeout between the time the owner failed the fast path CMPXCHG check and the grabbing of the wait_lock, the mutex may not have any waiters, thus the -owner still needs to make this check. If there are no waiters than the mutex +owner still needs to make this check. If there are no waiters then the mutex owner field is set to NULL, the wait_lock is released and nothing more is needed. diff -urN ./linux-2.6.18.1/Makefile linux-2.6.18.1-cabi-20070529-RT_HRT/Makefile --- ./linux-2.6.18.1/Makefile 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/Makefile 2007-05-28 20:35:09.000000000 +0900 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 18 -EXTRAVERSION = .1 +EXTRAVERSION = -rt5-RT-100 NAME=Avast! A bilge rat! # *DOCUMENTATION* @@ -485,10 +485,14 @@ include $(srctree)/arch/$(ARCH)/Makefile -ifdef CONFIG_FRAME_POINTER -CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) +ifdef CONFIG_MCOUNT +CFLAGS += -pg -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) else -CFLAGS += -fomit-frame-pointer + ifdef CONFIG_FRAME_POINTER + CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) + else + CFLAGS += -fomit-frame-pointer + endif endif ifdef CONFIG_UNWIND_INFO @@ -522,7 +526,7 @@ # # INSTALL_PATH specifies where to place the updated kernel and system map # images. 
Default is /boot, but you can set it to other values -export INSTALL_PATH ?= /boot +export INSTALL_PATH = /boot # # INSTALL_MOD_PATH specifies a prefix to MODLIB for module directory @@ -598,6 +602,10 @@ # # System.map is generated to document addresses of all kernel symbols +ifeq ($(strip $(CONFIG_CABI)),y) +libgcc=$(shell $(CC) -print-libgcc-file-name) +endif + vmlinux-init := $(head-y) $(init-y) vmlinux-main := $(core-y) $(libs-y) $(drivers-y) $(net-y) vmlinux-all := $(vmlinux-init) $(vmlinux-main) diff -urN ./linux-2.6.18.1/arch/arm/Kconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/Kconfig --- ./linux-2.6.18.1/arch/arm/Kconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/Kconfig 2007-05-19 23:58:35.000000000 +0900 @@ -17,6 +17,10 @@ Europe. There is an ARM Linux project with a web page at . +config GENERIC_TIME + bool + default y + config MMU bool default y @@ -51,6 +55,18 @@ bool default y +config STACKTRACE_SUPPORT + bool + default y + +config LOCKDEP_SUPPORT + bool + default y + +config TRACE_IRQFLAGS_SUPPORT + bool + default y + config HARDIRQS_SW_RESEND bool default y @@ -344,6 +360,15 @@ source "arch/arm/mach-netx/Kconfig" +config IS_TICK_BASED + bool + depends on GENERIC_TIME + default y + help + This is used on platforms that have not added a clocksource to + support GENERIC_TIME. Platforms which have a clocksource + should set this to 'n' in their mach-*/Kconfig. + # Definitions to make life easier config ARCH_ACORN bool @@ -419,6 +444,8 @@ menu "Kernel Features" +source "kernel/time/Kconfig" + config SMP bool "Symmetric Multi-Processing (EXPERIMENTAL)" depends on EXPERIMENTAL && REALVIEW_MPCORE @@ -463,38 +490,7 @@ accounting to be spread across the timer interval, preventing a "thundering herd" at every timer tick. -config PREEMPT - bool "Preemptible Kernel (EXPERIMENTAL)" - depends on EXPERIMENTAL - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. - -config NO_IDLE_HZ - bool "Dynamic tick timer" - help - Select this option if you want to disable continuous timer ticks - and have them programmed to occur as required. This option saves - power as the system can remain in idle state for longer. - - By default dynamic tick is disabled during the boot, and can be - manually enabled with: - - echo 1 > /sys/devices/system/timer/timer0/dyn_tick - - Alternatively, if you want dynamic tick automatically enabled - during boot, pass "dyntick=enable" via the kernel command string. - - Please note that dynamic tick may affect the accuracy of - timekeeping on some platforms depending on the implementation. - Currently at least OMAP, PXA2xx and SA11x0 platforms are known - to have accurate timekeeping with dynamic tick. 
+source kernel/Kconfig.preempt config HZ int diff -urN ./linux-2.6.18.1/arch/arm/boot/compressed/head.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/boot/compressed/head.S --- ./linux-2.6.18.1/arch/arm/boot/compressed/head.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/boot/compressed/head.S 2007-05-19 23:58:35.000000000 +0900 @@ -231,7 +231,8 @@ */ cmp r4, r2 bhs wont_overwrite - add r0, r4, #4096*1024 @ 4MB largest kernel size + sub r3, sp, r5 @ > compressed kernel image + add r0, r4, r3, lsl #2 @ allow for 4x expansion cmp r0, r5 bls wont_overwrite @@ -822,6 +823,19 @@ mov pc, r10 #endif +#ifdef CONFIG_MCOUNT +/* CONFIG_MCOUNT causes boot header to be built with -pg requiring this + * trampoline + */ + .text + .align 0 + .type mcount %function + .global mcount +mcount: + mov pc, lr @ just return +#endif + + reloc_end: .align diff -urN ./linux-2.6.18.1/arch/arm/common/time-acorn.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/common/time-acorn.c --- ./linux-2.6.18.1/arch/arm/common/time-acorn.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/common/time-acorn.c 2007-05-19 23:58:35.000000000 +0900 @@ -77,7 +77,7 @@ static struct irqaction ioc_timer_irq = { .name = "timer", - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NODELAY, .handler = ioc_timer_interrupt }; diff -urN ./linux-2.6.18.1/arch/arm/kernel/dma.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/dma.c --- ./linux-2.6.18.1/arch/arm/kernel/dma.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/dma.c 2007-05-19 23:58:35.000000000 +0900 @@ -20,7 +20,7 @@ #include -DEFINE_SPINLOCK(dma_spin_lock); +DEFINE_RAW_SPINLOCK(dma_spin_lock); EXPORT_SYMBOL(dma_spin_lock); static dma_t dma_chan[MAX_DMA_CHANNELS]; diff -urN ./linux-2.6.18.1/arch/arm/kernel/entry-armv.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/entry-armv.S --- ./linux-2.6.18.1/arch/arm/kernel/entry-armv.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/entry-armv.S 2007-05-19 23:58:35.000000000 +0900 @@ -191,6 +191,9 @@ __irq_svc: svc_entry +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_off +#endif #ifdef CONFIG_PREEMPT get_thread_info tsk ldr r8, [tsk, #TI_PREEMPT] @ get preempt count @@ -201,7 +204,7 @@ irq_handler #ifdef CONFIG_PREEMPT ldr r0, [tsk, #TI_FLAGS] @ get flags - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED blne svc_preempt preempt_return: ldr r0, [tsk, #TI_PREEMPT] @ read preempt value @@ -211,6 +214,10 @@ #endif ldr r0, [sp, #S_PSR] @ irqs are already disabled msr spsr_cxsf, r0 +#ifdef CONFIG_TRACE_IRQFLAGS + tst r0, #PSR_I_BIT + bleq trace_hardirqs_on +#endif ldmia sp, {r0 - pc}^ @ load r0 - pc, cpsr .ltorg @@ -228,7 +235,7 @@ str r7, [tsk, #TI_PREEMPT] @ expects preempt_count == 0 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED beq preempt_return @ go again b 1b #endif @@ -398,6 +405,9 @@ __irq_usr: usr_entry +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_off +#endif get_thread_info tsk #ifdef CONFIG_PREEMPT ldr r8, [tsk, #TI_PREEMPT] @ get preempt count @@ -412,6 +422,9 @@ teq r0, r7 strne r0, [r0, -r0] #endif +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_on +#endif mov why, #0 b ret_to_user diff -urN ./linux-2.6.18.1/arch/arm/kernel/entry-common.S 
linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/entry-common.S --- ./linux-2.6.18.1/arch/arm/kernel/entry-common.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/entry-common.S 2007-05-19 23:58:35.000000000 +0900 @@ -3,6 +3,8 @@ * * Copyright (C) 2000 Russell King * + * LATENCY_TRACE/mcount support (C) 2005 Timesys john.cooper@timesys.com + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -40,7 +42,7 @@ fast_work_pending: str r0, [sp, #S_R0+S_OFF]! @ returned r0 work_pending: - tst r1, #_TIF_NEED_RESCHED + tst r1, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED bne work_resched tst r1, #_TIF_NOTIFY_RESUME | _TIF_SIGPENDING beq no_work_pending @@ -50,7 +52,8 @@ b ret_slow_syscall @ Check work again work_resched: - bl schedule + bl __schedule + /* * "slow" syscall return path. "why" tells us if this was a real syscall. */ @@ -387,6 +390,112 @@ #include "calls.S" #undef ABI #undef OBSOLETE +#endif + +#ifdef CONFIG_FRAME_POINTER + +#ifdef CONFIG_MCOUNT +/* + * At the point where we are in mcount() we maintain the + * frame of the prologue code and keep the call to mcount() + * out of the stack frame list: + + saved pc <---\ caller of instrumented routine + saved lr | + ip/prev_sp | + fp -----^ | + : | + | + -> saved pc | instrumented routine + | saved lr | + | ip/prev_sp | + | fp ---------/ + | : + | + | mcount + | saved pc + | saved lr + | ip/prev sp + -- fp + r3 + r2 + r1 + sp-> r0 + : + */ + + .text + .align 0 + .type mcount %function + .global mcount + +/* gcc -pg generated FUNCTION_PROLOGUE references mcount() + * and has already created the stack frame invocation for + * the routine we have been called to instrument. We create + * a complete frame nevertheless, as we want to use the same + * call to mcount() from c code. + */ +mcount: + + ldr ip, =mcount_enabled @ leave early, if disabled + ldr ip, [ip] + cmp ip, #0 + moveq pc, lr + + mov ip, sp + stmdb sp!, {r0 - r3, fp, ip, lr, pc} @ create stack frame + + ldr r1, [fp, #-4] @ get lr (the return address + @ of the caller of the + @ instrumented function) + mov r0, lr @ get lr - (the return address + @ of the instrumented function) + + sub fp, ip, #4 @ point fp at this frame + + bl __trace +1: + ldmdb fp, {r0 - r3, fp, sp, pc} @ pop entry frame and return + +#endif + +/* ARM replacement for unsupported gcc __builtin_return_address(n) + * where 0 < n. n == 0 is supported here as well. + * + * Walk up the stack frame until the desired frame is found or a NULL + * fp is encountered, return NULL in the latter case. + * + * Note: it is possible under code optimization for the stack invocation + * of an ancestor function (level N) to be removed before calling a + * descendant function (level N+1). No easy means is available to deduce + * this scenario with the result being [for example] caller_addr(0) when + * called from level N+1 returning level N-1 rather than the expected + * level N. This optimization issue appears isolated to the case of + * a call to a level N+1 routine made at the tail end of a level N + * routine -- the level N frame is deleted and a simple branch is made + * to the level N+1 routine. 
+ */ + + .text + .align 0 + .type arm_return_addr %function + .global arm_return_addr + +arm_return_addr: + mov ip, r0 + mov r0, fp +3: + cmp r0, #0 + beq 1f @ frame list hit end, bail + cmp ip, #0 + beq 2f @ reached desired frame + ldr r0, [r0, #-12] @ else continue, get next fp + sub ip, ip, #1 + b 3b +2: + ldr r0, [r0, #-4] @ get target return address +1: + mov pc, lr #endif diff -urN ./linux-2.6.18.1/arch/arm/kernel/fiq.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/fiq.c --- ./linux-2.6.18.1/arch/arm/kernel/fiq.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/fiq.c 2007-05-19 23:58:35.000000000 +0900 @@ -89,7 +89,7 @@ * disable irqs for the duration. Note - these functions are almost * entirely coded in assembly. */ -void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( @@ -107,7 +107,7 @@ : "r" (®s->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE)); } -void __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( diff -urN ./linux-2.6.18.1/arch/arm/kernel/head.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/head.S --- ./linux-2.6.18.1/arch/arm/kernel/head.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/head.S 2007-05-19 23:58:35.000000000 +0900 @@ -234,18 +234,19 @@ /* * Now setup the pagetables for our kernel direct - * mapped region. We round TEXTADDR down to the - * nearest megabyte boundary. It is assumed that - * the kernel fits within 4 contigous 1MB sections. + * mapped region. */ add r0, r4, #(TEXTADDR & 0xff000000) >> 18 @ start of kernel str r3, [r0, #(TEXTADDR & 0x00f00000) >> 18]! - add r3, r3, #1 << 20 - str r3, [r0, #4]! @ KERNEL + 1MB - add r3, r3, #1 << 20 - str r3, [r0, #4]! @ KERNEL + 2MB - add r3, r3, #1 << 20 - str r3, [r0, #4] @ KERNEL + 3MB + + ldr r6, =(_end - PAGE_OFFSET) + sub r6, r6, #1 @ r6 = number of sections + mov r6, r6, lsr #20 @ needed for kernel minus 1 + +1: add r3, r3, #1 << 20 + str r3, [r0, #4]! + subs r6, r6, #1 + bgt 1b /* * Then map first 1MB of ram in case it contains our boot params. diff -urN ./linux-2.6.18.1/arch/arm/kernel/irq.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/irq.c --- ./linux-2.6.18.1/arch/arm/kernel/irq.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/irq.c 2007-05-19 23:58:35.000000000 +0900 @@ -101,7 +101,7 @@ /* Handle bad interrupts */ static struct irq_desc bad_irq_desc = { .handle_irq = handle_bad_irq, - .lock = SPIN_LOCK_UNLOCKED + .lock = RAW_SPIN_LOCK_UNLOCKED(bad_irq_desc.lock) }; /* @@ -109,10 +109,12 @@ * come via this function. Instead, they should provide their * own 'handler' */ -asmlinkage void asm_do_IRQ(unsigned int irq, struct pt_regs *regs) +asmlinkage notrace void asm_do_IRQ(unsigned int irq, struct pt_regs *regs) { struct irqdesc *desc = irq_desc + irq; + trace_special(instruction_pointer(regs), irq, 0); + /* * Some hardware gives randomly wrong interrupts. Rather * than crashing, do something sensible. 
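
(The following C rendering of the frame walk performed by arm_return_addr
above is an illustrative sketch only, not part of the patch. The structure
mirrors the saved {fp, sp, lr, pc} quad described in the entry-common.S
comment, and matches struct stackframe in arch/arm/lib/stacktrace.c, added
further down in this patch; the function name frame_return_addr is invented
here. Like the assembly, and unlike walk_stackframe() in stacktrace.c, it
performs no bounds checking on the frame-pointer chain.)

	struct apcs_frame {
		unsigned long fp;	/* at fp-12: caller's frame pointer */
		unsigned long sp;	/* at fp-8:  ip/prev_sp */
		unsigned long lr;	/* at fp-4:  return address into caller */
		unsigned long pc;	/* at fp-0:  saved pc */
	};

	/* C equivalent of arm_return_addr: follow the APCS frame-pointer
	 * chain 'level' frames up and return the saved lr found there,
	 * or 0 if the chain ends first. */
	static unsigned long frame_return_addr(unsigned long fp,
					       unsigned int level)
	{
		while (fp) {
			struct apcs_frame *frame =
				(struct apcs_frame *)(fp - 12);

			if (level-- == 0)
				return frame->lr;
			fp = frame->fp;		/* walk one frame up */
		}
		return 0;	/* frame list hit end, as at label 1: above */
	}
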
diff -urN ./linux-2.6.18.1/arch/arm/kernel/process.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/process.c --- ./linux-2.6.18.1/arch/arm/kernel/process.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/process.c 2007-05-19 23:58:35.000000000 +0900 @@ -123,7 +123,7 @@ cpu_relax(); else { local_irq_disable(); - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { timer_dyn_reprogram(); arch_idle(); } @@ -154,12 +154,20 @@ if (!idle) idle = default_idle; leds_event(led_idle_start); - while (!need_resched()) - idle(); + + if (!need_resched() && !need_resched_delayed() && + !hrtimer_stop_sched_tick()) { + while (!need_resched() && !need_resched_delayed()) + idle(); + } + hrtimer_restart_sched_tick(); + leds_event(led_idle_end); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } diff -urN ./linux-2.6.18.1/arch/arm/kernel/semaphore.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/semaphore.c --- ./linux-2.6.18.1/arch/arm/kernel/semaphore.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/semaphore.c 2007-05-19 23:58:35.000000000 +0900 @@ -49,14 +49,16 @@ * we cannot lose wakeup events. */ -void __up(struct semaphore *sem) +fastcall void __attribute_used__ __compat_up(struct compat_semaphore *sem) { wake_up(&sem->wait); } +EXPORT_SYMBOL(__compat_up); + static DEFINE_SPINLOCK(semaphore_lock); -void __sched __down(struct semaphore * sem) +fastcall void __attribute_used__ __sched __compat_down(struct compat_semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -89,7 +91,9 @@ wake_up(&sem->wait); } -int __sched __down_interruptible(struct semaphore * sem) +EXPORT_SYMBOL(__compat_down); + +fastcall int __attribute_used__ __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -140,6 +144,8 @@ return retval; } +EXPORT_SYMBOL(__compat_down_interruptible); + /* * Trylock failed - make sure we correct for * having decremented the count. @@ -148,7 +154,7 @@ * single "cmpxchg" without failure cases, * but then it wouldn't work on a 386. */ -int __down_trylock(struct semaphore * sem) +fastcall int __attribute_used__ __compat_down_trylock(struct compat_semaphore * sem) { int sleepers; unsigned long flags; @@ -168,6 +174,15 @@ return 1; } +EXPORT_SYMBOL(__compat_down_trylock); + +fastcall int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} + +EXPORT_SYMBOL(compat_sem_is_locked); + /* * The semaphore operations have a special calling sequence that * allow us to do a simpler in-line version of them. 
These routines @@ -185,7 +200,7 @@ __down_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down \n\ + bl __compat_down \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ .align 5 \n\ @@ -193,7 +208,7 @@ __down_interruptible_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down_interruptible \n\ + bl __compat_down_interruptible \n\ mov ip, r0 \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ @@ -202,7 +217,7 @@ __down_trylock_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down_trylock \n\ + bl __compat_down_trylock \n\ mov ip, r0 \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ @@ -211,7 +226,7 @@ __up_wakeup: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __up \n\ + bl __compat_up \n\ ldmfd sp!, {r0 - r4, pc} \n\ "); diff -urN ./linux-2.6.18.1/arch/arm/kernel/signal.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/signal.c --- ./linux-2.6.18.1/arch/arm/kernel/signal.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/signal.c 2007-05-19 23:58:35.000000000 +0900 @@ -630,6 +630,14 @@ siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif + /* * We want the common case to go fast, which * is why we may in certain cases get here from diff -urN ./linux-2.6.18.1/arch/arm/kernel/smp.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/smp.c --- ./linux-2.6.18.1/arch/arm/kernel/smp.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/smp.c 2007-05-19 23:58:35.000000000 +0900 @@ -515,7 +515,7 @@ cpu_clear(cpu, data->unfinished); } -static DEFINE_SPINLOCK(stop_lock); +static DEFINE_RAW_SPINLOCK(stop_lock); /* * ipi_cpu_stop - handle IPI from smp_send_stop() diff -urN ./linux-2.6.18.1/arch/arm/kernel/time.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/time.c --- ./linux-2.6.18.1/arch/arm/kernel/time.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/time.c 2007-05-19 23:58:35.000000000 +0900 @@ -69,10 +69,12 @@ */ int (*set_rtc)(void); +#ifdef CONFIG_IS_TICK_BASED static unsigned long dummy_gettimeoffset(void) { return 0; } +#endif /* * Scheduler clock - returns current time in nanosec units. @@ -84,34 +86,10 @@ return (unsigned long long)jiffies * (1000000000 / HZ); } -static unsigned long next_rtc_update; - -/* - * If we have an externally synchronized linux clock, then update - * CMOS clock accordingly every ~11 minutes. set_rtc() has to be - * called as close as possible to 500 ms before the new second - * starts. - */ -static inline void do_set_rtc(void) +void sync_persistent_clock(struct timespec ts) { - if (!ntp_synced() || set_rtc == NULL) - return; - - if (next_rtc_update && - time_before((unsigned long)xtime.tv_sec, next_rtc_update)) - return; - - if (xtime.tv_nsec < 500000000 - ((unsigned) tick_nsec >> 1) && - xtime.tv_nsec >= 500000000 + ((unsigned) tick_nsec >> 1)) - return; - - if (set_rtc()) - /* - * rtc update failed. 
Try again in 60s - */ - next_rtc_update = xtime.tv_sec + 60; - else - next_rtc_update = xtime.tv_sec + 660; + if (set_rtc) + set_rtc(); } #ifdef CONFIG_LEDS @@ -230,68 +208,6 @@ #define do_leds() #endif -void do_gettimeofday(struct timeval *tv) -{ - unsigned long flags; - unsigned long seq; - unsigned long usec, sec, lost; - - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - usec = system_timer->offset(); - - lost = jiffies - wall_jiffies; - if (lost) - usec += lost * USECS_PER_JIFFY; - - sec = xtime.tv_sec; - usec += xtime.tv_nsec / 1000; - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - - /* usec may have gone up a lot: be safe */ - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - /* - * This is revolting. We need to set "xtime" correctly. However, the - * value in this location is the value at the most recent update of - * wall time. Discover what correction gettimeofday() would have - * done, and then undo it! - */ - nsec -= system_timer->offset() * NSEC_PER_USEC; - nsec -= (jiffies - wall_jiffies) * TICK_NSEC; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - /** * save_time_delta - Save the offset between system time and RTC time * @delta: pointer to timespec to store delta @@ -332,7 +248,6 @@ { profile_tick(CPU_PROFILING, regs); do_leds(); - do_set_rtc(); do_timer(regs); #ifndef CONFIG_SMP update_process_times(user_mode(regs)); @@ -500,8 +415,10 @@ void __init time_init(void) { +#ifdef CONFIG_IS_TICK_BASED if (system_timer->offset == NULL) system_timer->offset = dummy_gettimeoffset; +#endif system_timer->init(); #ifdef CONFIG_NO_IDLE_HZ diff -urN ./linux-2.6.18.1/arch/arm/kernel/traps.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/traps.c --- ./linux-2.6.18.1/arch/arm/kernel/traps.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/kernel/traps.c 2007-05-19 23:58:35.000000000 +0900 @@ -176,6 +176,7 @@ { #ifdef CONFIG_DEBUG_ERRORS __backtrace(); + print_traces(current); #endif } @@ -191,7 +192,7 @@ if (tsk != current) fp = thread_saved_fp(tsk); else - asm("mov%? %0, fp" : "=r" (fp)); + asm("mov %0, fp" : "=r" (fp) : : "cc"); c_backtrace(fp, 0x10); barrier(); @@ -216,7 +217,7 @@ } } -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); /* * This function is protected against re-entrancy. 
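
(A note on the DEFINE_RAW_SPINLOCK conversions that recur throughout this
patch: under PREEMPT_RT a plain spinlock_t is backed by a sleeping,
priority-inheriting rt-mutex, so locks taken in contexts that must not
schedule -- hard interrupt handlers, the die path, low-level DMA and GPIO
accesses -- are declared raw to retain true spinning semantics. In this
2.6.18-rt tree the spin_lock_*() macros dispatch on the lock type, which
is why the call sites need no change. The sketch below is illustrative
only and not part of the patch; hw_lock, data_lock and touch_hw_register
are invented names.)

	static DEFINE_RAW_SPINLOCK(hw_lock);	/* spins even on PREEMPT_RT */
	static DEFINE_SPINLOCK(data_lock);	/* may sleep on PREEMPT_RT */

	static void touch_hw_register(void)
	{
		unsigned long flags;

		/* Safe in any context: never schedules, and IRQs stay
		 * off for the duration of the critical section. */
		spin_lock_irqsave(&hw_lock, flags);
		/* ... program hardware registers ... */
		spin_unlock_irqrestore(&hw_lock, flags);
	}
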
@@ -252,7 +253,7 @@ } static LIST_HEAD(undef_hook); -static DEFINE_SPINLOCK(undef_lock); +static DEFINE_RAW_SPINLOCK(undef_lock); void register_undef_hook(struct undef_hook *hook) { diff -urN ./linux-2.6.18.1/arch/arm/lib/Makefile linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/lib/Makefile --- ./linux-2.6.18.1/arch/arm/lib/Makefile 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/lib/Makefile 2007-05-19 23:58:35.000000000 +0900 @@ -41,6 +41,7 @@ lib-$(CONFIG_ARCH_CLPS7500) += io-acorn.o lib-$(CONFIG_ARCH_L7200) += io-acorn.o lib-$(CONFIG_ARCH_SHARK) += io-shark.o +lib-$(CONFIG_STACKTRACE) += stacktrace.o $(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S $(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S diff -urN ./linux-2.6.18.1/arch/arm/lib/stacktrace.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/lib/stacktrace.c --- ./linux-2.6.18.1/arch/arm/lib/stacktrace.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/lib/stacktrace.c 2007-05-19 23:58:35.000000000 +0900 @@ -0,0 +1,77 @@ +#include +#include + +struct stackframe { + unsigned long fp; + unsigned long sp; + unsigned long lr; + unsigned long pc; +}; + +int walk_stackframe(unsigned long fp, unsigned long low, unsigned long high, + int (*fn)(struct stackframe *, void *), void *data) +{ + struct stackframe *frame; + + do { + /* + * Check current frame pointer is within bounds + */ + if ((fp - 12) < low || fp + 4 >= high) + break; + + frame = (struct stackframe *)(fp - 12); + + if (fn(frame, data)) + break; + + /* + * Update the low bound - the next frame must always + * be at a higher address than the current frame. + */ + low = fp + 4; + fp = frame->fp; + } while (fp); + + return 0; +} + +struct stack_trace_data { + struct stack_trace *trace; + unsigned int skip; +}; + +static int save_trace(struct stackframe *frame, void *d) +{ + struct stack_trace_data *data = d; + struct stack_trace *trace = data->trace; + + if (data->skip) { + data->skip--; + return 0; + } + + trace->entries[trace->nr_entries++] = frame->lr; + + return trace->nr_entries >= trace->max_entries; +} + +void save_stack_trace(struct stack_trace *trace, struct task_struct *task, + int all_contexts, unsigned int skip) +{ + struct stack_trace_data data; + unsigned long fp, base; + + data.trace = trace; + data.skip = skip; + + if (task) { + base = (unsigned long)task_stack_page(task); + fp = 0; + } else { + base = (unsigned long)task_stack_page(current); + asm("mov %0, fp" : "=r" (fp)); + } + + walk_stackframe(fp, base, base + THREAD_SIZE, save_trace, &data); +} diff -urN ./linux-2.6.18.1/arch/arm/mach-footbridge/netwinder-hw.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-footbridge/netwinder-hw.c --- ./linux-2.6.18.1/arch/arm/mach-footbridge/netwinder-hw.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-footbridge/netwinder-hw.c 2007-05-19 23:58:35.000000000 +0900 @@ -67,7 +67,7 @@ /* * This is a lock for accessing ports GP1_IO_BASE and GP2_IO_BASE */ -DEFINE_SPINLOCK(gpio_lock); +DEFINE_RAW_SPINLOCK(gpio_lock); static unsigned int current_gpio_op; static unsigned int current_gpio_io; diff -urN ./linux-2.6.18.1/arch/arm/mach-footbridge/netwinder-leds.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-footbridge/netwinder-leds.c --- ./linux-2.6.18.1/arch/arm/mach-footbridge/netwinder-leds.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-footbridge/netwinder-leds.c 2007-05-19 23:58:35.000000000 +0900 
@@ -32,7 +32,7 @@ static char hw_led_state; static DEFINE_SPINLOCK(leds_lock); -extern spinlock_t gpio_lock; +extern raw_spinlock_t gpio_lock; static void netwinder_leds_event(led_event_t evt) { diff -urN ./linux-2.6.18.1/arch/arm/mach-integrator/core.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-integrator/core.c --- ./linux-2.6.18.1/arch/arm/mach-integrator/core.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-integrator/core.c 2007-05-19 23:58:35.000000000 +0900 @@ -164,7 +164,7 @@ #define CM_CTRL IO_ADDRESS(INTEGRATOR_HDR_BASE) + INTEGRATOR_HDR_CTRL_OFFSET -static DEFINE_SPINLOCK(cm_lock); +static DEFINE_RAW_SPINLOCK(cm_lock); /** * cm_control - update the CM_CTRL register. diff -urN ./linux-2.6.18.1/arch/arm/mach-integrator/pci_v3.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-integrator/pci_v3.c --- ./linux-2.6.18.1/arch/arm/mach-integrator/pci_v3.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-integrator/pci_v3.c 2007-05-19 23:58:35.000000000 +0900 @@ -162,7 +162,7 @@ * 7:2 register number * */ -static DEFINE_SPINLOCK(v3_lock); +static DEFINE_RAW_SPINLOCK(v3_lock); #define PCI_BUS_NONMEM_START 0x00000000 #define PCI_BUS_NONMEM_SIZE SZ_256M diff -urN ./linux-2.6.18.1/arch/arm/mach-integrator/platsmp.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-integrator/platsmp.c --- ./linux-2.6.18.1/arch/arm/mach-integrator/platsmp.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-integrator/platsmp.c 2007-05-19 23:58:35.000000000 +0900 @@ -31,7 +31,7 @@ volatile int __cpuinitdata pen_release = -1; unsigned long __cpuinitdata phys_pen_release = 0; -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); void __cpuinit platform_secondary_init(unsigned int cpu) { diff -urN ./linux-2.6.18.1/arch/arm/mach-ixp4xx/Kconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-ixp4xx/Kconfig --- ./linux-2.6.18.1/arch/arm/mach-ixp4xx/Kconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-ixp4xx/Kconfig 2007-05-19 23:58:35.000000000 +0900 @@ -1,5 +1,9 @@ if ARCH_IXP4XX +config IS_TICK_BASED + bool + default n + config ARCH_SUPPORTS_BIG_ENDIAN bool default y diff -urN ./linux-2.6.18.1/arch/arm/mach-ixp4xx/common-pci.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-ixp4xx/common-pci.c --- ./linux-2.6.18.1/arch/arm/mach-ixp4xx/common-pci.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-ixp4xx/common-pci.c 2007-05-19 23:58:35.000000000 +0900 @@ -53,7 +53,7 @@ * these transactions are atomic or we will end up * with corrupt data on the bus or in a driver. 
*/ -static DEFINE_SPINLOCK(ixp4xx_pci_lock); +static DEFINE_RAW_SPINLOCK(ixp4xx_pci_lock); /* * Read from PCI config space diff -urN ./linux-2.6.18.1/arch/arm/mach-ixp4xx/common.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-ixp4xx/common.c --- ./linux-2.6.18.1/arch/arm/mach-ixp4xx/common.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-ixp4xx/common.c 2007-05-19 23:58:35.000000000 +0900 @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include #include @@ -38,6 +40,11 @@ #include #include +#ifdef CONFIG_HIGH_RES_TIMERS +static int __init ixp4xx_clockevent_init(void); +static struct clock_event clockevent_ixp4xx; +#endif + /************************************************************************* * IXP4xx chipset I/O mapping *************************************************************************/ @@ -253,25 +260,17 @@ static unsigned volatile last_jiffy_time; -#define CLOCK_TICKS_PER_USEC ((CLOCK_TICK_RATE + USEC_PER_SEC/2) / USEC_PER_SEC) - -/* IRQs are disabled before entering here from do_gettimeofday() */ -static unsigned long ixp4xx_gettimeoffset(void) -{ - u32 elapsed; - - elapsed = *IXP4XX_OSTS - last_jiffy_time; - - return elapsed / CLOCK_TICKS_PER_USEC; -} - static irqreturn_t ixp4xx_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { - write_seqlock(&xtime_lock); - /* Clear Pending Interrupt by writing '1' to it */ *IXP4XX_OSST = IXP4XX_OSST_TIMER_1_PEND; +#ifdef CONFIG_HIGH_RES_TIMERS + if (clockevent_ixp4xx.event_handler) + clockevent_ixp4xx.event_handler(regs); +#else + write_seqlock(&xtime_lock); + /* * Catch up with the real idea of time */ @@ -281,6 +280,7 @@ } write_sequnlock(&xtime_lock); +#endif return IRQ_HANDLED; } @@ -299,17 +299,18 @@ /* Setup the Timer counter value */ *IXP4XX_OSRT1 = (LATCH & ~IXP4XX_OST_RELOAD_MASK) | IXP4XX_OST_ENABLE; - /* Reset time-stamp counter */ - *IXP4XX_OSTS = 0; last_jiffy_time = 0; /* Connect the interrupt handler and enable the interrupt */ setup_irq(IRQ_IXP4XX_TIMER1, &ixp4xx_timer_irq); + +#ifdef CONFIG_HIGH_RES_TIMERS + ixp4xx_clockevent_init(); +#endif } struct sys_timer ixp4xx_timer = { .init = ixp4xx_timer_init, - .offset = ixp4xx_gettimeoffset, }; static struct resource ixp46x_i2c_resources[] = { @@ -365,3 +366,70 @@ ixp4xx_exp_bus_size >> 20); } +cycle_t ixp4xx_get_cycles(void) +{ + return *IXP4XX_OSTS; +} + +static struct clocksource clocksource_ixp4xx = { + .name = "OSTS", + .rating = 200, + .read = ixp4xx_get_cycles, + .mask = 0xFFFFFFFF, + .shift = 20, + .is_continuous = 1, +}; + +static int __init ixp4xx_clocksource_init(void) +{ + /* Reset time-stamp counter */ + *IXP4XX_OSTS = 0; + + clocksource_ixp4xx.mult = + clocksource_khz2mult(66660, clocksource_ixp4xx.shift); + clocksource_register(&clocksource_ixp4xx); + + return 0; +} +device_initcall(ixp4xx_clocksource_init); + +#ifdef CONFIG_HIGH_RES_TIMERS +static u32 clockevent_mode = 0; + +static void ixp4xx_set_next_event(unsigned long evt, + struct clock_event *unused) +{ + u32 oneshot = (clockevent_mode == CLOCK_EVT_ONESHOT) ? 
+ IXP4XX_OST_ONE_SHOT : 0; + + *IXP4XX_OSRT1 = (evt & ~IXP4XX_OST_RELOAD_MASK) | IXP4XX_OST_ENABLE | + oneshot; +} + +static void ixp4xx_set_mode(int mode, struct clock_event *evt) +{ + clockevent_mode = mode; +} + +static struct clock_event clockevent_ixp4xx = { + .name = "ixp4xx timer1", + .capabilities = CLOCK_CAP_NEXTEVT | CLOCK_CAP_TICK | + CLOCK_CAP_UPDATE | CLOCK_CAP_PROFILE, + .shift = 32, + .set_mode = ixp4xx_set_mode, + .set_next_event = ixp4xx_set_next_event, +}; + +static int __init ixp4xx_clockevent_init(void) +{ + clockevent_ixp4xx.mult = div_sc(FREQ, NSEC_PER_SEC, + clockevent_ixp4xx.shift); + clockevent_ixp4xx.max_delta_ns = + clockevent_delta2ns(0xfffffffe, &clockevent_ixp4xx); + clockevent_ixp4xx.min_delta_ns = + clockevent_delta2ns(0xf, &clockevent_ixp4xx); + register_local_clockevent(&clockevent_ixp4xx); + + return 0; +} +#endif diff -urN ./linux-2.6.18.1/arch/arm/mach-omap1/pm.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-omap1/pm.c --- ./linux-2.6.18.1/arch/arm/mach-omap1/pm.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-omap1/pm.c 2007-05-19 23:58:35.000000000 +0900 @@ -120,7 +120,7 @@ local_irq_disable(); local_fiq_disable(); - if (need_resched()) { + if (need_resched() || need_resched_delayed()) { local_fiq_enable(); local_irq_enable(); return; diff -urN ./linux-2.6.18.1/arch/arm/mach-omap2/pm.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-omap2/pm.c --- ./linux-2.6.18.1/arch/arm/mach-omap2/pm.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-omap2/pm.c 2007-05-19 23:58:35.000000000 +0900 @@ -53,7 +53,7 @@ { local_irq_disable(); local_fiq_disable(); - if (need_resched()) { + if (need_resched() || need_resched_delayed()) { local_fiq_enable(); local_irq_enable(); return; diff -urN ./linux-2.6.18.1/arch/arm/mach-sa1100/badge4.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-sa1100/badge4.c --- ./linux-2.6.18.1/arch/arm/mach-sa1100/badge4.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-sa1100/badge4.c 2007-05-19 23:58:35.000000000 +0900 @@ -240,15 +240,22 @@ /* detect on->off and off->on transitions */ if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { /* was off, now on */ - printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__); GPSR = BADGE4_GPIO_PCMEN5V; } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { /* was on, now off */ - printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__); GPCR = BADGE4_GPIO_PCMEN5V; } local_irq_restore(flags); + + /* detect on->off and off->on transitions */ + if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { + /* was off, now on */ + printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__); + } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { + /* was on, now off */ + printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__); + } } EXPORT_SYMBOL(badge4_set_5V); diff -urN ./linux-2.6.18.1/arch/arm/mach-shark/leds.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-shark/leds.c --- ./linux-2.6.18.1/arch/arm/mach-shark/leds.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-shark/leds.c 2007-05-19 23:58:35.000000000 +0900 @@ -32,7 +32,7 @@ static short hw_led_state; static short saved_state; -static DEFINE_SPINLOCK(leds_lock); +static DEFINE_RAW_SPINLOCK(leds_lock); short sequoia_read(int addr) { outw(addr,0x24); diff -urN ./linux-2.6.18.1/arch/arm/mach-versatile/Kconfig 
linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-versatile/Kconfig --- ./linux-2.6.18.1/arch/arm/mach-versatile/Kconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-versatile/Kconfig 2007-05-19 23:58:35.000000000 +0900 @@ -1,6 +1,10 @@ menu "Versatile platform type" depends on ARCH_VERSATILE +config IS_TICK_BASED + bool + default n + config ARCH_VERSATILE_PB bool "Support Versatile/PB platform" default y diff -urN ./linux-2.6.18.1/arch/arm/mach-versatile/core.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-versatile/core.c --- ./linux-2.6.18.1/arch/arm/mach-versatile/core.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mach-versatile/core.c 2007-05-19 23:58:35.000000000 +0900 @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include #include @@ -808,59 +810,50 @@ #define TICKS2USECS(x) ((x) / TICKS_PER_uSEC) #endif -/* - * Returns number of ms since last clock interrupt. Note that interrupts - * will have been disabled by do_gettimeoffset() - */ -static unsigned long versatile_gettimeoffset(void) +#ifdef CONFIG_HIGH_RES_TIMERS +static void timer_set_mode(int mode, struct clock_event *clk) { - unsigned long ticks1, ticks2, status; - - /* - * Get the current number of ticks. Note that there is a race - * condition between us reading the timer and checking for - * an interrupt. We get around this by ensuring that the - * counter has not reloaded between our two reads. - */ - ticks2 = readl(TIMER0_VA_BASE + TIMER_VALUE) & 0xffff; - do { - ticks1 = ticks2; - status = __raw_readl(VA_IC_BASE + VIC_RAW_STATUS); - ticks2 = readl(TIMER0_VA_BASE + TIMER_VALUE) & 0xffff; - } while (ticks2 > ticks1); - - /* - * Number of ticks since last interrupt. - */ - ticks1 = TIMER_RELOAD - ticks2; - - /* - * Interrupt pending? If so, we've reloaded once already. 
- * - * FIXME: Need to check this is effectively timer 0 that expires - */ - if (status & IRQMASK_TIMERINT0_1) - ticks1 += TIMER_RELOAD; + if (mode == CLOCK_EVT_PERIODIC) { + writel(TIMER_CTRL_PERIODIC | TIMER_CTRL_32BIT | TIMER_CTRL_IE | + TIMER_CTRL_ENABLE, TIMER0_VA_BASE + TIMER_CTRL); + } else { + writel(TIMER_CTRL_ONESHOT | TIMER_CTRL_32BIT | TIMER_CTRL_IE | + TIMER_CTRL_ENABLE, TIMER0_VA_BASE + TIMER_CTRL); + } +} - /* - * Convert the ticks to usecs - */ - return TICKS2USECS(ticks1); +static void timer_set_next_event(unsigned long evt, struct clock_event *unused) +{ + BUG_ON(!evt); + writel(evt, TIMER0_VA_BASE + TIMER_LOAD); } +static struct clock_event timer0_clock = { + .name = "timer0", + .shift = 32, + .capabilities = CLOCK_CAP_TICK | CLOCK_CAP_UPDATE | + CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE, + .set_mode = timer_set_mode, + .set_next_event = timer_set_next_event, +}; +#endif + /* * IRQ handler for the timer */ static irqreturn_t versatile_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { - write_seqlock(&xtime_lock); - // ...clear the interrupt writel(1, TIMER0_VA_BASE + TIMER_INTCLR); +#ifdef CONFIG_HIGH_RES_TIMERS + if (timer0_clock.event_handler) + timer0_clock.event_handler(regs); +#else + write_seqlock(&xtime_lock); timer_tick(regs); - write_sequnlock(&xtime_lock); +#endif return IRQ_HANDLED; } @@ -893,11 +886,20 @@ /* * Initialise to a known state (all timers off) */ - writel(0, TIMER0_VA_BASE + TIMER_CTRL); + writel(0, TIMER0_VA_BASE + TIMER_CTRL); writel(0, TIMER1_VA_BASE + TIMER_CTRL); writel(0, TIMER2_VA_BASE + TIMER_CTRL); writel(0, TIMER3_VA_BASE + TIMER_CTRL); +#ifdef CONFIG_HIGH_RES_TIMERS + timer0_clock.mult = div_sc(1000000, NSEC_PER_SEC, timer0_clock.shift); + timer0_clock.max_delta_ns = + clockevent_delta2ns(0xffffffff, &timer0_clock); + timer0_clock.min_delta_ns = + clockevent_delta2ns(0xf, &timer0_clock); + register_global_clockevent(&timer0_clock); +#endif + writel(TIMER_RELOAD, TIMER0_VA_BASE + TIMER_LOAD); writel(TIMER_RELOAD, TIMER0_VA_BASE + TIMER_VALUE); writel(TIMER_DIVISOR | TIMER_CTRL_ENABLE | TIMER_CTRL_PERIODIC | @@ -911,5 +913,36 @@ struct sys_timer versatile_timer = { .init = versatile_timer_init, - .offset = versatile_gettimeoffset, }; + +cycle_t versatile_get_cycles(void) +{ + return ~readl(TIMER3_VA_BASE + TIMER_VALUE); +} + +static struct clocksource clocksource_versatile = { + .name = "timer3", + .rating = 200, + .read = versatile_get_cycles, + .mask = 0xFFFFFFFF, + .shift = 20, + .is_continuous = 1, +}; + +static int __init versatile_clocksource_init(void) +{ + writel(0, TIMER3_VA_BASE + TIMER_CTRL); + writel(0xffffffff, TIMER3_VA_BASE + TIMER_LOAD); + writel(0xffffffff, TIMER3_VA_BASE + TIMER_VALUE); + writel(TIMER_CTRL_32BIT | TIMER_CTRL_ENABLE | TIMER_CTRL_PERIODIC, + TIMER3_VA_BASE + TIMER_CTRL); + + clocksource_versatile.mult = + clocksource_khz2mult(1000, clocksource_versatile.shift); + clocksource_register(&clocksource_versatile); + + return 0; +} + +device_initcall(versatile_clocksource_init); + diff -urN ./linux-2.6.18.1/arch/arm/mm/consistent.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mm/consistent.c --- ./linux-2.6.18.1/arch/arm/mm/consistent.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mm/consistent.c 2007-05-19 23:58:35.000000000 +0900 @@ -40,7 +40,7 @@ * These are the page tables (2MB each) covering uncached, DMA consistent allocations */ static pte_t *consistent_pte[NUM_CONSISTENT_PTES]; -static DEFINE_SPINLOCK(consistent_lock); +static 
DEFINE_RAW_SPINLOCK(consistent_lock); /* * VM region handling support. diff -urN ./linux-2.6.18.1/arch/arm/mm/copypage-v4mc.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mm/copypage-v4mc.c --- ./linux-2.6.18.1/arch/arm/mm/copypage-v4mc.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mm/copypage-v4mc.c 2007-05-19 23:58:35.000000000 +0900 @@ -29,7 +29,7 @@ #define TOP_PTE(x) pte_offset_kernel(top_pmd, x) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * ARMv4 mini-dcache optimised copy_user_page @@ -43,7 +43,7 @@ * instruction. If your processor does not supply this, you have to write your * own copy_user_page that does the right thing. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { asm volatile( @@ -82,7 +82,7 @@ /* * ARMv4 optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) v4_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( diff -urN ./linux-2.6.18.1/arch/arm/mm/copypage-v6.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mm/copypage-v6.c --- ./linux-2.6.18.1/arch/arm/mm/copypage-v6.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mm/copypage-v6.c 2007-05-19 23:58:35.000000000 +0900 @@ -26,7 +26,7 @@ #define TOP_PTE(x) pte_offset_kernel(top_pmd, x) -static DEFINE_SPINLOCK(v6_lock); +static DEFINE_RAW_SPINLOCK(v6_lock); /* * Copy the user page. No aliasing to deal with so we can just diff -urN ./linux-2.6.18.1/arch/arm/mm/copypage-xscale.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mm/copypage-xscale.c --- ./linux-2.6.18.1/arch/arm/mm/copypage-xscale.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mm/copypage-xscale.c 2007-05-19 23:58:35.000000000 +0900 @@ -31,7 +31,7 @@ #define TOP_PTE(x) pte_offset_kernel(top_pmd, x) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * XScale mini-dcache optimised copy_user_page @@ -41,7 +41,7 @@ * Dcache aliasing issue. The writes will be forwarded to the write buffer, * and merged as appropriate. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { /* @@ -104,7 +104,7 @@ /* * XScale optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) xscale_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( diff -urN ./linux-2.6.18.1/arch/arm/mm/fault.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mm/fault.c --- ./linux-2.6.18.1/arch/arm/mm/fault.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mm/fault.c 2007-05-19 23:58:35.000000000 +0900 @@ -215,7 +215,7 @@ return fault; } -static int +static notrace int do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk; @@ -315,7 +315,7 @@ * interrupt or a critical region, and should only copy the information * from the master page table, nothing more. */ -static int +static notrace int do_translation_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { @@ -361,7 +361,7 @@ * Some section permission faults need to be handled gracefully. * They can happen due to a __{get,put}_user during an oops. 
*/ -static int +static notrace int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk = current; @@ -372,7 +372,7 @@ /* * This abort handler always returns "fault". */ -static int +static notrace int do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { return 1; @@ -427,7 +427,7 @@ { do_bad, SIGBUS, 0, "unknown 31" } }; -void __init +void __init notrace hook_fault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *), int sig, const char *name) { @@ -441,7 +441,7 @@ /* * Dispatch a data abort to the relevant handler. */ -asmlinkage void +asmlinkage notrace void do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { const struct fsr_info *inf = fsr_info + (fsr & 15) + ((fsr & (1 << 10)) >> 6); @@ -460,7 +460,7 @@ notify_die("", regs, &info, fsr, 0); } -asmlinkage void +asmlinkage notrace void do_PrefetchAbort(unsigned long addr, struct pt_regs *regs) { do_translation_fault(addr, 0, regs); diff -urN ./linux-2.6.18.1/arch/arm/mm/init.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mm/init.c --- ./linux-2.6.18.1/arch/arm/mm/init.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/mm/init.c 2007-05-19 23:58:35.000000000 +0900 @@ -25,7 +25,7 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern void _stext, _text, _etext, __data_start, _end, __init_begin, __init_end; diff -urN ./linux-2.6.18.1/arch/arm/plat-omap/clock.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/plat-omap/clock.c --- ./linux-2.6.18.1/arch/arm/plat-omap/clock.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/plat-omap/clock.c 2007-05-19 23:58:35.000000000 +0900 @@ -29,7 +29,7 @@ static LIST_HEAD(clocks); static DEFINE_MUTEX(clocks_mutex); -static DEFINE_SPINLOCK(clockfw_lock); +static DEFINE_RAW_SPINLOCK(clockfw_lock); static struct clk_functions *arch_clock; diff -urN ./linux-2.6.18.1/arch/arm/plat-omap/dma.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/plat-omap/dma.c --- ./linux-2.6.18.1/arch/arm/plat-omap/dma.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/plat-omap/dma.c 2007-05-19 23:58:35.000000000 +0900 @@ -949,7 +949,7 @@ /*----------------------------------------------------------------------------*/ static struct lcd_dma_info { - spinlock_t lock; + raw_spinlock_t lock; int reserved; void (* callback)(u16 status, void *data); void *cb_data; diff -urN ./linux-2.6.18.1/arch/arm/plat-omap/gpio.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/plat-omap/gpio.c --- ./linux-2.6.18.1/arch/arm/plat-omap/gpio.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/plat-omap/gpio.c 2007-05-19 23:58:35.000000000 +0900 @@ -120,7 +120,7 @@ u32 reserved_map; u32 suspend_wakeup; u32 saved_wakeup; - spinlock_t lock; + raw_spinlock_t lock; }; #define METHOD_MPUIO 0 diff -urN ./linux-2.6.18.1/arch/arm/plat-omap/mux.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/plat-omap/mux.c --- ./linux-2.6.18.1/arch/arm/plat-omap/mux.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/plat-omap/mux.c 2007-05-19 23:58:35.000000000 +0900 @@ -56,7 +56,7 @@ */ int __init_or_module omap_cfg_reg(const unsigned long index) { - static DEFINE_SPINLOCK(mux_spin_lock); + static DEFINE_RAW_SPINLOCK(mux_spin_lock); unsigned long flags; struct pin_config *cfg; diff -urN 
./linux-2.6.18.1/arch/arm/plat-omap/pm.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/plat-omap/pm.c --- ./linux-2.6.18.1/arch/arm/plat-omap/pm.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/arm/plat-omap/pm.c 2007-05-19 23:58:35.000000000 +0900 @@ -84,7 +84,7 @@ local_irq_disable(); local_fiq_disable(); - if (need_resched()) { + if (need_resched() || need_resched_delayed()) { local_fiq_enable(); local_irq_enable(); return; diff -urN ./linux-2.6.18.1/arch/h8300/Kconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/h8300/Kconfig --- ./linux-2.6.18.1/arch/h8300/Kconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/h8300/Kconfig 2007-05-19 23:58:35.000000000 +0900 @@ -41,6 +41,10 @@ bool default y +config GENERIC_TIME + bool + default y + config TIME_LOW_RES bool default y diff -urN ./linux-2.6.18.1/arch/h8300/kernel/time.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/h8300/kernel/time.c --- ./linux-2.6.18.1/arch/h8300/kernel/time.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/h8300/kernel/time.c 2007-05-19 23:58:35.000000000 +0900 @@ -68,58 +68,6 @@ platform_timer_setup(timer_interrupt); } -/* - * This version of gettimeofday has near microsecond resolution. - */ -void do_gettimeofday(struct timeval *tv) -{ - unsigned long flags; - unsigned long usec, sec; - - read_lock_irqsave(&xtime_lock, flags); - usec = 0; - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - read_unlock_irqrestore(&xtime_lock, flags); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_lock_irq(&xtime_lock); - /* This is revolting. We need to set the xtime.tv_usec - * correctly. However, the value in this location is - * is value at the last tick. - * Discover what correction gettimeofday - * would have done, and then undo it! - */ - while (tv->tv_nsec < 0) { - tv->tv_nsec += NSEC_PER_SEC; - tv->tv_sec--; - } - - xtime.tv_sec = tv->tv_sec; - xtime.tv_nsec = tv->tv_nsec; - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - unsigned long long sched_clock(void) { return (unsigned long long)jiffies * (1000000000 / HZ); diff -urN ./linux-2.6.18.1/arch/i386/Kconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/Kconfig --- ./linux-2.6.18.1/arch/i386/Kconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/Kconfig 2007-05-20 14:14:28.000000000 +0900 @@ -65,6 +65,8 @@ menu "Processor type and features" +source "kernel/time/Kconfig" + config SMP bool "Symmetric multi-processing support" ---help--- @@ -261,6 +263,19 @@ source "kernel/Kconfig.preempt" +config RWSEM_GENERIC_SPINLOCK + bool + depends on M386 || PREEMPT_RT + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + default y if !RWSEM_GENERIC_SPINLOCK + config X86_UP_APIC bool "Local APIC support on uniprocessors" depends on !SMP && !(X86_VISWS || X86_VOYAGER) @@ -708,6 +723,7 @@ config REGPARM bool "Use register arguments" + depends on !MCOUNT default y help Compile the kernel with -mregparm=3. This instructs gcc to use @@ -791,6 +807,10 @@ enable suspend on SMP systems. CPUs can be controlled through /sys/devices/system/cpu. 
+config GENERIC_TIME_VSYSCALL + depends on EXPERIMENTAL + bool "VSYSCALL gettimeofday() interface" + config COMPAT_VDSO bool "Compat VDSO support" default y @@ -803,8 +823,11 @@ If unsure, say Y. +source "drivers/cabi/Kconfig" + endmenu + config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y depends on HIGHMEM @@ -966,6 +989,7 @@ source "arch/i386/kernel/cpu/cpufreq/Kconfig" + endmenu menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)" diff -urN ./linux-2.6.18.1/arch/i386/Kconfig.cpu linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/Kconfig.cpu --- ./linux-2.6.18.1/arch/i386/Kconfig.cpu 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/Kconfig.cpu 2007-05-19 23:58:35.000000000 +0900 @@ -235,11 +235,6 @@ depends on M386 default y -config RWSEM_XCHGADD_ALGORITHM - bool - depends on !M386 - default y - config GENERIC_CALIBRATE_DELAY bool default y diff -urN ./linux-2.6.18.1/arch/i386/Kconfig.debug linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/Kconfig.debug --- ./linux-2.6.18.1/arch/i386/Kconfig.debug 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/Kconfig.debug 2007-05-19 23:58:35.000000000 +0900 @@ -22,6 +22,7 @@ config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL + default y help This option will cause messages to be printed if free stack space drops below a certain limit. @@ -29,6 +30,7 @@ config DEBUG_STACK_USAGE bool "Stack utilization instrumentation" depends on DEBUG_KERNEL + default y help Enables the display of the minimum amount of free stack which each task has ever had available in the sysrq-T and sysrq-P debug output. @@ -49,6 +51,7 @@ config DEBUG_RODATA bool "Write protect kernel read-only data structures" depends on DEBUG_KERNEL + default y help Mark the kernel read-only data as write-protected in the pagetables, in order to catch accidental (and incorrect) writes to such const @@ -59,6 +62,7 @@ config 4KSTACKS bool "Use 4Kb for kernel stacks instead of 8Kb" depends on DEBUG_KERNEL + default y help If you say Y here the kernel will use a 4Kb stacksize for the kernel stack attached to each process/thread. This facilitates diff -urN ./linux-2.6.18.1/arch/i386/Kconfig.orig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/Kconfig.orig --- ./linux-2.6.18.1/arch/i386/Kconfig.orig 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/Kconfig.orig 2007-05-19 23:58:35.000000000 +0900 @@ -0,0 +1,1209 @@ +# +# For a description of the syntax of this configuration file, +# see Documentation/kbuild/kconfig-language.txt. +# + +mainmenu "Linux Kernel Configuration" + +config X86_32 + bool + default y + help + This is Linux's home port. Linux was originally native to the Intel + 386, and runs on all the later x86 processors including the Intel + 486, 586, Pentiums, and various instruction-set-compatible chips by + AMD, Cyrix, and others. 
+ +config GENERIC_TIME + bool + default y + +config LOCKDEP_SUPPORT + bool + default y + +config STACKTRACE_SUPPORT + bool + default y + +config SEMAPHORE_SLEEPERS + bool + default y + +config X86 + bool + default y + +config MMU + bool + default y + +config SBUS + bool + +config GENERIC_ISA_DMA + bool + default y + +config GENERIC_IOMAP + bool + default y + +config GENERIC_HWEIGHT + bool + default y + +config ARCH_MAY_HAVE_PC_FDC + bool + default y + +config DMI + bool + default y + +source "init/Kconfig" + +menu "Processor type and features" + +source "kernel/time/Kconfig" + +config SMP + bool "Symmetric multi-processing support" + ---help--- + This enables support for systems with more than one CPU. If you have + a system with only one CPU, like most personal computers, say N. If + you have a system with more than one CPU, say Y. + + If you say N here, the kernel will run on single and multiprocessor + machines, but will use only one CPU of a multiprocessor machine. If + you say Y here, the kernel will run on many, but not all, + singleprocessor machines. On a singleprocessor machine, the kernel + will run faster if you say N here. + + Note that if you say Y here and choose architecture "586" or + "Pentium" under "Processor family", the kernel will not work on 486 + architectures. Similarly, multiprocessor kernels for the "PPro" + architecture may not work on all Pentium based boards. + + People using multiprocessor machines who say Y here should also say + Y to "Enhanced Real Time Clock Support", below. The "Advanced Power + Management" code will be disabled if you say Y here. + + See also the , + , + and the SMP-HOWTO available at + . + + If you don't know what to do here, say N. + +choice + prompt "Subarchitecture Type" + default X86_PC + +config X86_PC + bool "PC-compatible" + help + Choose this option if your computer is a standard PC or compatible. + +config X86_ELAN + bool "AMD Elan" + help + Select this for an AMD Elan processor. + + Do not use this option for K6/Athlon/Opteron processors! + + If unsure, choose "PC-compatible" instead. + +config X86_VOYAGER + bool "Voyager (NCR)" + help + Voyager is an MCA-based 32-way capable SMP architecture proprietary + to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based. + + *** WARNING *** + + If you do not specifically know you have a Voyager based machine, + say N here, otherwise the kernel you build will not be bootable. + +config X86_NUMAQ + bool "NUMAQ (IBM/Sequent)" + select SMP + select NUMA + help + This option is used for getting Linux to run on a (IBM/Sequent) NUMA + multiquad box. This changes the way that processors are bootstrapped, + and uses Clustered Logical APIC addressing mode instead of Flat Logical. + You will need a new lynxer.elf file to flash your firmware with - send + email to . + +config X86_SUMMIT + bool "Summit/EXA (IBM x440)" + depends on SMP + help + This option is needed for IBM systems that use the Summit/EXA chipset. + In particular, it is needed for the x440. + + If you don't have one of these computers, you should say N here. + If you want to build a NUMA kernel, you must select ACPI. + +config X86_BIGSMP + bool "Support for other sub-arch SMP systems with more than 8 CPUs" + depends on SMP + help + This option is needed for the systems that have more than 8 CPUs + and if the system is not of any sub-arch type above. + + If you don't have such a system, you should say N here. 
+ +config X86_VISWS + bool "SGI 320/540 (Visual Workstation)" + help + The SGI Visual Workstation series is an IA32-based workstation + based on SGI systems chips with some legacy PC hardware attached. + + Say Y here to create a kernel to run on the SGI 320 or 540. + + A kernel compiled for the Visual Workstation will not run on PCs + and vice versa. See for details. + +config X86_GENERICARCH + bool "Generic architecture (Summit, bigsmp, ES7000, default)" + depends on SMP + help + This option compiles in the Summit, bigsmp, ES7000, default subarchitectures. + It is intended for a generic binary kernel. + If you want a NUMA kernel, select ACPI. We need SRAT for NUMA. + +config X86_ES7000 + bool "Support for Unisys ES7000 IA32 series" + depends on SMP + help + Support for Unisys ES7000 systems. Say 'Y' here if this kernel is + supposed to run on an IA32-based Unisys ES7000 system. + Only choose this option if you have such a system, otherwise you + should say N here. + +endchoice + +config ACPI_SRAT + bool + default y + depends on ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH) + select ACPI_NUMA + +config HAVE_ARCH_PARSE_SRAT + bool + default y + depends on ACPI_SRAT + +config X86_SUMMIT_NUMA + bool + default y + depends on NUMA && (X86_SUMMIT || X86_GENERICARCH) + +config X86_CYCLONE_TIMER + bool + default y + depends on X86_SUMMIT || X86_GENERICARCH + +config ES7000_CLUSTERED_APIC + bool + default y + depends on SMP && X86_ES7000 && MPENTIUMIII + +source "arch/i386/Kconfig.cpu" + +config HPET_TIMER + bool "HPET Timer Support" + help + This enables the use of the HPET for the kernel's internal timer. + HPET is the next generation timer replacing legacy 8254s. + You can safely choose Y here. However, HPET will only be + activated if the platform and the BIOS support this feature. + Otherwise the 8254 will be used for timing services. + + Choose N to continue using the legacy 8254 timer. + +config HPET_EMULATE_RTC + bool + depends on HPET_TIMER && RTC=y + default y + +config NR_CPUS + int "Maximum number of CPUs (2-255)" + range 2 255 + depends on SMP + default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 + default "8" + help + This allows you to specify the maximum number of CPUs which this + kernel will support. The maximum supported value is 255 and the + minimum value which makes sense is 2. + + This is purely to save memory - each supported CPU adds + approximately eight kilobytes to the kernel image. + +config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on X86_HT + help + SMT scheduler support improves the CPU scheduler's decision making + when dealing with Intel Pentium 4 chips with HyperThreading at a + cost of slightly increased overhead in some places. If unsure say + N here. + +config SCHED_MC + bool "Multi-core scheduler support" + depends on X86_HT + default y + help + Multi-core scheduler support improves the CPU scheduler's decision + making when dealing with multi-core CPU chips at a cost of slightly + increased overhead in some places. If unsure say N here. + +source "kernel/Kconfig.preempt" + +config RWSEM_GENERIC_SPINLOCK + bool + depends on M386 || PREEMPT_RT + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + default y if !RWSEM_GENERIC_SPINLOCK + +config X86_UP_APIC + bool "Local APIC support on uniprocessors" + depends on !SMP && !(X86_VISWS || X86_VOYAGER) + help + A local APIC (Advanced Programmable Interrupt Controller) is an + integrated interrupt controller in the CPU. 
If you have a single-CPU + system which has a processor with a local APIC, you can say Y here to + enable and use it. If you say Y here even though your machine doesn't + have a local APIC, then the kernel will still run with no slowdown at + all. The local APIC supports CPU-generated self-interrupts (timer, + performance counters), and the NMI watchdog which detects hard + lockups. + +config X86_UP_IOAPIC + bool "IO-APIC support on uniprocessors" + depends on X86_UP_APIC + help + An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an + SMP-capable replacement for PC-style interrupt controllers. Most + SMP systems and many recent uniprocessor systems have one. + + If you have a single-CPU system with an IO-APIC, you can say Y here + to use it. If you say Y here even though your machine doesn't have + an IO-APIC, then the kernel will still run with no slowdown at all. + +config X86_LOCAL_APIC + bool + depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) + default y + +config X86_IO_APIC + bool + depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) + default y + +config X86_VISWS_APIC + bool + depends on X86_VISWS + default y + +config X86_MCE + bool "Machine Check Exception" + depends on !X86_VOYAGER + ---help--- + Machine Check Exception support allows the processor to notify the + kernel if it detects a problem (e.g. overheating, component failure). + The action the kernel takes depends on the severity of the problem, + ranging from a warning message on the console, to halting the machine. + Your processor must be a Pentium or newer to support this - check the + flags in /proc/cpuinfo for mce. Note that some older Pentium systems + have a design flaw which leads to false MCE events - hence MCE is + disabled on all P5 processors, unless explicitly enabled with "mce" + as a boot argument. Similarly, if MCE is built in and creates a + problem on some new non-standard machine, you can boot with "nomce" + to disable it. MCE support simply ignores non-MCE processors like + the 386 and 486, so nearly everyone can say Y here. + +config X86_MCE_NONFATAL + tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" + depends on X86_MCE + help + Enabling this feature starts a timer that triggers every 5 seconds which + will look at the machine check registers to see if anything happened. + Non-fatal problems automatically get corrected (but still logged). + Disable this if you don't want to see these messages. + Seeing the messages this option prints out may be indicative of dying hardware, + or out-of-spec (ie, overclocked) hardware. + This option only does something on certain CPUs. + (AMD Athlon/Duron and Intel Pentium 4) + +config X86_MCE_P4THERMAL + bool "check for P4 thermal throttling interrupt." + depends on X86_MCE && (X86_UP_APIC || SMP) && !X86_VISWS + help + Enabling this feature will cause a message to be printed when the P4 + enters thermal throttling. + +config VM86 + default y + bool "Enable VM86 support" if EMBEDDED + help + This option is required by programs like DOSEMU to run 16-bit legacy + code on X86 processors. It also may be needed by software like + XFree86 to initialize some video cards via BIOS. Disabling this + option saves about 6k. + +config TOSHIBA + tristate "Toshiba Laptop support" + ---help--- + This adds a driver to safely access the System Management Mode of + the CPU on Toshiba portables with a genuine Toshiba BIOS. It does + not work on models with a Phoenix BIOS. 
The System Management Mode + is used to set the BIOS and power saving options on Toshiba portables. + + For information on utilities to make use of this driver see the + Toshiba Linux utilities web site at: + . + + Say Y if you intend to run this kernel on a Toshiba portable. + Say N otherwise. + +config I8K + tristate "Dell laptop support" + ---help--- + This adds a driver to safely access the System Management Mode + of the CPU on the Dell Inspiron 8000. The System Management Mode + is used to read cpu temperature and cooling fan status and to + control the fans on the I8K portables. + + This driver has been tested only on the Inspiron 8000 but it may + also work with other Dell laptops. You can force loading on other + models by passing the parameter `force=1' to the module. Use at + your own risk. + + For information on utilities to make use of this driver see the + I8K Linux utilities web site at: + + + Say Y if you intend to run this kernel on a Dell Inspiron 8000. + Say N otherwise. + +config X86_REBOOTFIXUPS + bool "Enable X86 board specific fixups for reboot" + depends on X86 + default n + ---help--- + This enables chipset and/or board specific fixups to be done + in order to get reboot to work correctly. This is only needed on + some combinations of hardware and BIOS. The symptom, for which + this config is intended, is when reboot ends with a stalled/hung + system. + + Currently, the only fixup is for the Geode GX1/CS5530A/TROM2.1. + combination. + + Say Y if you want to enable the fixup. Currently, it's safe to + enable this option even if you don't need it. + Say N otherwise. + +config MICROCODE + tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" + ---help--- + If you say Y here and also to "/dev file system support" in the + 'File systems' section, you will be able to update the microcode on + Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II, + Pentium III, Pentium 4, Xeon etc. You will obviously need the + actual microcode binary data itself which is not shipped with the + Linux kernel. + + For latest news and information on obtaining all the required + ingredients for this driver, check: + . + + To compile this driver as a module, choose M here: the + module will be called microcode. + +config X86_MSR + tristate "/dev/cpu/*/msr - Model-specific register support" + help + This device gives privileged processes access to the x86 + Model-Specific Registers (MSRs). It is a character device with + major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr. + MSR accesses are directed to a specific CPU on multi-processor + systems. + +config X86_CPUID + tristate "/dev/cpu/*/cpuid - CPU information support" + help + This device gives processes access to the x86 CPUID instruction to + be executed on a specific processor. It is a character device + with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to + /dev/cpu/31/cpuid. + +source "drivers/firmware/Kconfig" + +choice + prompt "High Memory Support" + default NOHIGHMEM + +config NOHIGHMEM + bool "off" + depends on !X86_NUMAQ + ---help--- + Linux can use up to 64 Gigabytes of physical memory on x86 systems. + However, the address space of 32-bit x86 processors is only 4 + Gigabytes large. That means that, if you have a large amount of + physical memory, not all of it can be "permanently mapped" by the + kernel. The physical memory that's not permanently mapped is called + "high memory". 
+ + If you are compiling a kernel which will never run on a machine with + more than 1 Gigabyte total physical RAM, answer "off" here (default + choice and suitable for most users). This will result in a "3GB/1GB" + split: 3GB are mapped so that each process sees a 3GB virtual memory + space and the remaining part of the 4GB virtual memory space is used + by the kernel to permanently map as much physical memory as + possible. + + If the machine has between 1 and 4 Gigabytes physical RAM, then + answer "4GB" here. + + If more than 4 Gigabytes is used then answer "64GB" here. This + selection turns Intel PAE (Physical Address Extension) mode on. + PAE implements 3-level paging on IA32 processors. PAE is fully + supported by Linux, PAE mode is implemented on all recent Intel + processors (Pentium Pro and better). NOTE: If you say "64GB" here, + then the kernel will not boot on CPUs that don't support PAE! + + The actual amount of total physical memory will either be + auto detected or can be forced by using a kernel command line option + such as "mem=256M". (Try "man bootparam" or see the documentation of + your boot loader (lilo or loadlin) about how to pass options to the + kernel at boot time.) + + If unsure, say "off". + +config HIGHMEM4G + bool "4GB" + depends on !X86_NUMAQ + help + Select this if you have a 32-bit processor and between 1 and 4 + gigabytes of physical RAM. + +config HIGHMEM64G + bool "64GB" + depends on X86_CMPXCHG64 + help + Select this if you have a 32-bit processor and more than 4 + gigabytes of physical RAM. + +endchoice + +choice + depends on EXPERIMENTAL && !X86_PAE + prompt "Memory split" if EMBEDDED + default VMSPLIT_3G + help + Select the desired split between kernel and user memory. + + If the address range available to the kernel is less than the + physical memory installed, the remaining memory will be available + as "high memory". Accessing high memory is a little more costly + than low memory, as it needs to be mapped into the kernel first. + Note that increasing the kernel address space limits the range + available to user programs, making the address space there + tighter. Selecting anything other than the default 3G/1G split + will also likely make your kernel incompatible with binary-only + kernel modules. + + If you are not absolutely sure what you are doing, leave this + option alone! 
+ + config VMSPLIT_3G + bool "3G/1G user/kernel split" + config VMSPLIT_3G_OPT + bool "3G/1G user/kernel split (for full 1G low memory)" + config VMSPLIT_2G + bool "2G/2G user/kernel split" + config VMSPLIT_1G + bool "1G/3G user/kernel split" +endchoice + +config PAGE_OFFSET + hex + default 0xB0000000 if VMSPLIT_3G_OPT + default 0x78000000 if VMSPLIT_2G + default 0x40000000 if VMSPLIT_1G + default 0xC0000000 + +config HIGHMEM + bool + depends on HIGHMEM64G || HIGHMEM4G + default y + +config X86_PAE + bool + depends on HIGHMEM64G + default y + select RESOURCES_64BIT + +# Common NUMA Features +config NUMA + bool "Numa Memory Allocation and Scheduler Support" + depends on SMP && HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT || X86_GENERICARCH) && ACPI) + default n if X86_PC + default y if (X86_NUMAQ || X86_SUMMIT) + +comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" + depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI) + +config NODES_SHIFT + int + default "4" if X86_NUMAQ + default "3" + depends on NEED_MULTIPLE_NODES + +config HAVE_ARCH_BOOTMEM_NODE + bool + depends on NUMA + default y + +config ARCH_HAVE_MEMORY_PRESENT + bool + depends on DISCONTIGMEM + default y + +config NEED_NODE_MEMMAP_SIZE + bool + depends on DISCONTIGMEM || SPARSEMEM + default y + +config HAVE_ARCH_ALLOC_REMAP + bool + depends on NUMA + default y + +config ARCH_FLATMEM_ENABLE + def_bool y + depends on (ARCH_SELECT_MEMORY_MODEL && X86_PC) + +config ARCH_DISCONTIGMEM_ENABLE + def_bool y + depends on NUMA + +config ARCH_DISCONTIGMEM_DEFAULT + def_bool y + depends on NUMA + +config ARCH_SPARSEMEM_ENABLE + def_bool y + depends on (NUMA || (X86_PC && EXPERIMENTAL)) + select SPARSEMEM_STATIC + +config ARCH_SELECT_MEMORY_MODEL + def_bool y + depends on ARCH_SPARSEMEM_ENABLE + +source "mm/Kconfig" + +config HAVE_ARCH_EARLY_PFN_TO_NID + bool + default y + depends on NUMA + +config HIGHPTE + bool "Allocate 3rd-level pagetables from highmem" + depends on HIGHMEM4G || HIGHMEM64G + help + The VM uses one page table entry for each page of physical memory. + For systems with a lot of RAM, this can be wasteful of precious + low memory. Setting this option will put user-space page table + entries in high memory. + +config MATH_EMULATION + bool "Math emulation" + ---help--- + Linux can emulate a math coprocessor (used for floating point + operations) if you don't have one. 486DX and Pentium processors have + a math coprocessor built in, 486SX and 386 do not, unless you added + a 487DX or 387, respectively. (The messages during boot time can + give you some hints here ["man dmesg"].) Everyone needs either a + coprocessor or this emulation. + + If you don't have a math coprocessor, you need to say Y here; if you + say Y here even though you have a coprocessor, the coprocessor will + be used nevertheless. (This behavior can be changed with the kernel + command line option "no387", which comes handy if your coprocessor + is broken. Try "man bootparam" or see the documentation of your boot + loader (lilo or loadlin) about how to pass options to the kernel at + boot time.) This means that it is a good idea to say Y here if you + intend to use this kernel on different machines. + + More information about the internals of the Linux math coprocessor + emulation can be found in . + + If you are not sure, say Y; apart from resulting in a 66 KB bigger + kernel, it won't hurt. 
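For reference, the PAGE_OFFSET value selected above fixes the user/kernel split: everything below it is user address space, everything above is the kernel's direct (lowmem) mapping. A sketch of the resulting arithmetic, not part of the patch, using the standard i386 linear-map conversion:

	/* with the default 3G/1G split, PAGE_OFFSET is 0xC0000000 */
	#define EXAMPLE_PAGE_OFFSET	0xC0000000UL
	#define example_pa(v)	((unsigned long)(v) - EXAMPLE_PAGE_OFFSET)
	#define example_va(p)	((void *)((unsigned long)(p) + EXAMPLE_PAGE_OFFSET))

	/*
	 * Physical 1 MB (0x00100000) is mapped at virtual 0xC0100000; under
	 * VMSPLIT_2G (PAGE_OFFSET 0x78000000) the same page would sit at
	 * 0x78100000, leaving roughly 2 GB of virtual space to user mode.
	 */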
+ +config MTRR + bool "MTRR (Memory Type Range Register) support" + ---help--- + On Intel P6 family processors (Pentium Pro, Pentium II and later) + the Memory Type Range Registers (MTRRs) may be used to control + processor access to memory ranges. This is most useful if you have + a video (VGA) card on a PCI or AGP bus. Enabling write-combining + allows bus write transfers to be combined into a larger transfer + before bursting over the PCI/AGP bus. This can increase performance + of image write operations 2.5 times or more. Saying Y here creates a + /proc/mtrr file which may be used to manipulate your processor's + MTRRs. Typically the X server should use this. + + This code has a reasonably generic interface so that similar + control registers on other processors can be easily supported + as well: + + The Cyrix 6x86, 6x86MX and M II processors have Address Range + Registers (ARRs) which provide a similar functionality to MTRRs. For + these, the ARRs are used to emulate the MTRRs. + The AMD K6-2 (stepping 8 and above) and K6-3 processors have two + MTRRs. The Centaur C6 (WinChip) has 8 MCRs, allowing + write-combining. All of these processors are supported by this code + and it makes sense to say Y here if you have one of them. + + Saying Y here also fixes a problem with buggy SMP BIOSes which only + set the MTRRs for the boot CPU and not for the secondary CPUs. This + can lead to all sorts of problems, so it's good to say Y here. + + You can safely say Y even if your machine doesn't have MTRRs, you'll + just add about 9 KB to your kernel. + + See for more information. + +config EFI + bool "Boot from EFI support" + depends on ACPI + default n + ---help--- + This enables the kernel to boot on EFI platforms using + system configuration information passed to it from the firmware. + This also enables the kernel to use any EFI runtime services that are + available (such as the EFI variable services). + + This option is only useful on systems that have EFI firmware + and will result in a kernel image that is ~8k larger. In addition, + you must use the latest ELILO loader available at + in order to take advantage of + kernel initialization using EFI information (neither GRUB nor LILO know + anything about EFI). However, even with this option, the resultant + kernel should continue to boot on existing non-EFI platforms. + +config IRQBALANCE + bool "Enable kernel irq balancing" + depends on SMP && X86_IO_APIC + default y + help + The default yes will allow the kernel to do irq load balancing. + Saying no will keep the kernel from doing irq load balancing. + +# turning this on wastes a bunch of space. +# Summit needs it only when NUMA is on +config BOOT_IOREMAP + bool + depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) + default y + +config REGPARM + bool "Use register arguments" + depends on !MCOUNT + default y + help + Compile the kernel with -mregparm=3. This instructs gcc to use + a more efficient function call ABI which passes the first three + arguments of a function call via registers, which results in denser + and faster code. + + If this option is disabled, then the default ABI of passing + arguments via the stack is used. + + If unsure, say Y. + +config SECCOMP + bool "Enable seccomp to safely compute untrusted bytecode" + depends on PROC_FS + default y + help + This kernel feature is useful for number crunching applications + that may need to compute untrusted bytecode during their + execution.
By using pipes or other transports made available to + the process as file descriptors supporting the read/write + syscalls, it's possible to isolate those applications in + their own address space using seccomp. Once seccomp is + enabled via /proc//seccomp, it cannot be disabled + and the task is only allowed to execute a few safe syscalls + defined by each seccomp mode. + + If unsure, say Y. Only embedded should say N here. + +source kernel/Kconfig.hz + +config KEXEC + bool "kexec system call (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + kexec is a system call that implements the ability to shutdown your + current kernel, and to start another kernel. It is like a reboot + but it is independent of the system firmware. And like a reboot + you can start any kernel with it, not just Linux. + + The name comes from the similarity to the exec system call. + + It is an ongoing process to be certain the hardware in a machine + is properly shutdown, so do not be surprised if this code does not + initially work for you. It may help to enable device hotplugging + support. As of this writing the exact hardware interface is + strongly in flux, so no good recommendation can be made. + +config CRASH_DUMP + bool "kernel crash dumps (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on HIGHMEM + help + Generate crash dump after being started by kexec. + +config PHYSICAL_START + hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) + + default "0x1000000" if CRASH_DUMP + default "0x100000" + help + This gives the physical address where the kernel is loaded. Normally + for regular kernels this value is 0x100000 (1MB). But in the case + of kexec on panic the fail safe kernel needs to run at a different + address than the panic-ed kernel. This option is used to set the load + address for kernels used to capture crash dump on being kexec'ed + after panic. The default value for crash dump kernels is + 0x1000000 (16MB). This can also be set based on the "X" value as + specified in the "crashkernel=YM@XM" command line boot parameter + passed to the panic-ed kernel. Typically this parameter is set as + crashkernel=64M@16M. Please take a look at + Documentation/kdump/kdump.txt for more details about crash dumps. + + Don't change this unless you know what you are doing. + +config HOTPLUG_CPU + bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" + depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER + ---help--- + Say Y here to experiment with turning CPUs off and on, and to + enable suspend on SMP systems. CPUs can be controlled through + /sys/devices/system/cpu. + +config GENERIC_TIME_VSYSCALL + depends on EXPERIMENTAL + bool "VSYSCALL gettimeofday() interface" + +config COMPAT_VDSO + bool "Compat VDSO support" + default y + help + Map the VDSO to the predictable old-style address too. + ---help--- + Say N here if you are running a sufficiently recent glibc + version (2.3.3 or later), to remove the high-mapped + VDSO mapping and to exclusively use the randomized VDSO. + + If unsure, say Y. + +endmenu + +config ARCH_ENABLE_MEMORY_HOTPLUG + def_bool y + depends on HIGHMEM + +menu "Power management options (ACPI, APM)" + depends on !X86_VOYAGER + +source kernel/power/Kconfig + +source "drivers/acpi/Kconfig" + +menu "APM (Advanced Power Management) BIOS Support" +depends on PM && !X86_VISWS + +config APM + tristate "APM (Advanced Power Management) BIOS support" + depends on PM + ---help--- + APM is a BIOS specification for saving power using several different + techniques. 
This is mostly useful for battery powered laptops with + APM compliant BIOSes. If you say Y here, the system time will be + reset after a RESUME operation, the /proc/apm device will provide + battery status information, and user-space programs will receive + notification of APM "events" (e.g. battery status change). + + If you select "Y" here, you can disable actual use of the APM + BIOS by passing the "apm=off" option to the kernel at boot time. + + Note that the APM support is almost completely disabled for + machines with more than one CPU. + + In order to use APM, you will need supporting software. For location + and more information, read and the + Battery Powered Linux mini-HOWTO, available from + . + + This driver does not spin down disk drives (see the hdparm(8) + manpage ("man 8 hdparm") for that), and it doesn't turn off + VESA-compliant "green" monitors. + + This driver does not support the TI 4000M TravelMate and the ACER + 486/DX4/75 because they don't have compliant BIOSes. Many "green" + desktop machines also don't have compliant BIOSes, and this driver + may cause those machines to panic during the boot phase. + + Generally, if you don't have a battery in your machine, there isn't + much point in using this driver and you should say N. If you get + random kernel OOPSes or reboots that don't seem to be related to + anything, try disabling/enabling this option (or disabling/enabling + APM in your BIOS). + + Some other things you should try when experiencing seemingly random, + "weird" problems: + + 1) make sure that you have enough swap space and that it is + enabled. + 2) pass the "no-hlt" option to the kernel + 3) switch on floating point emulation in the kernel and pass + the "no387" option to the kernel + 4) pass the "floppy=nodma" option to the kernel + 5) pass the "mem=4M" option to the kernel (thereby disabling + all but the first 4 MB of RAM) + 6) make sure that the CPU is not over clocked. + 7) read the sig11 FAQ at + 8) disable the cache from your BIOS settings + 9) install a fan for the video card or exchange video RAM + 10) install a better fan for the CPU + 11) exchange RAM chips + 12) exchange the motherboard. + + To compile this driver as a module, choose M here: the + module will be called apm. + +config APM_IGNORE_USER_SUSPEND + bool "Ignore USER SUSPEND" + depends on APM + help + This option will ignore USER SUSPEND requests. On machines with a + compliant APM BIOS, you want to say N. However, on the NEC Versa M + series notebooks, it is necessary to say Y because of a BIOS bug. + +config APM_DO_ENABLE + bool "Enable PM at boot time" + depends on APM + ---help--- + Enable APM features at boot time. From page 36 of the APM BIOS + specification: "When disabled, the APM BIOS does not automatically + power manage devices, enter the Standby State, enter the Suspend + State, or take power saving steps in response to CPU Idle calls." + This driver will make CPU Idle calls when Linux is idle (unless this + feature is turned off -- see "Do CPU IDLE calls", below). This + should always save battery power, but more complicated APM features + will be dependent on your BIOS implementation. You may need to turn + this option off if your computer hangs at boot time when using APM + support, or if it beeps continuously instead of suspending. Turn + this off if you have a NEC UltraLite Versa 33/C or a Toshiba + T400CDT. This is off by default since most machines do fine without + this feature. 
+ +config APM_CPU_IDLE + bool "Make CPU Idle calls when idle" + depends on APM + help + Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop. + On some machines, this can activate improved power savings, such as + a slowed CPU clock rate, when the machine is idle. These idle calls + are made after the idle loop has run for some length of time (e.g., + 333 mS). On some machines, this will cause a hang at boot time or + whenever the CPU becomes idle. (On machines with more than one CPU, + this option does nothing.) + +config APM_DISPLAY_BLANK + bool "Enable console blanking using APM" + depends on APM + help + Enable console blanking using the APM. Some laptops can use this to + turn off the LCD backlight when the screen blanker of the Linux + virtual console blanks the screen. Note that this is only used by + the virtual console screen blanker, and won't turn off the backlight + when using the X Window system. This also doesn't have anything to + do with your VESA-compliant power-saving monitor. Further, this + option doesn't work for all laptops -- it might not turn off your + backlight at all, or it might print a lot of errors to the console, + especially if you are using gpm. + +config APM_RTC_IS_GMT + bool "RTC stores time in GMT" + depends on APM + help + Say Y here if your RTC (Real Time Clock a.k.a. hardware clock) + stores the time in GMT (Greenwich Mean Time). Say N if your RTC + stores localtime. + + It is in fact recommended to store GMT in your RTC, because then you + don't have to worry about daylight savings time changes. The only + reason not to use GMT in your RTC is if you also run a broken OS + that doesn't understand GMT. + +config APM_ALLOW_INTS + bool "Allow interrupts during APM BIOS calls" + depends on APM + help + Normally we disable external interrupts while we are making calls to + the APM BIOS as a measure to lessen the effects of a badly behaving + BIOS implementation. The BIOS should reenable interrupts if it + needs to. Unfortunately, some BIOSes do not -- especially those in + many of the newer IBM Thinkpads. If you experience hangs when you + suspend, try setting this to Y. Otherwise, say N. + +config APM_REAL_MODE_POWER_OFF + bool "Use real mode APM BIOS call to power off" + depends on APM + help + Use real mode APM BIOS calls to switch off the computer. This is + a work-around for a number of buggy BIOSes. Switch this option on if + your computer crashes instead of powering off properly. + +endmenu + +source "arch/i386/kernel/cpu/cpufreq/Kconfig" + +endmenu + +menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)" + +config PCI + bool "PCI support" if !X86_VISWS + depends on !X86_VOYAGER + default y if X86_VISWS + help + Find out whether you have a PCI motherboard. PCI is the name of a + bus system, i.e. the way the CPU talks to the other stuff inside + your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or + VESA. If you have PCI, say Y, otherwise N. + + The PCI-HOWTO, available from + , contains valuable + information about which PCI hardware does work under Linux and which + doesn't. + +choice + prompt "PCI access mode" + depends on PCI && !X86_VISWS + default PCI_GOANY + ---help--- + On PCI systems, the BIOS can be used to detect the PCI devices and + determine their configuration. However, some old PCI motherboards + have BIOS bugs and may crash if this is done. Also, some embedded + PCI-based systems don't have any BIOS at all. Linux can also try to + detect the PCI hardware directly without using the BIOS. 
+ + With this option, you can specify how Linux should detect the + PCI devices. If you choose "BIOS", the BIOS will be used, + if you choose "Direct", the BIOS won't be used, and if you + choose "MMConfig", then PCI Express MMCONFIG will be used. + If you choose "Any", the kernel will try MMCONFIG, then the + direct access method and fall back to the BIOS if that doesn't + work. If unsure, go with the default, which is "Any". + +config PCI_GOBIOS + bool "BIOS" + +config PCI_GOMMCONFIG + bool "MMConfig" + +config PCI_GODIRECT + bool "Direct" + +config PCI_GOANY + bool "Any" + +endchoice + +config PCI_BIOS + bool + depends on !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) + default y + +config PCI_DIRECT + bool + depends on PCI && ((PCI_GODIRECT || PCI_GOANY) || X86_VISWS) + default y + +config PCI_MMCONFIG + bool + depends on PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) + default y + +source "drivers/pci/pcie/Kconfig" + +source "drivers/pci/Kconfig" + +config ISA_DMA_API + bool + default y + +config ISA + bool "ISA support" + depends on !(X86_VOYAGER || X86_VISWS) + help + Find out whether you have ISA slots on your motherboard. ISA is the + name of a bus system, i.e. the way the CPU talks to the other stuff + inside your box. Other bus systems are PCI, EISA, MicroChannel + (MCA) or VESA. ISA is an older system, now being displaced by PCI; + newer boards don't support it. If you have ISA, say Y, otherwise N. + +config EISA + bool "EISA support" + depends on ISA + ---help--- + The Extended Industry Standard Architecture (EISA) bus was + developed as an open alternative to the IBM MicroChannel bus. + + The EISA bus provided some of the features of the IBM MicroChannel + bus while maintaining backward compatibility with cards made for + the older ISA bus. The EISA bus saw limited use between 1988 and + 1995 when it was made obsolete by the PCI bus. + + Say Y here if you are building a kernel for an EISA-based machine. + + Otherwise, say N. + +source "drivers/eisa/Kconfig" + +config MCA + bool "MCA support" if !(X86_VISWS || X86_VOYAGER) + default y if X86_VOYAGER + help + MicroChannel Architecture is found in some IBM PS/2 machines and + laptops. It is a bus system similar to PCI or ISA. See + (and especially the web page given + there) before attempting to build an MCA bus kernel. + +source "drivers/mca/Kconfig" + +config SCx200 + tristate "NatSemi SCx200 support" + depends on !X86_VOYAGER + help + This provides basic support for National Semiconductor's + (now AMD's) Geode processors. The driver probes for the + PCI-IDs of several on-chip devices, so it's a good dependency + for other scx200_* drivers. + + If compiled as a module, the driver is named scx200. + +config SCx200HR_TIMER + tristate "NatSemi SCx200 27MHz High-Resolution Timer Support" + depends on SCx200 && GENERIC_TIME + default y + help + This driver provides a clocksource built upon the on-chip + 27MHz high-resolution timer. It's also a workaround for + NSC Geode SC-1100's buggy TSC, which loses time when the + processor goes idle (as is done by the scheduler). The + other workaround is the idle=poll boot option.
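For reference, a clocksource of the kind the SCx200HR_TIMER help text describes plugs into GENERIC_TIME roughly as below. This is an illustrative sketch, not the driver added by this series; the I/O port and rating are made up, and it assumes the 2.6.18 clocksource interface (clocksource_hz2mult() and clocksource_register()):

	#include <linux/clocksource.h>
	#include <asm/io.h>

	#define EXAMPLE_TIMER_PORT	0x0000	/* hypothetical I/O address */

	static cycle_t example_27mhz_read(void)
	{
		return inl(EXAMPLE_TIMER_PORT);	/* free-running 27 MHz count */
	}

	static struct clocksource example_cs = {
		.name		= "example27m",
		.rating		= 250,
		.read		= example_27mhz_read,
		.mask		= CLOCKSOURCE_MASK(32),
		.shift		= 16,
		.is_continuous	= 1,
	};

	static int __init example_cs_init(void)
	{
		/* ns = (cycles * mult) >> shift, scaled for a 27 MHz clock */
		example_cs.mult = clocksource_hz2mult(27000000, example_cs.shift);
		return clocksource_register(&example_cs);
	}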
+ +config K8_NB + def_bool y + depends on AGP_AMD64 + +source "drivers/pcmcia/Kconfig" + +source "drivers/pci/hotplug/Kconfig" + +endmenu + +menu "Executable file formats" + +source "fs/Kconfig.binfmt" + +endmenu + +source "net/Kconfig" + +source "drivers/Kconfig" + +source "fs/Kconfig" + +menu "Instrumentation Support" + depends on EXPERIMENTAL + +source "arch/i386/oprofile/Kconfig" + +config KPROBES + bool "Kprobes (EXPERIMENTAL)" + depends on EXPERIMENTAL && MODULES + help + Kprobes allows you to trap at almost any kernel address and + execute a callback function. register_kprobe() establishes + a probepoint and specifies the callback. Kprobes is useful + for kernel debugging, non-intrusive instrumentation and testing. + If in doubt, say "N". +endmenu + +source "arch/i386/Kconfig.debug" + +source "security/Kconfig" + +source "crypto/Kconfig" + +source "lib/Kconfig" + +# +# Use the generic interrupt handling code in kernel/irq/: +# +config GENERIC_HARDIRQS + bool + default y + +config GENERIC_IRQ_PROBE + bool + default y + +config GENERIC_PENDING_IRQ + bool + depends on GENERIC_HARDIRQS && SMP + default y + +config X86_SMP + bool + depends on SMP && !X86_VOYAGER + default y + +config X86_HT + bool + depends on SMP && !(X86_VISWS || X86_VOYAGER) + default y + +config X86_BIOS_REBOOT + bool + depends on !(X86_VISWS || X86_VOYAGER) + default y + +config X86_TRAMPOLINE + bool + depends on X86_SMP || (X86_VOYAGER && SMP) + default y + +config KTIME_SCALAR + bool + default y diff -urN ./linux-2.6.18.1/arch/i386/boot/compressed/misc.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/boot/compressed/misc.c --- ./linux-2.6.18.1/arch/i386/boot/compressed/misc.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/boot/compressed/misc.c 2007-05-19 23:58:35.000000000 +0900 @@ -15,6 +15,12 @@ #include #include +#ifdef CONFIG_MCOUNT +void notrace mcount(void) +{ +} +#endif + /* * gzip declarations */ @@ -107,7 +113,7 @@ #define INPLACE_MOVE_ROUTINE 0x1000 #define LOW_BUFFER_START 0x2000 #define LOW_BUFFER_MAX 0x90000 -#define HEAP_SIZE 0x3000 +#define HEAP_SIZE 0x4000 static unsigned int low_buffer_end, low_buffer_size; static int high_loaded =0; static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/; diff -urN ./linux-2.6.18.1/arch/i386/kernel/Makefile linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/Makefile --- ./linux-2.6.18.1/arch/i386/kernel/Makefile 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/Makefile 2007-05-19 23:58:35.000000000 +0900 @@ -4,7 +4,7 @@ extra-y := head.o init_task.o vmlinux.lds -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ +obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o bootflag.o \ quirks.o i8237.o topology.o alternative.o i8253.o tsc.o @@ -12,6 +12,7 @@ obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ +obj-$(CONFIG_GENERIC_TIME_VSYSCALL) += vsyscall-gtod.o obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca.o obj-$(CONFIG_X86_MSR) += msr.o @@ -20,6 +21,7 @@ obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o smpboot.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-$(CONFIG_MCOUNT) += mcount-wrapper.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o @@ -30,6 +32,7 @@ obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o obj-$(CONFIG_KPROBES) += kprobes.o 
obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-y += sysenter.o vsyscall.o obj-$(CONFIG_ACPI_SRAT) += srat.o obj-$(CONFIG_HPET_TIMER) += time_hpet.o diff -urN ./linux-2.6.18.1/arch/i386/kernel/acpi/boot.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/acpi/boot.c --- ./linux-2.6.18.1/arch/i386/kernel/acpi/boot.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/acpi/boot.c 2007-05-19 23:58:35.000000000 +0900 @@ -53,8 +53,6 @@ #include #endif /* CONFIG_X86_LOCAL_APIC */ -static inline int gsi_irq_sharing(int gsi) { return gsi; } - #endif /* X86 */ #define BAD_MADT_ENTRY(entry, end) ( \ @@ -459,12 +457,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) { -#ifdef CONFIG_X86_IO_APIC - if (use_pci_vector() && !platform_legacy_irq(gsi)) - *irq = IO_APIC_VECTOR(gsi); - else -#endif - *irq = gsi_irq_sharing(gsi); + *irq = gsi; return 0; } @@ -575,6 +568,7 @@ } #ifdef CONFIG_HPET_TIMER +#include static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) { @@ -595,21 +589,13 @@ return -1; } #ifdef CONFIG_X86_64 - vxtime.hpet_address = hpet_tbl->addr.addrl | + hpet_address = hpet_tbl->addr.addrl | ((long)hpet_tbl->addr.addrh << 32); - +#else + hpet_address = hpet_tbl->addr.addrl; +#endif printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, vxtime.hpet_address); -#else /* X86 */ - { - extern unsigned long hpet_address; - - hpet_address = hpet_tbl->addr.addrl; - printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, hpet_address); - } -#endif /* X86 */ - + hpet_tbl->id, hpet_address); return 0; } #else diff -urN ./linux-2.6.18.1/arch/i386/kernel/apic.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/apic.c --- ./linux-2.6.18.1/arch/i386/kernel/apic.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/apic.c 2007-05-19 23:58:35.000000000 +0900 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -59,6 +60,23 @@ */ int apic_verbosity; +static unsigned int calibration_result; + +static void lapic_next_event(unsigned long delta, struct clock_event *evt); +static void lapic_timer_setup(int mode, struct clock_event *evt); + +static struct clock_event lapic_clockevent = { + .name = "lapic", + .capabilities = CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE +#ifdef CONFIG_SMP + | CLOCK_CAP_UPDATE +#endif + , + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, +}; +static DEFINE_PER_CPU(struct clock_event, lapic_events); static void apic_pm_activate(void); @@ -909,6 +927,11 @@ */ /* + * FIXME: Move this to i8253.h. There is no need to keep the access to + * the PIT scattered all around the place -tglx + */ + +/* * The timer chip is already set up at HZ interrupts per second here, * but we do not accept timer interrupts yet. We only allow the BP * to calibrate. 
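The apic.c changes above turn the local APIC timer into a clock_event device; the calibration hunks that follow fill in its mult factor with div_sc(tt1-tt2, TICK_NSEC * LOOPS, 32) so that programming the next event needs no division in the hot path. A standalone userspace sketch of that scaled math, with made-up numbers (not part of the patch):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* calibration: 12,500,000 bus cycles counted in a 10 ms window */
		uint64_t cycles = 12500000, window_ns = 10000000;
		unsigned int shift = 32;

		/* mult = (cycles << shift) / ns, as div_sc() computes */
		uint64_t mult = (cycles << shift) / window_ns;

		/* to fire 250 us from now: cycles = (ns * mult) >> shift */
		uint64_t delta = (250000 * mult) >> shift;

		/* 1.25 cycles/ns * 250,000 ns = 312,500 cycles */
		printf("mult=%llu delta=%llu\n",
		       (unsigned long long)mult, (unsigned long long)delta);
		return 0;
	}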
@@ -966,13 +989,15 @@ #define APIC_DIVISOR 16 -static void __setup_APIC_LVTT(unsigned int clocks) +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot) { unsigned int lvtt_value, tmp_value, ver; int cpu = smp_processor_id(); ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; if (!APIC_INTEGRATED(ver)) lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); @@ -989,23 +1014,31 @@ & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | APIC_TDR_DIV_16); - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); + if (!oneshot) + apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); +} + +static void lapic_next_event(unsigned long delta, struct clock_event *evt) +{ + apic_write_around(APIC_TMICT, delta); } -static void __devinit setup_APIC_timer(unsigned int clocks) +static void lapic_timer_setup(int mode, struct clock_event *evt) { unsigned long flags; local_irq_save(flags); + __setup_APIC_LVTT(calibration_result, mode != CLOCK_EVT_PERIODIC); + local_irq_restore(flags); +} - /* - * Wait for IRQ0's slice: - */ - wait_timer_tick(); +static void __devinit setup_APIC_timer(void) +{ + struct clock_event *levt = &__get_cpu_var(lapic_events); - __setup_APIC_LVTT(clocks); + memcpy(levt, &lapic_clockevent, sizeof(*levt)); - local_irq_restore(flags); + register_local_clockevent(levt); } /* @@ -1014,6 +1047,8 @@ * to calibrate, since some later bootup code depends on getting * the first irq? Ugh. * + * TODO: Fix this rather than saying "Ugh" -tglx + * * We want to do the calibration only once since we * want to have local timer irqs syncron. CPUs connected * by the same APIC bus have the very same bus frequency. @@ -1036,7 +1071,7 @@ * value into the APIC clock, we just want to get the * counter running for calibration. */ - __setup_APIC_LVTT(1000000000); + __setup_APIC_LVTT(1000000000, 0); /* * The timer chip counts down to zero. Let's wait @@ -1073,6 +1108,14 @@ result = (tt1-tt2)*APIC_DIVISOR/LOOPS; + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(tt1-tt2, TICK_NSEC * LOOPS, 32); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + printk("lapic max_delta_ns: %ld\n", lapic_clockevent.max_delta_ns); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + if (cpu_has_tsc) apic_printk(APIC_VERBOSE, "..... CPU clock speed is " "%ld.%04ld MHz.\n", @@ -1087,8 +1130,6 @@ return result; } -static unsigned int calibration_result; - void __init setup_boot_APIC_clock(void) { unsigned long flags; @@ -1101,14 +1142,14 @@ /* * Now set up the timer for real. 
*/ - setup_APIC_timer(calibration_result); + setup_APIC_timer(); local_irq_restore(flags); } void __devinit setup_secondary_APIC_clock(void) { - setup_APIC_timer(calibration_result); + setup_APIC_timer(); } void disable_APIC_timer(void) @@ -1154,6 +1195,13 @@ !cpu_isset(cpu, timer_bcast_ipi)) { disable_APIC_timer(); cpu_set(cpu, timer_bcast_ipi); +#ifdef CONFIG_HIGH_RES_TIMERS + printk("Disabling NO_HZ and high resolution timers " + "due to timer broadcasting\n"); + for_each_possible_cpu(cpu) + per_cpu(lapic_events, cpu).capabilities &= + ~CLOCK_CAP_NEXTEVT; +#endif } } EXPORT_SYMBOL(switch_APIC_timer_to_ipi); @@ -1190,6 +1238,8 @@ update_process_times(user_mode_vm(regs)); #endif + trace_special(regs->eip, 0, 0); + /* * We take the 'long' return path, and there every subsystem * grabs the apropriate locks (kernel lock/ irq lock). @@ -1211,15 +1261,18 @@ * interrupt as well. Thus we cannot inline the local irq ... ] */ -fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) +fastcall notrace void smp_apic_timer_interrupt(struct pt_regs *regs) { int cpu = smp_processor_id(); + struct clock_event *evt = &per_cpu(lapic_events, cpu); /* * the NMI deadlock-detector uses this. */ per_cpu(irq_stat, cpu).apic_timer_irqs++; + trace_special(regs->eip, 0, 0); + /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. @@ -1231,7 +1284,15 @@ * interrupt lock, which is the WrongThing (tm) to do. */ irq_enter(); - smp_local_timer_interrupt(regs); + /* + * If the task is currently running in user mode, don't + * detect soft lockups. If CONFIG_DETECT_SOFTLOCKUP is not + * configured, this should be optimized out. + */ + if (user_mode(regs)) + touch_softlockup_watchdog(); + + evt->event_handler(regs); irq_exit(); } @@ -1240,6 +1301,8 @@ { int cpu = smp_processor_id(); + trace_special(regs->eip, 1, 0); + /* * the NMI deadlock-detector uses this. */ @@ -1323,6 +1386,7 @@ */ printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); + dump_stack(); irq_exit(); } diff -urN ./linux-2.6.18.1/arch/i386/kernel/apm.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/apm.c --- ./linux-2.6.18.1/arch/i386/kernel/apm.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/apm.c 2007-05-19 23:58:35.000000000 +0900 @@ -233,7 +233,6 @@ #include "io_ports.h" -extern unsigned long get_cmos_time(void); extern void machine_real_restart(unsigned char *, int); #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) @@ -1152,26 +1151,6 @@ spin_unlock(&user_list_lock); } -static void set_time(void) -{ - if (got_clock_diff) { /* Must know time zone in order to set clock */ - xtime.tv_sec = get_cmos_time() + clock_cmos_diff; - xtime.tv_nsec = 0; - } -} - -static void get_time_diff(void) -{ -#ifndef CONFIG_APM_RTC_IS_GMT - /* - * Estimate time zone so that set_time can update the clock - */ - clock_cmos_diff = -get_cmos_time(); - clock_cmos_diff += get_seconds(); - got_clock_diff = 1; -#endif -} - static void reinit_timer(void) { #ifdef INIT_TIMER_AFTER_SUSPEND @@ -1211,19 +1190,6 @@ local_irq_disable(); device_power_down(PMSG_SUSPEND); - /* serialize with the timer interrupt */ - write_seqlock(&xtime_lock); - - /* protect against access to timer chip registers */ - spin_lock(&i8253_lock); - - get_time_diff(); - /* - * Irq spinlock must be dropped around set_system_power_state. - * We'll undo any timer changes due to interrupts below. 
- */ - spin_unlock(&i8253_lock); - write_sequnlock(&xtime_lock); local_irq_enable(); save_processor_state(); @@ -1232,13 +1198,7 @@ restore_processor_state(); local_irq_disable(); - write_seqlock(&xtime_lock); - spin_lock(&i8253_lock); reinit_timer(); - set_time(); - - spin_unlock(&i8253_lock); - write_sequnlock(&xtime_lock); if (err == APM_NO_ERROR) err = APM_SUCCESS; @@ -1267,11 +1227,6 @@ local_irq_disable(); device_power_down(PMSG_SUSPEND); - /* serialize with the timer interrupt */ - write_seqlock(&xtime_lock); - /* If needed, notify drivers here */ - get_time_diff(); - write_sequnlock(&xtime_lock); local_irq_enable(); err = set_system_power_state(APM_STATE_STANDBY); @@ -1365,9 +1320,6 @@ ignore_bounce = 1; if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) { - write_seqlock_irq(&xtime_lock); - set_time(); - write_sequnlock_irq(&xtime_lock); device_resume(); pm_send_all(PM_RESUME, (void *)0); queue_event(event, NULL); @@ -1383,9 +1335,6 @@ break; case APM_UPDATE_TIME: - write_seqlock_irq(&xtime_lock); - set_time(); - write_sequnlock_irq(&xtime_lock); break; case APM_CRITICAL_SUSPEND: diff -urN ./linux-2.6.18.1/arch/i386/kernel/cpu/mtrr/generic.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/cpu/mtrr/generic.c --- ./linux-2.6.18.1/arch/i386/kernel/cpu/mtrr/generic.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/cpu/mtrr/generic.c 2007-05-19 23:58:35.000000000 +0900 @@ -234,7 +234,7 @@ static unsigned long cr4 = 0; static u32 deftype_lo, deftype_hi; -static DEFINE_SPINLOCK(set_atomicity_lock); +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts - they diff -urN ./linux-2.6.18.1/arch/i386/kernel/cpu/mtrr/main.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/cpu/mtrr/main.c --- ./linux-2.6.18.1/arch/i386/kernel/cpu/mtrr/main.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/cpu/mtrr/main.c 2007-05-19 23:58:35.000000000 +0900 @@ -135,8 +135,6 @@ mtrr_type smp_type; }; -#ifdef CONFIG_SMP - static void ipi_handler(void *info) /* [SUMMARY] Synchronisation handler. Executed by "other" CPUs. [RETURNS] Nothing. 
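The conversion of set_atomicity_lock to DEFINE_RAW_SPINLOCK above (and the now-unconditional ipi_handler here) reflects a core rule of this series: on PREEMPT_RT an ordinary spinlock_t may become a sleeping lock, so code that must run atomically with interrupts hard-disabled, such as reprogramming MTRRs with the caches off, has to use a raw spinlock, which keeps true spinning semantics. A sketch of the distinction under the -rt lock-type-switching API (names illustrative, not from the patch):

	static DEFINE_RAW_SPINLOCK(example_hw_lock);	/* always spins, irqs off */
	static DEFINE_SPINLOCK(example_data_lock);	/* may sleep on PREEMPT_RT */

	static void example_poke_hardware(void)
	{
		unsigned long flags;

		/* atomic section: no preemption, no sleeping allowed inside */
		spin_lock_irqsave(&example_hw_lock, flags);
		/* ... touch registers that must not be interrupted ... */
		spin_unlock_irqrestore(&example_hw_lock, flags);
	}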
@@ -166,8 +164,6 @@ local_irq_restore(flags); } -#endif - /** * set_mtrr - update mtrrs on all processors * @reg: mtrr in question diff -urN ./linux-2.6.18.1/arch/i386/kernel/cpu/transmeta.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/cpu/transmeta.c --- ./linux-2.6.18.1/arch/i386/kernel/cpu/transmeta.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/cpu/transmeta.c 2007-05-19 23:58:35.000000000 +0900 @@ -9,7 +9,8 @@ { unsigned int cap_mask, uk, max, dummy; unsigned int cms_rev1, cms_rev2; - unsigned int cpu_rev, cpu_freq, cpu_flags, new_cpu_rev; + unsigned int cpu_rev, cpu_freq = 0 /* shut up gcc warning */, + cpu_flags, new_cpu_rev; char cpu_info[65]; get_model_name(c); /* Same as AMD/Cyrix */ diff -urN ./linux-2.6.18.1/arch/i386/kernel/entry.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/entry.S --- ./linux-2.6.18.1/arch/i386/kernel/entry.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/entry.S 2007-05-23 20:41:33.000000000 +0900 @@ -235,7 +235,20 @@ movb CS(%esp), %al testl $(VM_MASK | 3), %eax jz resume_kernel + ENTRY(resume_userspace) +#ifdef CONFIG_CABI + sti + movl cabi_ret_with_reschedule_hook,%eax + testl %eax,%eax /* if (hook == 0) */ + je 1f /* yes, then skip it */ + call *%eax /* no, then call hook */ + testl %eax,%eax /* if (ret != 0) */ + jne resume_userspace /* yes, then jump back */ + /* jne work_pending */ +1: +#endif + cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret @@ -247,15 +260,29 @@ #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) +#ifdef CONFIG_CABI + movl cabi_ret_with_reschedule_hook,%eax + testl %eax,%eax /* if (hook == 0) */ + je 1f /* yes, then skip it */ + call *%eax /* no, then call hook */ + testl %eax,%eax /* if (ret != 0) */ + jne syscall_exit /* yes, then jump back */ + /* jne work_pending */ +1: +#endif cli + cmpl $0, kernel_preemption + jz restore_nocheck cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl - jz restore_all + jz restore_nocheck testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? 
- jz restore_all + jz restore_nocheck + cli + TRACE_IRQS_OFF call preempt_schedule_irq jmp need_resched #endif @@ -311,6 +338,11 @@ pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -322,9 +354,15 @@ movl %eax,EAX(%esp) cli TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work +#ifdef CONFIG_LATENCY_TRACE + pushl %eax + call sys_ret + popl %eax +#endif /* if something modifies registers it must also disable sysexit */ movl EIP(%esp), %edx movl OLDESP(%esp), %ecx @@ -341,6 +379,11 @@ pushl %eax # save orig_eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) testl $TF_MASK,EFLAGS(%esp) jz no_singlestep @@ -430,19 +473,20 @@ ALIGN RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jz work_notifysig work_resched: - call schedule - cli # make sure we don't miss an interrupt + cli + TRACE_IRQS_OFF + call __schedule + # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret - TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? jz restore_all - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jnz work_resched work_notifysig: # deal with pending signals and diff -urN ./linux-2.6.18.1/arch/i386/kernel/entry.S.orig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/entry.S.orig --- ./linux-2.6.18.1/arch/i386/kernel/entry.S.orig 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/entry.S.orig 2007-05-19 23:58:35.000000000 +0900 @@ -0,0 +1,975 @@ +/* + * linux/arch/i386/entry.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * This also contains the timer-interrupt handler, as well as all interrupts + * and faults that can result in a task-switch. + * + * NOTE: This code handles signal-recognition, which happens every time + * after a timer-interrupt and after each system call. + * + * I changed all the .align's to 4 (16 byte alignment), as that's faster + * on a 486. + * + * Stack layout in 'ret_from_system_call': + * ptrace needs to have all regs on the stack. + * if the order here is changed, it needs to be + * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - orig_eax + * 28(%esp) - %eip + * 2C(%esp) - %cs + * 30(%esp) - %eflags + * 34(%esp) - %oldesp + * 38(%esp) - %oldss + * + * "current" is in register %ebx during any slow entries. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "irq_vectors.h" + +#define nr_syscalls ((syscall_table_size)/4) + +EBX = 0x00 +ECX = 0x04 +EDX = 0x08 +ESI = 0x0C +EDI = 0x10 +EBP = 0x14 +EAX = 0x18 +DS = 0x1C +ES = 0x20 +ORIG_EAX = 0x24 +EIP = 0x28 +CS = 0x2C +EFLAGS = 0x30 +OLDESP = 0x34 +OLDSS = 0x38 + +CF_MASK = 0x00000001 +TF_MASK = 0x00000100 +IF_MASK = 0x00000200 +DF_MASK = 0x00000400 +NT_MASK = 0x00004000 +VM_MASK = 0x00020000 + +#ifdef CONFIG_PREEMPT +#define preempt_stop cli; TRACE_IRQS_OFF +#else +#define preempt_stop +#define resume_kernel restore_nocheck +#endif + +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $IF_MASK,EFLAGS(%esp) # interrupts off? + jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + +#ifdef CONFIG_VM86 +#define resume_userspace_sig check_userspace +#else +#define resume_userspace_sig resume_userspace +#endif + +#define SAVE_ALL \ + cld; \ + pushl %es; \ + CFI_ADJUST_CFA_OFFSET 4;\ + /*CFI_REL_OFFSET es, 0;*/\ + pushl %ds; \ + CFI_ADJUST_CFA_OFFSET 4;\ + /*CFI_REL_OFFSET ds, 0;*/\ + pushl %eax; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET eax, 0;\ + pushl %ebp; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ebp, 0;\ + pushl %edi; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET edi, 0;\ + pushl %esi; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET esi, 0;\ + pushl %edx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET edx, 0;\ + pushl %ecx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ecx, 0;\ + pushl %ebx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ebx, 0;\ + movl $(__USER_DS), %edx; \ + movl %edx, %ds; \ + movl %edx, %es; + +#define RESTORE_INT_REGS \ + popl %ebx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ebx;\ + popl %ecx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ecx;\ + popl %edx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE edx;\ + popl %esi; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE esi;\ + popl %edi; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE edi;\ + popl %ebp; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ebp;\ + popl %eax; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE eax + +#define RESTORE_REGS \ + RESTORE_INT_REGS; \ +1: popl %ds; \ + CFI_ADJUST_CFA_OFFSET -4;\ + /*CFI_RESTORE ds;*/\ +2: popl %es; \ + CFI_ADJUST_CFA_OFFSET -4;\ + /*CFI_RESTORE es;*/\ +.section .fixup,"ax"; \ +3: movl $0,(%esp); \ + jmp 1b; \ +4: movl $0,(%esp); \ + jmp 2b; \ +.previous; \ +.section __ex_table,"a";\ + .align 4; \ + .long 1b,3b; \ + .long 2b,4b; \ +.previous + +#define RING0_INT_FRAME \ + CFI_STARTPROC simple;\ + CFI_DEF_CFA esp, 3*4;\ + /*CFI_OFFSET cs, -2*4;*/\ + CFI_OFFSET eip, -3*4 + +#define RING0_EC_FRAME \ + CFI_STARTPROC simple;\ + CFI_DEF_CFA esp, 4*4;\ + /*CFI_OFFSET cs, -2*4;*/\ + CFI_OFFSET eip, -3*4 + +#define RING0_PTREGS_FRAME \ + CFI_STARTPROC simple;\ + CFI_DEF_CFA esp, OLDESP-EBX;\ + /*CFI_OFFSET cs, CS-OLDESP;*/\ + CFI_OFFSET eip, EIP-OLDESP;\ + /*CFI_OFFSET es, ES-OLDESP;*/\ + /*CFI_OFFSET ds, DS-OLDESP;*/\ + CFI_OFFSET eax, EAX-OLDESP;\ + CFI_OFFSET ebp, EBP-OLDESP;\ + CFI_OFFSET edi, EDI-OLDESP;\ + CFI_OFFSET esi, ESI-OLDESP;\ + CFI_OFFSET edx, EDX-OLDESP;\ + CFI_OFFSET ecx, ECX-OLDESP;\ + CFI_OFFSET ebx, EBX-OLDESP + +ENTRY(ret_from_fork) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + pushl $0x0202 # Reset kernel eflags + CFI_ADJUST_CFA_OFFSET 4 + popfl + CFI_ADJUST_CFA_OFFSET -4 + jmp syscall_exit + CFI_ENDPROC + +/* + * Return to user mode is not as complex as all this 
looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. + */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN + RING0_PTREGS_FRAME +ret_from_exception: + preempt_stop +ret_from_intr: + GET_THREAD_INFO(%ebp) +check_userspace: + movl EFLAGS(%esp), %eax # mix EFLAGS and CS + movb CS(%esp), %al + testl $(VM_MASK | 3), %eax + jz resume_kernel +ENTRY(resume_userspace) + cli # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + cli + cmpl $0, kernel_preemption + jz restore_nocheck + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? + jnz restore_nocheck +need_resched: + movl TI_flags(%ebp), %ecx # need_resched set ? + testb $_TIF_NEED_RESCHED, %cl + jz restore_nocheck + testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_nocheck + cli + TRACE_IRQS_OFF + call preempt_schedule_irq + jmp need_resched +#endif + CFI_ENDPROC + +/* SYSENTER_RETURN points to after the "sysenter" instruction in + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ + + # sysenter call handler stub +ENTRY(sysenter_entry) + CFI_STARTPROC simple + CFI_DEF_CFA esp, 0 + CFI_REGISTER esp, ebp + movl TSS_sysenter_esp0(%esp),%esp +sysenter_past_esp: + /* + * No need to follow this irqs on/off section: the syscall + * disabled irqs and here we enable it straight after entry: + */ + sti + pushl $(__USER_DS) + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ss, 0*/ + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esp, 0 + pushfl + CFI_ADJUST_CFA_OFFSET 4 + pushl $(__USER_CS) + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET cs, 0*/ + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words + * pushed above; +8 corresponds to copy_thread's esp0 setting. + */ + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eip, 0 + +/* + * Load the potential sixth argument from user stack. + * Careful about security. 
+ */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,syscall_fault +.previous + + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif + GET_THREAD_INFO(%ebp) + + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(nr_syscalls), %eax + jae syscall_badsys + call *sys_call_table(,%eax,4) + movl %eax,EAX(%esp) + cli + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testw $_TIF_ALLWORK_MASK, %cx + jne syscall_exit_work +#ifdef CONFIG_LATENCY_TRACE + pushl %eax + call sys_ret + popl %eax +#endif +/* if something modifies registers it must also disable sysexit */ + movl EIP(%esp), %edx + movl OLDESP(%esp), %ecx + xorl %ebp,%ebp + TRACE_IRQS_ON + sti + sysexit + CFI_ENDPROC + + + # system call handler stub +ENTRY(system_call) + RING0_INT_FRAME # can't unwind into user space anyway + pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif + GET_THREAD_INFO(%ebp) + testl $TF_MASK,EFLAGS(%esp) + jz no_singlestep + orl $_TIF_SINGLESTEP,TI_flags(%ebp) +no_singlestep: + # system call tracing in operation / emulation + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(nr_syscalls), %eax + jae syscall_badsys +syscall_call: + call *sys_call_table(,%eax,4) + movl %eax,EAX(%esp) # store the return value +syscall_exit: + cli # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testw $_TIF_ALLWORK_MASK, %cx # current->work + jne syscall_exit_work + +restore_all: + movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + # Warning: OLDSS(%esp) contains the wrong/random values if we + # are returning to the kernel. + # See comments in process.c:copy_thread() for details. + movb OLDSS(%esp), %ah + movb CS(%esp), %al + andl $(VM_MASK | (4 << 8) | 3), %eax + cmpl $((4 << 8) | 3), %eax + CFI_REMEMBER_STATE + je ldt_ss # returning to user-space with LDT SS +restore_nocheck: + TRACE_IRQS_IRET +restore_nocheck_notrace: + RESTORE_REGS + addl $4, %esp + CFI_ADJUST_CFA_OFFSET -4 +1: iret +.section .fixup,"ax" +iret_exc: + TRACE_IRQS_ON + sti + pushl $0 # no error code + pushl $do_iret_error + jmp error_code +.previous +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous + + CFI_RESTORE_STATE +ldt_ss: + larl OLDSS(%esp), %eax + jnz restore_nocheck + testl $0x00400000, %eax # returning to 32bit stack? + jnz restore_nocheck # allright, normal return + /* If returning to userspace with 16bit stack, + * try to fix the higher word of ESP, as the CPU + * won't restore it. + * This is an "official" bug of all the x86-compatible + * CPUs, which we can try to work around to make + * dosemu and wine happy. */ + subl $8, %esp # reserve space for switch16 pointer + CFI_ADJUST_CFA_OFFSET 8 + cli + TRACE_IRQS_OFF + movl %esp, %eax + /* Set up the 16bit stack frame with switch32 pointer on top, + * and a switch16 pointer on top of the current frame. 
*/ + call setup_x86_bogus_stack + CFI_ADJUST_CFA_OFFSET -8 # frame has moved + TRACE_IRQS_IRET + RESTORE_REGS + lss 20+4(%esp), %esp # switch to 16bit stack +1: iret +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous + CFI_ENDPROC + + # perform work that needs to be done immediately before resumption + ALIGN + RING0_PTREGS_FRAME # can't unwind into user space anyway +work_pending: + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx + jz work_notifysig +work_resched: + cli + TRACE_IRQS_OFF + call __schedule + # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? + jz restore_all + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx + jnz work_resched + +work_notifysig: # deal with pending signals and + # notify-resume requests + testl $VM_MASK, EFLAGS(%esp) + movl %esp, %eax + jne work_notifysig_v86 # returning to kernel-space or + # vm86-space + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace_sig + + ALIGN +work_notifysig_v86: +#ifdef CONFIG_VM86 + pushl %ecx # save ti_flags for do_notify_resume + CFI_ADJUST_CFA_OFFSET 4 + call save_v86_state # %eax contains pt_regs pointer + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + movl %eax, %esp + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace_sig +#endif + + # perform syscall exit tracing + ALIGN +syscall_trace_entry: + movl $-ENOSYS,EAX(%esp) + movl %esp, %eax + xorl %edx,%edx + call do_syscall_trace + cmpl $0, %eax + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, + # so must skip actual syscall + movl ORIG_EAX(%esp), %eax + cmpl $(nr_syscalls), %eax + jnae syscall_call + jmp syscall_exit + + # perform syscall exit tracing + ALIGN +syscall_exit_work: + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl + jz work_pending + TRACE_IRQS_ON + sti # could let do_syscall_trace() call + # schedule() instead + movl %esp, %eax + movl $1, %edx + call do_syscall_trace + jmp resume_userspace + CFI_ENDPROC + + RING0_INT_FRAME # can't unwind into user space anyway +syscall_fault: + pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + GET_THREAD_INFO(%ebp) + movl $-EFAULT,EAX(%esp) + jmp resume_userspace + +syscall_badsys: + movl $-ENOSYS,EAX(%esp) + jmp resume_userspace + CFI_ENDPROC + +#define FIXUP_ESPFIX_STACK \ + movl %esp, %eax; \ + /* switch to 32bit stack using the pointer on top of 16bit stack */ \ + lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \ + /* copy data from 16bit stack to 32bit stack */ \ + call fixup_x86_bogus_stack; \ + /* put ESP to the proper location */ \ + movl %eax, %esp; +#define UNWIND_ESPFIX_STACK \ + pushl %eax; \ + CFI_ADJUST_CFA_OFFSET 4; \ + movl %ss, %eax; \ + /* see if on 16bit stack */ \ + cmpw $__ESPFIX_SS, %ax; \ + je 28f; \ +27: popl %eax; \ + CFI_ADJUST_CFA_OFFSET -4; \ +.section .fixup,"ax"; \ +28: movl $__KERNEL_DS, %eax; \ + movl %eax, %ds; \ + movl %eax, %es; \ + /* switch to 32bit stack */ \ + FIXUP_ESPFIX_STACK; \ + jmp 27b; \ +.previous + +/* + * Build the entry stubs and pointer table with + * some assembler magic. 
+ */ +.data +ENTRY(interrupt) +.text + +vector=0 +ENTRY(irq_entries_start) + RING0_INT_FRAME +.rept NR_IRQS + ALIGN + .if vector + CFI_ADJUST_CFA_OFFSET -4 + .endif +1: pushl $~(vector) + CFI_ADJUST_CFA_OFFSET 4 + jmp common_interrupt +.data + .long 1b +.text +vector=vector+1 +.endr + +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ + ALIGN +common_interrupt: + SAVE_ALL + TRACE_IRQS_OFF + movl %esp,%eax + call do_IRQ + jmp ret_from_intr + CFI_ENDPROC + +#define BUILD_INTERRUPT(name, nr) \ +ENTRY(name) \ + RING0_INT_FRAME; \ + pushl $~(nr); \ + CFI_ADJUST_CFA_OFFSET 4; \ + SAVE_ALL; \ + TRACE_IRQS_OFF \ + movl %esp,%eax; \ + call smp_/**/name; \ + jmp ret_from_intr; \ + CFI_ENDPROC + +/* The include is where all of the SMP etc. interrupts come from */ +#include "entry_arch.h" + +ENTRY(divide_error) + RING0_INT_FRAME + pushl $0 # no error code + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_divide_error + CFI_ADJUST_CFA_OFFSET 4 + ALIGN +error_code: + pushl %ds + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ds, 0*/ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eax, 0 + xorl %eax, %eax + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + decl %eax # eax = -1 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + cld + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ + UNWIND_ESPFIX_STACK + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_REGISTER es, ecx*/ + movl ES(%esp), %edi # get the function address + movl ORIG_EAX(%esp), %edx # get the error code + movl %eax, ORIG_EAX(%esp) + movl %ecx, ES(%esp) + /*CFI_REL_OFFSET es, ES*/ + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception + CFI_ENDPROC + +ENTRY(coprocessor_error) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_coprocessor_error + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(simd_coprocessor_error) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_simd_coprocessor_error + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(device_not_available) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + movl %cr0, %eax + testl $0x4, %eax # EM (math emulation bit) + jne device_not_available_emulate + preempt_stop + call math_state_restore + jmp ret_from_exception +device_not_available_emulate: + pushl $0 # temporary storage for ORIG_EIP + CFI_ADJUST_CFA_OFFSET 4 + call math_emulate + addl $4, %esp + CFI_ADJUST_CFA_OFFSET -4 + jmp ret_from_exception + CFI_ENDPROC + +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. 
+ */ +#define FIX_STACK(offset, ok, label) \ + cmpw $__KERNEL_CS,4(%esp); \ + jne ok; \ +label: \ + movl TSS_sysenter_esp0+offset(%esp),%esp; \ + pushfl; \ + pushl $__KERNEL_CS; \ + pushl $sysenter_past_esp + +KPROBE_ENTRY(debug) + RING0_INT_FRAME + cmpl $sysenter_entry,(%esp) + jne debug_stack_correct + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) +debug_stack_correct: + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception + CFI_ENDPROC + .previous .text +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + RING0_INT_FRAME + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + je nmi_16bit_stack + cmpl $sysenter_entry,(%esp) + je nmi_stack_fixup + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + jae nmi_stack_correct + cmpl $sysenter_entry,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_nocheck_notrace + CFI_ENDPROC + +nmi_stack_fixup: + FIX_STACK(12,nmi_stack_correct, 1) + jmp nmi_stack_correct +nmi_debug_stack_check: + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct + FIX_STACK(24,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_16bit_stack: + RING0_INT_FRAME + /* create the pointer to lss back */ + pushl %ss + CFI_ADJUST_CFA_OFFSET 4 + pushl %esp + CFI_ADJUST_CFA_OFFSET 4 + movzwl %sp, %esp + addw $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + CFI_ADJUST_CFA_OFFSET 4 + .endr + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + FIXUP_ESPFIX_STACK # %eax == %esp + CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved + xorl %edx,%edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to 16bit stack +1: iret + CFI_ENDPROC +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous + +KPROBE_ENTRY(int3) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + jmp ret_from_exception + CFI_ENDPROC + .previous .text + +ENTRY(overflow) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_overflow + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(bounds) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_bounds + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(invalid_op) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_invalid_op + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(coprocessor_segment_overrun) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_coprocessor_segment_overrun + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + 
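+/*
+ * Note for readers of this listing (annotation, not part of the
+ * original file): the trap stubs above and below all share one
+ * dispatch pattern -- push a (possibly fake) error code, push the
+ * address of the C handler, then jump to the common error_code path,
+ * which builds the pt_regs frame and invokes the handler with the
+ * register frame and error code.  A stub for a trap that pushes no
+ * hardware error code reduces to roughly this sketch (hypothetical
+ * names, not an actual entry point in this file):
+ *
+ *	ENTRY(some_trap)
+ *		RING0_INT_FRAME
+ *		pushl $0			# fake error code
+ *		CFI_ADJUST_CFA_OFFSET 4
+ *		pushl $do_some_trap		# C handler
+ *		CFI_ADJUST_CFA_OFFSET 4
+ *		jmp error_code
+ *		CFI_ENDPROC
+ *
+ * Traps for which the CPU does push an error code (invalid_TSS below,
+ * for example) omit the "pushl $0" and use RING0_EC_FRAME so the
+ * unwind annotations account for the extra word.
+ */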
+ENTRY(invalid_TSS) + RING0_EC_FRAME + pushl $do_invalid_TSS + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(segment_not_present) + RING0_EC_FRAME + pushl $do_segment_not_present + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(stack_segment) + RING0_EC_FRAME + pushl $do_stack_segment + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +KPROBE_ENTRY(general_protection) + RING0_EC_FRAME + pushl $do_general_protection + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + .previous .text + +ENTRY(alignment_check) + RING0_EC_FRAME + pushl $do_alignment_check + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +KPROBE_ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + .previous .text + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl machine_check_vector + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +#endif + +ENTRY(spurious_interrupt_bug) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_spurious_interrupt_bug + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) + CFI_STARTPROC + movl 4(%esp), %edx + movl (%esp), %ecx + leal 4(%esp), %eax + movl %ebx, EBX(%edx) + xorl %ebx, %ebx + movl %ebx, ECX(%edx) + movl %ebx, EDX(%edx) + movl %esi, ESI(%edx) + movl %edi, EDI(%edx) + movl %ebp, EBP(%edx) + movl %ebx, EAX(%edx) + movl $__USER_DS, DS(%edx) + movl $__USER_DS, ES(%edx) + movl %ebx, ORIG_EAX(%edx) + movl %ecx, EIP(%edx) + movl 12(%esp), %ecx + movl $__KERNEL_CS, CS(%edx) + movl %ebx, EFLAGS(%edx) + movl %eax, OLDESP(%edx) + movl 8(%esp), %eax + movl %ecx, 8(%esp) + movl EBX(%edx), %ebx + movl $__KERNEL_DS, OLDSS(%edx) + jmpl *%eax + CFI_ENDPROC +ENDPROC(arch_unwind_init_running) +#endif + +.section .rodata,"a" +#include "syscall_table.S" + +syscall_table_size=(.-sys_call_table) diff -urN ./linux-2.6.18.1/arch/i386/kernel/head.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/head.S --- ./linux-2.6.18.1/arch/i386/kernel/head.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/head.S 2007-05-19 23:58:35.000000000 +0900 @@ -397,6 +397,7 @@ call printk #endif addl $(5*4),%esp + call dump_stack popl %ds popl %es popl %edx diff -urN ./linux-2.6.18.1/arch/i386/kernel/i386_ksyms.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/i386_ksyms.c --- ./linux-2.6.18.1/arch/i386/kernel/i386_ksyms.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/i386_ksyms.c 2007-05-19 23:58:35.000000000 +0900 @@ -2,10 +2,12 @@ #include #include -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_ASM_SEMAPHORES +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif /* Networking helper routines. 
*/ EXPORT_SYMBOL(csum_partial_copy_generic); @@ -20,7 +22,7 @@ EXPORT_SYMBOL(strstr); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_ASM_SEMAPHORES) extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); EXPORT_SYMBOL(__write_lock_failed); diff -urN ./linux-2.6.18.1/arch/i386/kernel/i8253.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/i8253.c --- ./linux-2.6.18.1/arch/i386/kernel/i8253.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/i8253.c 2007-05-19 23:58:35.000000000 +0900 @@ -2,7 +2,7 @@ * i8253.c 8253/PIT functions * */ -#include +#include #include #include #include @@ -16,22 +16,66 @@ #include "io_ports.h" -DEFINE_SPINLOCK(i8253_lock); +DEFINE_RAW_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -void setup_pit_timer(void) +static void init_pit_timer(int mode, struct clock_event *evt) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + + switch(mode) { + case CLOCK_EVT_PERIODIC: + /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(0x34, PIT_MODE); + udelay(10); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + break; + + case CLOCK_EVT_ONESHOT: + case CLOCK_EVT_SHUTDOWN: + /* One shot setup */ + outb_p(0x38, PIT_MODE); + udelay(10); + break; + } + spin_unlock_irqrestore(&i8253_lock, flags); +} + +static void pit_next_event(unsigned long delta, struct clock_event *evt) { unsigned long flags; spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); - outb(LATCH >> 8 , PIT_CH0); /* MSB */ + outb_p(delta & 0xff , PIT_CH0); /* LSB */ + outb(delta >> 8 , PIT_CH0); /* MSB */ spin_unlock_irqrestore(&i8253_lock, flags); } +struct clock_event pit_clockevent = { + .name = "pit", + .capabilities = CLOCK_CAP_TICK | CLOCK_CAP_PROFILE | CLOCK_CAP_UPDATE +#ifndef CONFIG_SMP + | CLOCK_CAP_NEXTEVT +#endif + , + .set_mode = init_pit_timer, + .set_next_event = pit_next_event, + .shift = 32, +}; + +void setup_pit_timer(void) +{ + pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32); + pit_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFF, &pit_clockevent); + pit_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &pit_clockevent); + register_global_clockevent(&pit_clockevent); +} + /* * Since the PIT overflows every tick, its not very useful * to just read by itself. 
So use jiffies to emulate a free @@ -46,7 +90,7 @@ static u32 old_jifs; spin_lock_irqsave(&i8253_lock, flags); - /* + /* * Although our caller may have the read side of xtime_lock, * this is now a seqlock, and we are cheating in this routine * by having side effects on state that we cannot undo if diff -urN ./linux-2.6.18.1/arch/i386/kernel/i8259.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/i8259.c --- ./linux-2.6.18.1/arch/i386/kernel/i8259.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/i8259.c 2007-05-19 23:58:35.000000000 +0900 @@ -34,39 +34,21 @@ * moves to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); - -static void end_8259A_irq (unsigned int irq) -{ - if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) && - irq_desc[irq].action) - enable_8259A_irq(irq); -} - -#define shutdown_8259A_irq disable_8259A_irq - static void mask_and_ack_8259A(unsigned int); -unsigned int startup_8259A_irq(unsigned int irq) -{ - enable_8259A_irq(irq); - return 0; /* never anything pending */ -} - -static struct hw_interrupt_type i8259A_irq_type = { - .typename = "XT-PIC", - .startup = startup_8259A_irq, - .shutdown = shutdown_8259A_irq, - .enable = enable_8259A_irq, - .disable = disable_8259A_irq, - .ack = mask_and_ack_8259A, - .end = end_8259A_irq, +static struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .mask = disable_8259A_irq, + .unmask = enable_8259A_irq, + .mask_ack = mask_and_ack_8259A, }; /* * 8259A PIC functions to handle ISA devices: */ +DEFINE_RAW_SPINLOCK(i8259A_lock); + /* * This contains the irq mask for both 8259A irq controllers, */ @@ -131,7 +113,7 @@ { disable_irq_nosync(irq); io_apic_irqs &= ~(1< #include #include +#include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include @@ -49,8 +51,8 @@ /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(vector_lock); int timer_over_8254 __initdata = 1; @@ -85,14 +87,6 @@ int apic, pin, next; } irq_2_pin[PIN_MAP_SIZE]; -int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; -#ifdef CONFIG_PCI_MSI -#define vector_to_irq(vector) \ - (platform_legacy_irq(vector) ? vector : vector_irq[vector]) -#else -#define vector_to_irq(vector) (vector) -#endif - /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. 
We are super
@@ -136,6 +130,105 @@
 	}
 }
+//#define IOAPIC_CACHE
+
+#ifdef IOAPIC_CACHE
+# define MAX_IOAPIC_CACHE 512
+
+/*
+ * Cache register values:
+ */
+static unsigned int io_apic_cache[MAX_IO_APICS][MAX_IOAPIC_CACHE]
+			____cacheline_aligned_in_smp;
+#endif
+
+inline unsigned int __raw_io_apic_read(unsigned int apic, unsigned int reg)
+{
+	*IO_APIC_BASE(apic) = reg;
+	return *(IO_APIC_BASE(apic)+4);
+}
+
+unsigned int raw_io_apic_read(unsigned int apic, unsigned int reg)
+{
+	unsigned int val = __raw_io_apic_read(apic, reg);
+
+#ifdef IOAPIC_CACHE
+	io_apic_cache[apic][reg] = val;
+#endif
+	return val;
+}
+
+unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+#ifdef IOAPIC_CACHE
+	if (unlikely(reg >= MAX_IOAPIC_CACHE)) {
+		static int once = 1;
+
+		if (once) {
+			once = 0;
+			printk("WARNING: ioapic register cache overflow: %d.\n",
+				reg);
+			dump_stack();
+		}
+		return __raw_io_apic_read(apic, reg);
+	}
+	if (io_apic_cache[apic][reg] && !sis_apic_bug)
+		return io_apic_cache[apic][reg];
+#endif
+	return raw_io_apic_read(apic, reg);
+}
+
+void io_apic_write(unsigned int apic, unsigned int reg, unsigned int val)
+{
+#ifdef IOAPIC_CACHE
+	if (unlikely(reg >= MAX_IOAPIC_CACHE)) {
+		static int once = 1;
+
+		if (once) {
+			once = 0;
+			printk("WARNING: ioapic register cache overflow: %d.\n",
+				reg);
+			dump_stack();
+		}
+	} else
+		io_apic_cache[apic][reg] = val;
+#endif
+	*IO_APIC_BASE(apic) = reg;
+	*(IO_APIC_BASE(apic)+4) = val;
+}
+
+/*
+ * Some systems need a POST flush or else level-triggered interrupts
+ * generate lots of spurious interrupts due to the POST-ed write not
+ * reaching the IOAPIC before the IRQ is ACK-ed in the local APIC.
+ */
+#ifdef CONFIG_SMP
+# define IOAPIC_POSTFLUSH
+#endif
+
+/*
+ * Re-write a value: to be used for read-modify-write
+ * cycles where the read already set up the index register.
+ *
+ * Older SiS APIC requires we rewrite the index register
+ */
+void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val)
+{
+#ifdef IOAPIC_CACHE
+	io_apic_cache[apic][reg] = val;
+#endif
+	if (unlikely(sis_apic_bug))
+		*IO_APIC_BASE(apic) = reg;
+	*(IO_APIC_BASE(apic)+4) = val;
+#ifndef IOAPIC_POSTFLUSH
+	if (unlikely(sis_apic_bug))
+#endif
+		/*
+		 * Force POST flush by reading:
+		 */
+		val = *(IO_APIC_BASE(apic)+4);
+}
+
 static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
 {
 	struct irq_pin_list *entry = irq_2_pin + irq;
@@ -167,18 +260,6 @@
 	__modify_IO_APIC_irq(irq, 0, 0x00010000);
 }
-/* mask = 1, trigger = 0 */
-static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
-{
-	__modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
-}
-
-/* mask = 0, trigger = 1 */
-static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
-{
-	__modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
-}
-
 static void mask_IO_APIC_irq (unsigned int irq)
 {
 	unsigned long flags;
@@ -258,7 +339,7 @@
 			break;
 		entry = irq_2_pin + entry->next;
 	}
-	set_irq_info(irq, cpumask);
+	set_native_irq_info(irq, cpumask);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
@@ -1159,46 +1240,45 @@
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs.
*/ u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; -int assign_irq_vector(int irq) +static int __assign_irq_vector(int irq) { static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; - unsigned long flags; int vector; - BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); + BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); - spin_lock_irqsave(&vector_lock, flags); - - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { - spin_unlock_irqrestore(&vector_lock, flags); + if (IO_APIC_VECTOR(irq) > 0) return IO_APIC_VECTOR(irq); - } -next: + current_vector += 8; if (current_vector == SYSCALL_VECTOR) - goto next; + current_vector += 8; if (current_vector >= FIRST_SYSTEM_VECTOR) { offset++; - if (!(offset%8)) { - spin_unlock_irqrestore(&vector_lock, flags); + if (!(offset % 8)) return -ENOSPC; - } current_vector = FIRST_DEVICE_VECTOR + offset; } vector = current_vector; - vector_irq[vector] = irq; - if (irq != AUTO_ASSIGN) - IO_APIC_VECTOR(irq) = vector; + IO_APIC_VECTOR(irq) = vector; + return vector; +} + +static int assign_irq_vector(int irq) +{ + unsigned long flags; + int vector; + + spin_lock_irqsave(&vector_lock, flags); + vector = __assign_irq_vector(irq); spin_unlock_irqrestore(&vector_lock, flags); return vector; } - -static struct hw_interrupt_type ioapic_level_type; -static struct hw_interrupt_type ioapic_edge_type; +static struct irq_chip ioapic_chip; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -1206,16 +1286,17 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger) { - unsigned idx; - - idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq; - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[idx].chip = &ioapic_level_type; - else - irq_desc[idx].chip = &ioapic_edge_type; - set_intr_gate(vector, interrupt[idx]); + trigger == IOAPIC_LEVEL) { +#ifdef CONFIG_PREEMPT_HARDIRQS + set_irq_chip_and_handler(irq, &ioapic_chip, handle_level_irq); +#else + set_irq_chip_and_handler(irq, &ioapic_chip, handle_fasteoi_irq); +#endif + } else { + set_irq_chip_and_handler(irq, &ioapic_chip, handle_edge_irq); + } + set_intr_gate(vector, interrupt[irq]); } static void __init setup_IO_APIC_irqs(void) @@ -1326,7 +1407,8 @@ * The timer IRQ doesn't have to know that behind the * scene we have a 8259A-master in AEOI mode ... 
*/ - irq_desc[0].chip = &ioapic_edge_type; + irq_desc[0].chip = &ioapic_chip; + set_irq_handler(0, handle_edge_irq); /* * Add it to the IO-APIC irq-routing table: @@ -1445,8 +1527,8 @@ struct IO_APIC_route_entry entry; spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); + *(((int *)&entry)+0) = raw_io_apic_read(apic, 0x10+i*2); + *(((int *)&entry)+1) = raw_io_apic_read(apic, 0x11+i*2); spin_unlock_irqrestore(&ioapic_lock, flags); printk(KERN_DEBUG " %02x %03X %02X ", @@ -1467,17 +1549,12 @@ ); } } - if (use_pci_vector()) - printk(KERN_INFO "Using vector-based indexing\n"); printk(KERN_DEBUG "IRQ to pin mappings:\n"); for (i = 0; i < NR_IRQS; i++) { struct irq_pin_list *entry = irq_2_pin + i; if (entry->pin < 0) continue; - if (use_pci_vector() && !platform_legacy_irq(i)) - printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); - else - printk(KERN_DEBUG "IRQ%d ", i); + printk(KERN_DEBUG "IRQ%d ", i); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) @@ -1492,7 +1569,7 @@ return; } -#if 0 +#if 1 static void print_APIC_bitfield (int base) { @@ -1893,7 +1970,7 @@ * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. */ - if (jiffies - t1 > 4) + if (jiffies - t1 > 4 && jiffies - t1 < 16) return 1; return 0; @@ -1913,6 +1990,8 @@ */ /* + * Startup quirk: + * * Starting up a edge-triggered IO-APIC interrupt is * nasty - we need to make sure that we get the edge. * If it is already asserted for some reason, we need @@ -1920,8 +1999,10 @@ * * This is not complete - we should be able to fake * an edge even if it isn't on the 8259A... + * + * (We do this for level-triggered IRQs too - it cannot hurt.) */ -static unsigned int startup_edge_ioapic_irq(unsigned int irq) +static unsigned int startup_ioapic_irq(unsigned int irq) { int was_pending = 0; unsigned long flags; @@ -1938,47 +2019,18 @@ return was_pending; } -/* - * Once we have recorded IRQ_PENDING already, we can mask the - * interrupt for real. This prevents IRQ storms from unhandled - * devices. - */ -static void ack_edge_ioapic_irq(unsigned int irq) -{ - move_irq(irq); - if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) - == (IRQ_PENDING | IRQ_DISABLED)) - mask_IO_APIC_irq(irq); - ack_APIC_irq(); -} - -/* - * Level triggered interrupts can just be masked, - * and shutting down and starting up the interrupt - * is the same as enabling and disabling them -- except - * with a startup need to return a "was pending" value. - * - * Level triggered interrupts are special because we - * do not touch any IO-APIC register while handling - * them. We ack the APIC in the end-IRQ handler, not - * in the start-IRQ-handler. Protection against reentrance - * from the same interrupt is still provided, both by the - * generic IRQ layer and by the fact that an unacked local - * APIC does not accept IRQs. 
- */ -static unsigned int startup_level_ioapic_irq (unsigned int irq) +static void ack_ioapic_irq(unsigned int irq) { - unmask_IO_APIC_irq(irq); - - return 0; /* don't check for pending */ + move_native_irq(irq); + ack_APIC_irq(); } -static void end_level_ioapic_irq (unsigned int irq) +static void ack_ioapic_quirk_irq(unsigned int irq) { unsigned long v; int i; - move_irq(irq); + move_native_irq(irq); /* * It appears there is an erratum which affects at least version 0x11 * of I/O APIC (that's the 82093AA and cores integrated into various @@ -2007,111 +2059,34 @@ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + /* mask = 1, trigger = 0 */ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); + /* mask = 0, trigger = 1 */ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); spin_unlock(&ioapic_lock); } } -#ifdef CONFIG_PCI_MSI -static unsigned int startup_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_edge_ioapic_irq(irq); -} - -static void ack_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - move_native_irq(vector); - ack_edge_ioapic_irq(irq); -} - -static unsigned int startup_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_level_ioapic_irq (irq); -} - -static void end_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - move_native_irq(vector); - end_level_ioapic_irq(irq); -} - -static void mask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - mask_IO_APIC_irq(irq); -} - -static void unmask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - unmask_IO_APIC_irq(irq); -} - -#ifdef CONFIG_SMP -static void set_ioapic_affinity_vector (unsigned int vector, - cpumask_t cpu_mask) -{ - int irq = vector_to_irq(vector); - - set_native_irq_info(vector, cpu_mask); - set_ioapic_affinity_irq(irq, cpu_mask); -} -#endif -#endif - -static int ioapic_retrigger(unsigned int irq) +static int ioapic_retrigger_irq(unsigned int irq) { send_IPI_self(IO_APIC_VECTOR(irq)); return 1; } -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. 
- */ -static struct hw_interrupt_type ioapic_edge_type __read_mostly = { - .typename = "IO-APIC-edge", - .startup = startup_edge_ioapic, - .shutdown = shutdown_edge_ioapic, - .enable = enable_edge_ioapic, - .disable = disable_edge_ioapic, - .ack = ack_edge_ioapic, - .end = end_edge_ioapic, +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_ioapic_irq, + .eoi = ack_ioapic_quirk_irq, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity, + .set_affinity = set_ioapic_affinity_irq, #endif - .retrigger = ioapic_retrigger, + .retrigger = ioapic_retrigger_irq, }; -static struct hw_interrupt_type ioapic_level_type __read_mostly = { - .typename = "IO-APIC-level", - .startup = startup_level_ioapic, - .shutdown = shutdown_level_ioapic, - .enable = enable_level_ioapic, - .disable = disable_level_ioapic, - .ack = mask_and_ack_level_ioapic, - .end = end_level_ioapic, -#ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity, -#endif - .retrigger = ioapic_retrigger, -}; static inline void init_IO_APIC_traps(void) { @@ -2130,11 +2105,6 @@ */ for (irq = 0; irq < NR_IRQS ; irq++) { int tmp = irq; - if (use_pci_vector()) { - if (!platform_legacy_irq(tmp)) - if ((tmp = vector_to_irq(tmp)) == -1) - continue; - } if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { /* * Hmm.. We don't have an entry for this, @@ -2145,20 +2115,21 @@ make_8259A_irq(irq); else /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_type; + irq_desc[irq].chip = &no_irq_chip; } } } -static void enable_lapic_irq (unsigned int irq) -{ - unsigned long v; +/* + * The local APIC irq-chip implementation: + */ - v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); +static void ack_apic(unsigned int irq) +{ + ack_APIC_irq(); } -static void disable_lapic_irq (unsigned int irq) +static void mask_lapic_irq (unsigned int irq) { unsigned long v; @@ -2166,21 +2137,19 @@ apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); } -static void ack_lapic_irq (unsigned int irq) +static void unmask_lapic_irq (unsigned int irq) { - ack_APIC_irq(); -} + unsigned long v; -static void end_lapic_irq (unsigned int i) { /* nothing */ } + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); +} -static struct hw_interrupt_type lapic_irq_type __read_mostly = { - .typename = "local-APIC-edge", - .startup = NULL, /* startup_irq() not used for IRQ0 */ - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ - .enable = enable_lapic_irq, - .disable = disable_lapic_irq, - .ack = ack_lapic_irq, - .end = end_lapic_irq +static struct irq_chip lapic_chip __read_mostly = { + .name = "local-APIC-edge", + .mask = mask_lapic_irq, + .unmask = unmask_lapic_irq, + .eoi = ack_apic, }; static void setup_nmi (void) @@ -2361,7 +2330,7 @@ printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); disable_8259A_irq(0); - irq_desc[0].chip = &lapic_irq_type; + set_irq_chip_and_handler(0, &lapic_chip, handle_fasteoi_irq); apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(0); @@ -2543,6 +2512,117 @@ device_initcall(ioapic_init_sysfs); +#ifdef CONFIG_PCI_MSI +/* + * Dynamic irq allocate and deallocation for MSI + */ +int create_irq(void) +{ + /* Allocate an unused irq */ + int irq, new, vector; + unsigned long flags; + + irq = -ENOSPC; + spin_lock_irqsave(&vector_lock, flags); + for (new = (NR_IRQS - 1); new >= 0; new--) { + if (platform_legacy_irq(new)) + continue; 
+		if (irq_vector[new] != 0)
+			continue;
+		vector = __assign_irq_vector(new);
+		if (likely(vector > 0))
+			irq = new;
+		break;
+	}
+	spin_unlock_irqrestore(&vector_lock, flags);
+
+	if (irq >= 0) {
+		set_intr_gate(vector, interrupt[irq]);
+		dynamic_irq_init(irq);
+	}
+	return irq;
+}
+
+void destroy_irq(unsigned int irq)
+{
+	unsigned long flags;
+
+	dynamic_irq_cleanup(irq);
+
+	spin_lock_irqsave(&vector_lock, flags);
+	irq_vector[irq] = 0;
+	spin_unlock_irqrestore(&vector_lock, flags);
+}
+#endif /* CONFIG_PCI_MSI */
+
+/*
+ * MSI message composition
+ */
+#ifdef CONFIG_PCI_MSI
+static int msi_msg_setup(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+{
+	/* For now this code always uses physical delivery
+	 * mode.
+	 */
+	int vector;
+	unsigned dest;
+
+	vector = assign_irq_vector(irq);
+	if (vector >= 0) {
+		dest = cpu_mask_to_apicid(TARGET_CPUS);
+
+		msg->address_hi = MSI_ADDR_BASE_HI;
+		msg->address_lo =
+			MSI_ADDR_BASE_LO |
+			((INT_DEST_MODE == 0) ?
+				MSI_ADDR_DEST_MODE_PHYSICAL:
+				MSI_ADDR_DEST_MODE_LOGICAL) |
+			((INT_DELIVERY_MODE != dest_LowestPrio) ?
+				MSI_ADDR_REDIRECTION_CPU:
+				MSI_ADDR_REDIRECTION_LOWPRI) |
+			MSI_ADDR_DEST_ID(dest);
+
+		msg->data =
+			MSI_DATA_TRIGGER_EDGE |
+			MSI_DATA_LEVEL_ASSERT |
+			((INT_DELIVERY_MODE != dest_LowestPrio) ?
+				MSI_DATA_DELIVERY_FIXED:
+				MSI_DATA_DELIVERY_LOWPRI) |
+			MSI_DATA_VECTOR(vector);
+	}
+	return vector;
+}
+
+static void msi_msg_teardown(unsigned int irq)
+{
+	return;
+}
+
+static void msi_msg_set_affinity(unsigned int irq, cpumask_t mask, struct msi_msg *msg)
+{
+	int vector;
+	unsigned dest;
+
+	vector = assign_irq_vector(irq);
+	if (vector > 0) {
+		dest = cpu_mask_to_apicid(mask);
+
+		msg->data &= ~MSI_DATA_VECTOR_MASK;
+		msg->data |= MSI_DATA_VECTOR(vector);
+		msg->address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+		msg->address_lo |= MSI_ADDR_DEST_ID(dest);
+	}
+}
+
+struct msi_ops arch_msi_ops = {
+	.needs_64bit_address = 0,
+	.setup = msi_msg_setup,
+	.teardown = msi_msg_teardown,
+	.target = msi_msg_set_affinity,
+};
+
+#endif /* CONFIG_PCI_MSI */
+
 /* --------------------------------------------------------------------------
                           ACPI-based IOAPIC Configuration
    -------------------------------------------------------------------------- */
@@ -2697,7 +2777,7 @@
 	spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
 	io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
-	set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
+	set_native_irq_info(irq, TARGET_CPUS);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 	return 0;
diff -urN ./linux-2.6.18.1/arch/i386/kernel/irq.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/irq.c
--- ./linux-2.6.18.1/arch/i386/kernel/irq.c	2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/irq.c	2007-05-19 23:58:35.000000000 +0900
@@ -51,10 +51,11 @@
  * SMP cross-CPU interrupts have their own specific
  * handlers).
  */
-fastcall unsigned int do_IRQ(struct pt_regs *regs)
+fastcall notrace unsigned int do_IRQ(struct pt_regs *regs)
 {
 	/* high bit used in ret_from_ code */
 	int irq = ~regs->orig_eax;
+	struct irq_desc *desc = irq_desc + irq;
 #ifdef CONFIG_4KSTACKS
 	union irq_ctx *curctx, *irqctx;
 	u32 *isp;
@@ -67,6 +68,11 @@
 	}
 	irq_enter();
+#ifdef CONFIG_LATENCY_TRACE
+	if (irq == trace_user_trigger_irq)
+		user_trace_start();
+#endif
+	trace_special(regs->eip, irq, 0);
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 	/* Debugging check for stack overflow: is there less than 1KB free?
*/ { @@ -75,12 +81,25 @@ __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (THREAD_SIZE - 1)); if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { - printk("do_IRQ: stack overflow: %ld\n", + printk("BUG: do_IRQ: stack overflow: %ld\n", esp - sizeof(struct thread_info)); dump_stack(); } } #endif +#ifdef CONFIG_NO_HZ + if (idle_cpu(smp_processor_id())) { + update_jiffies(); + /* + * Force polling-idle loops to break out into + * the sched-timer setting code, to make sure + * that timer interval changes due to __mod_timer() + * in IRQ context get properly propagated: + */ + if (tsk_is_polling(current)) + set_need_resched(); + } +#endif #ifdef CONFIG_4KSTACKS @@ -94,7 +113,7 @@ * current stack (which is the irq stack already after all) */ if (curctx != irqctx) { - int arg1, arg2, ebx; + int arg1, arg2, arg3, ebx; /* build the stack frame on the IRQ stack */ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); @@ -110,16 +129,17 @@ (curctx->tinfo.preempt_count & SOFTIRQ_MASK); asm volatile( - " xchgl %%ebx,%%esp \n" - " call __do_IRQ \n" + " xchgl %%ebx,%%esp \n" + " call *%%edi \n" " movl %%ebx,%%esp \n" - : "=a" (arg1), "=d" (arg2), "=b" (ebx) - : "0" (irq), "1" (regs), "2" (isp) - : "memory", "cc", "ecx" + : "=a" (arg1), "=d" (arg2), "=c" (arg3), "=b" (ebx) + : "0" (irq), "1" (desc), "2" (regs), "3" (isp), + "D" (desc->handle_irq) + : "memory", "cc" ); } else #endif - __do_IRQ(irq, regs); + desc->handle_irq(irq, desc, regs); irq_exit(); @@ -242,8 +262,10 @@ } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); - action = irq_desc[i].action; + irq_desc_t *desc = irq_desc + i; + + spin_lock_irqsave(&desc->lock, flags); + action = desc->action; if (!action) goto skip; seq_printf(p, "%3d: ",i); @@ -253,7 +275,22 @@ for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif - seq_printf(p, " %14s", irq_desc[i].chip->typename); + seq_printf(p, " %-14s", irq_desc[i].chip->name); +#define F(x,c) ((desc->status & x) ? 
c : '.') + seq_printf(p, " [%c%c%c%c%c%c%c%c%c/", + F(IRQ_INPROGRESS, 'I'), + F(IRQ_DISABLED, 'D'), + F(IRQ_PENDING, 'P'), + F(IRQ_REPLAY, 'R'), + F(IRQ_AUTODETECT, 'A'), + F(IRQ_WAITING, 'W'), + F(IRQ_LEVEL, 'L'), + F(IRQ_MASKED, 'M'), + F(IRQ_NODELAY, 'N')); +#undef F + seq_printf(p, "%3d]", desc->irqs_unhandled); + + seq_printf(p, "-%s", handle_irq_name(irq_desc[i].handle_irq)); seq_printf(p, " %s", action->name); for (action=action->next; action; action = action->next) diff -urN ./linux-2.6.18.1/arch/i386/kernel/kprobes.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/kprobes.c --- ./linux-2.6.18.1/arch/i386/kernel/kprobes.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/kprobes.c 2007-05-19 23:58:35.000000000 +0900 @@ -338,7 +338,7 @@ /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); regs->eip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); + preempt_enable(); return 1; } #endif @@ -347,7 +347,7 @@ return 1; no_kprobe: - preempt_enable_no_resched(); + preempt_enable(); return ret; } @@ -566,7 +566,7 @@ } reset_current_kprobe(); out: - preempt_enable_no_resched(); + preempt_enable(); /* * if somebody else is singlestepping across a probe point, eflags @@ -600,7 +600,7 @@ restore_previous_kprobe(kcb); else reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); break; case KPROBE_HIT_ACTIVE: case KPROBE_HIT_SSDONE: @@ -734,7 +734,7 @@ *regs = kcb->jprobe_saved_regs; memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, MIN_STACK_SIZE(stack_addr)); - preempt_enable_no_resched(); + preempt_enable(); return 1; } return 0; diff -urN ./linux-2.6.18.1/arch/i386/kernel/mcount-wrapper.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/mcount-wrapper.S --- ./linux-2.6.18.1/arch/i386/kernel/mcount-wrapper.S 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/mcount-wrapper.S 2007-05-19 23:58:35.000000000 +0900 @@ -0,0 +1,27 @@ +/* + * linux/arch/i386/mcount-wrapper.S + * + * Copyright (C) 2004 Ingo Molnar + */ + +.globl mcount +mcount: + + cmpl $0, mcount_enabled + jz out + + push %ebp + mov %esp, %ebp + pushl %eax + pushl %ecx + pushl %edx + + call __mcount + + popl %edx + popl %ecx + popl %eax + popl %ebp +out: + ret + diff -urN ./linux-2.6.18.1/arch/i386/kernel/microcode.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/microcode.c --- ./linux-2.6.18.1/arch/i386/kernel/microcode.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/microcode.c 2007-05-19 23:58:35.000000000 +0900 @@ -115,7 +115,7 @@ #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) /* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); +static DEFINE_RAW_SPINLOCK(microcode_update_lock); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DEFINE_MUTEX(microcode_mutex); diff -urN ./linux-2.6.18.1/arch/i386/kernel/mpparse.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/mpparse.c --- ./linux-2.6.18.1/arch/i386/kernel/mpparse.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/mpparse.c 2007-05-19 23:58:35.000000000 +0900 @@ -228,12 +228,17 @@ mpc_oem_bus_info(m, str, translation_table[mpc_record]); + /* + * mpc_busid is char: + */ +#if MAX_MP_BUSSES < 256 if (m->mpc_busid >= MAX_MP_BUSSES) { printk(KERN_WARNING "MP table busid value (%d) for bustype %s " " is too large, 
max. supported is %d\n",
 			m->mpc_busid, str, MAX_MP_BUSSES - 1);
 		return;
 	}
+#endif
 	if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
 		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
diff -urN ./linux-2.6.18.1/arch/i386/kernel/nmi.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/nmi.c
--- ./linux-2.6.18.1/arch/i386/kernel/nmi.c	2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/nmi.c	2007-05-19 23:58:35.000000000 +0900
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -30,7 +31,7 @@
 unsigned int nmi_watchdog = NMI_NONE;
 extern int unknown_nmi_panic;
-static unsigned int nmi_hz = HZ;
+static unsigned int nmi_hz = 1000;
 static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
 static unsigned int nmi_p4_cccr_val;
 extern void show_registers(struct pt_regs *regs);
@@ -99,7 +100,6 @@
 #define ARCH_PERFMON_NMI_EVENT_SEL	ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
 #define ARCH_PERFMON_NMI_EVENT_UMASK	ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
-#ifdef CONFIG_SMP
 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
  * the CPU is idle. To make sure the NMI watchdog really ticks on all
  * CPUs during the test make them busy.
@@ -107,7 +107,12 @@
 static __init void nmi_cpu_busy(void *data)
 {
 	volatile int *endflag = data;
+	/*
+	 * avoid a warning, on PREEMPT_RT this won't run in hardirq context:
+	 */
+#ifndef CONFIG_PREEMPT_RT
 	local_irq_enable_in_hardirq();
+#endif
 	/* Intentionally don't use cpu_relax here. This is
 	   to make sure that the performance counter really ticks,
 	   even if there is a simulator or similar that catches the
@@ -117,7 +122,6 @@
 	while (*endflag == 0)
 		barrier();
 }
-#endif
 static int __init check_nmi_watchdog(void)
 {
@@ -140,7 +144,7 @@
 	for_each_possible_cpu(cpu)
 		prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
 	local_irq_enable();
-	mdelay((10*1000)/nmi_hz); // wait 10 ticks
+	mdelay((100*1000)/nmi_hz); // wait 100 ticks
 	for_each_possible_cpu(cpu) {
 #ifdef CONFIG_SMP
@@ -167,7 +171,7 @@
 	/* now that we know it works we can reduce NMI frequency to
 	   something more reasonable; makes a difference in some configs */
 	if (nmi_watchdog == NMI_LOCAL_APIC)
-		nmi_hz = 1;
+		nmi_hz = 10000;
 	kfree(prev_nmi_count);
 	return 0;
@@ -579,9 +583,34 @@
 extern void die_nmi(struct pt_regs *, const char *msg);
-void nmi_watchdog_tick (struct pt_regs * regs)
+int nmi_show_regs[NR_CPUS];
+
+void nmi_show_all_regs(void)
 {
+	int i;
+
+	if (nmi_watchdog == NMI_NONE)
+		return;
+	if (system_state != SYSTEM_RUNNING) {
+		printk("nmi_show_all_regs(): system state %d, not doing.\n",
+			system_state);
+		return;
+	}
+	printk("nmi_show_all_regs(): start on CPU#%d.\n",
+		raw_smp_processor_id());
+	dump_stack();
+	for_each_online_cpu(i)
+		nmi_show_regs[i] = 1;
+	for_each_online_cpu(i)
+		while (nmi_show_regs[i] == 1)
+			barrier();
+}
+
+static DEFINE_RAW_SPINLOCK(nmi_print_lock);
+
+void notrace nmi_watchdog_tick (struct pt_regs * regs)
+{
 	/*
 	 * Since current_thread_info()-> is always on the stack, and we
 	 * always switch the stack NMI-atomically, it's safe to use
@@ -590,7 +619,16 @@
 	unsigned int sum;
 	int cpu = smp_processor_id();
-	sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
+	sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0);
+
+	profile_tick(CPU_PROFILING, regs);
+	if (nmi_show_regs[cpu]) {
+		nmi_show_regs[cpu] = 0;
+		spin_lock(&nmi_print_lock);
+		printk("NMI show regs on CPU#%d:\n", cpu);
+		show_regs(regs);
+		spin_unlock(&nmi_print_lock);
+	}
 	if (last_irq_sums[cpu] == sum) {
 		/*
 		 * wait a
few IRQs (5 seconds) before doing the oops ... */ alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. - */ - die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP"); + if (alert_counter[cpu] && !(alert_counter[cpu] % (5*nmi_hz))) { + int i; + + bust_spinlocks(1); + spin_lock(&nmi_print_lock); + printk("NMI watchdog detected lockup on CPU#%d (%d/%d)\n", + cpu, alert_counter[cpu], 5*nmi_hz); + show_regs(regs); + spin_unlock(&nmi_print_lock); + + for_each_online_cpu(i) + if (i != cpu) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); + + die_nmi(regs, "NMI Watchdog detected LOCKUP"); + } + } else { last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; diff -urN ./linux-2.6.18.1/arch/i386/kernel/process.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/process.c --- ./linux-2.6.18.1/arch/i386/kernel/process.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/process.c 2007-05-19 23:58:35.000000000 +0900 @@ -103,16 +103,20 @@ if (!hlt_counter && boot_cpu_data.hlt_works_ok) { current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { local_irq_disable(); - if (!need_resched()) - safe_halt(); - else + if (!need_resched() && !need_resched_delayed()) { + if (!hrtimer_stop_sched_tick()) + safe_halt(); + else + local_irq_enable(); + hrtimer_restart_sched_tick(); + } else local_irq_enable(); } current_thread_info()->status |= TS_POLLING; } else { - while (!need_resched()) + while (!need_resched() && !need_resched_delayed()) cpu_relax(); } } @@ -125,16 +129,18 @@ * to poll the ->work.need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. 
*/ -static void poll_idle (void) +static void poll_idle(void) { local_irq_enable(); - asm volatile( - "2:" - "testl %0, %1;" - "rep; nop;" - "je 2b;" - : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); + while (!need_resched() && !need_resched_delayed()) { + hrtimer_stop_sched_tick(); + local_irq_enable(); + while (!need_resched() && !need_resched_delayed() && !rcu_pending(smp_processor_id()) && !local_softirq_pending()) + rep_nop(); + hrtimer_restart_sched_tick(); + local_irq_enable(); + } } #ifdef CONFIG_HOTPLUG_CPU @@ -177,7 +183,9 @@ /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + BUG_ON(irqs_disabled()); + + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -195,9 +203,11 @@ __get_cpu_var(irq_stat).idle_timestamp = jiffies; idle(); } - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } @@ -240,13 +250,16 @@ { local_irq_enable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { + if (hrtimer_stop_sched_tick()) + break; __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (need_resched()) + if (need_resched() || need_resched_delayed()) break; __mwait(0, 0); } + hrtimer_restart_sched_tick(); } void __devinit select_idle_routine(const struct cpuinfo_x86 *c) @@ -363,15 +376,23 @@ if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { struct task_struct *tsk = current; struct thread_struct *t = &tsk->thread; - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + void *io_bitmap_ptr = t->io_bitmap_ptr; + int cpu; + struct tss_struct *tss; - kfree(t->io_bitmap_ptr); + /* + * On PREEMPT_RT we must not call kfree() with + * preemption disabled, so we first zap the pointer: + */ t->io_bitmap_ptr = NULL; + kfree(io_bitmap_ptr); + clear_thread_flag(TIF_IO_BITMAP); /* * Careful, clear this in the TSS too: */ + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); t->io_bitmap_max = 0; tss->io_bitmap_owner = NULL; diff -urN ./linux-2.6.18.1/arch/i386/kernel/semaphore.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/semaphore.c --- ./linux-2.6.18.1/arch/i386/kernel/semaphore.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/semaphore.c 2007-05-19 23:58:35.000000000 +0900 @@ -12,6 +12,7 @@ * * rw semaphores implemented November 1999 by Benjamin LaHaise */ +#include #include /* @@ -27,15 +28,15 @@ asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed\n" -"__down_failed:\n\t" +".globl __compat_down_failed\n" +"__compat_down_failed:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down\n\t" + "call __compat_down\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -48,15 +49,15 @@ asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed_interruptible\n" -"__down_failed_interruptible:\n\t" +".globl __compat_down_failed_interruptible\n" +"__compat_down_failed_interruptible:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down_interruptible\n\t" + "call __compat_down_interruptible\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -69,15 +70,15 @@ asm( ".section .sched.text\n" ".align 4\n" -".globl 
__down_failed_trylock\n" -"__down_failed_trylock:\n\t" +".globl __compat_down_failed_trylock\n" +"__compat_down_failed_trylock:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down_trylock\n\t" + "call __compat_down_trylock\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -90,45 +91,13 @@ asm( ".section .sched.text\n" ".align 4\n" -".globl __up_wakeup\n" -"__up_wakeup:\n\t" +".globl __compat_up_wakeup\n" +"__compat_up_wakeup:\n\t" "pushl %edx\n\t" "pushl %ecx\n\t" - "call __up\n\t" + "call __compat_up\n\t" "popl %ecx\n\t" "popl %edx\n\t" "ret" ); -/* - * rw spinlock fallbacks - */ -#if defined(CONFIG_SMP) -asm( -".section .sched.text\n" -".align 4\n" -".globl __write_lock_failed\n" -"__write_lock_failed:\n\t" - LOCK_PREFIX "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" -"1: rep; nop\n\t" - "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jne 1b\n\t" - LOCK_PREFIX "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jnz __write_lock_failed\n\t" - "ret" -); - -asm( -".section .sched.text\n" -".align 4\n" -".globl __read_lock_failed\n" -"__read_lock_failed:\n\t" - LOCK_PREFIX "incl (%eax)\n" -"1: rep; nop\n\t" - "cmpl $1,(%eax)\n\t" - "js 1b\n\t" - LOCK_PREFIX "decl (%eax)\n\t" - "js __read_lock_failed\n\t" - "ret" -); -#endif diff -urN ./linux-2.6.18.1/arch/i386/kernel/setup.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/setup.c --- ./linux-2.6.18.1/arch/i386/kernel/setup.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/setup.c 2007-05-19 23:58:35.000000000 +0900 @@ -61,7 +61,7 @@ #include #include #include - +#include /* Forward Declaration. */ void __init find_max_pfn(void); @@ -1580,6 +1580,7 @@ #endif #endif tsc_init(); + vsyscall_init(); } static __init int add_pcspkr(void) diff -urN ./linux-2.6.18.1/arch/i386/kernel/signal.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/signal.c --- ./linux-2.6.18.1/arch/i386/kernel/signal.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/signal.c 2007-05-19 23:58:35.000000000 +0900 @@ -532,6 +532,13 @@ } } +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so * that register information in the sigcontext is correct. @@ -572,6 +579,13 @@ struct k_sigaction ka; sigset_t *oldset; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from diff -urN ./linux-2.6.18.1/arch/i386/kernel/smp.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/smp.c --- ./linux-2.6.18.1/arch/i386/kernel/smp.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/smp.c 2007-05-19 23:58:35.000000000 +0900 @@ -255,7 +255,7 @@ static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); +static DEFINE_RAW_SPINLOCK(tlbstate_lock); #define FLUSH_ALL 0xffffffff /* @@ -400,7 +400,7 @@ while (!cpus_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ - mb(); + cpu_relax(); flush_mm = NULL; flush_va = 0; @@ -491,10 +491,20 @@ } /* + * this function sends a 'reschedule' IPI to all other CPUs. 
+ * This is used when RT tasks are starving + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); @@ -599,13 +609,14 @@ } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. Trigger a reschedule pass so that + * RT-overload balancing can pass tasks around. */ -fastcall void smp_reschedule_interrupt(struct pt_regs *regs) +fastcall notrace void smp_reschedule_interrupt(struct pt_regs *regs) { + trace_special(regs->eip, 0, 0); ack_APIC_irq(); + set_tsk_need_resched(current); } fastcall void smp_call_function_interrupt(struct pt_regs *regs) diff -urN ./linux-2.6.18.1/arch/i386/kernel/syscall_table.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/syscall_table.S --- ./linux-2.6.18.1/arch/i386/kernel/syscall_table.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/syscall_table.S 2007-05-20 14:14:28.000000000 +0900 @@ -317,3 +317,14 @@ .long sys_tee /* 315 */ .long sys_vmsplice .long sys_move_pages +#ifdef CONFIG_CABI + .long sys_cabi_account_create /* 318-325 */ + .long sys_cabi_account_destroy + .long sys_cabi_account_bind_pid + .long sys_cabi_account_bind_pgid + .long sys_cabi_account_unbind + .long sys_cabi_account_get + .long sys_cabi_account_set + .long sys_cabi_account_eval +#endif /* CONFIG_CABI */ + diff -urN ./linux-2.6.18.1/arch/i386/kernel/time.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/time.c --- ./linux-2.6.18.1/arch/i386/kernel/time.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/time.c 2007-05-19 23:58:35.000000000 +0900 @@ -131,7 +131,7 @@ int timer_ack; #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -150,15 +150,6 @@ */ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don' t need - * the irq version of write_lock because as just said we have irq - * locally disabled. -arca - */ - write_seqlock(&xtime_lock); - #ifdef CONFIG_X86_IO_APIC if (timer_ack) { /* @@ -177,7 +168,6 @@ do_timer_interrupt_hook(regs); - if (MCA_bus) { /* The PS/2 uses level-triggered interrupts.
You can't turn them off, nor would you want to (any attempt to @@ -192,8 +182,6 @@ outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ } - write_sequnlock(&xtime_lock); - #ifdef CONFIG_X86_LOCAL_APIC if (using_apic_timer) smp_send_timer_broadcast_ipi(regs); @@ -203,7 +191,7 @@ } /* not static: needed by APM */ -unsigned long get_cmos_time(void) +unsigned long read_persistent_clock(void) { unsigned long retval; unsigned long flags; @@ -219,7 +207,7 @@ return retval; } -EXPORT_SYMBOL(get_cmos_time); +EXPORT_SYMBOL(read_persistent_clock); static void sync_cmos_clock(unsigned long dummy); @@ -270,75 +258,11 @@ mod_timer(&sync_cmos_timer, jiffies + 1); } -static long clock_cmos_diff, sleep_start; - -static int timer_suspend(struct sys_device *dev, pm_message_t state) -{ - /* - * Estimate time zone so that set_time can update the clock - */ - clock_cmos_diff = -get_cmos_time(); - clock_cmos_diff += get_seconds(); - sleep_start = get_cmos_time(); - return 0; -} - -static int timer_resume(struct sys_device *dev) -{ - unsigned long flags; - unsigned long sec; - unsigned long sleep_length; - -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) - hpet_reenable(); -#endif - setup_pit_timer(); - sec = get_cmos_time() + clock_cmos_diff; - sleep_length = (get_cmos_time() - sleep_start) * HZ; - write_seqlock_irqsave(&xtime_lock, flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - jiffies_64 += sleep_length; - wall_jiffies += sleep_length; - write_sequnlock_irqrestore(&xtime_lock, flags); - touch_softlockup_watchdog(); - return 0; -} - -static struct sysdev_class timer_sysclass = { - .resume = timer_resume, - .suspend = timer_suspend, - set_kset_name("timer"), -}; - - -/* XXX this driverfs stuff should probably go elsewhere later -john */ -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int time_init_device(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(time_init_device); - #ifdef CONFIG_HPET_TIMER extern void (*late_time_init)(void); /* Duplicate of time_init() below, with hpet_enable part added */ static void __init hpet_time_init(void) { - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - if ((hpet_enable() >= 0) && hpet_use_timer) { printk("Using HPET for base-timer\n"); } @@ -359,10 +283,6 @@ return; } #endif - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); time_init_hook(); } diff -urN ./linux-2.6.18.1/arch/i386/kernel/traps.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/traps.c --- ./linux-2.6.18.1/arch/i386/kernel/traps.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/traps.c 2007-05-19 23:58:35.000000000 +0900 @@ -226,6 +226,7 @@ break; printk("%s =======================\n", log_lvl); } + print_traces(task); } void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack) @@ -276,6 +277,12 @@ EXPORT_SYMBOL(dump_stack); +#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_LATENCY_TRACE) +extern unsigned long worst_stack_left; +#else +# define worst_stack_left -1L +#endif + void show_registers(struct pt_regs *regs) { int i; @@ -302,8 +309,8 @@ regs->eax, regs->ebx, regs->ecx, regs->edx); printk(KERN_EMERG "esi: %08lx 
edi: %08lx ebp: %08lx esp: %08lx\n", regs->esi, regs->edi, regs->ebp, esp); - printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, ss); + printk(KERN_EMERG "ds: %04x es: %04x ss: %04x preempt: %08x\n", + regs->xds & 0xffff, regs->xes & 0xffff, ss, preempt_count()); printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", TASK_COMM_LEN, current->comm, current->pid, current_thread_info(), current, current->thread_info); @@ -375,11 +382,11 @@ void die(const char * str, struct pt_regs * regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED(die.lock), .lock_owner = -1, .lock_owner_depth = 0 }; @@ -486,6 +493,11 @@ if (!user_mode(regs)) goto kernel_trap; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + trap_signal: { if (info) force_sig_info(signr, info, tsk); @@ -505,6 +517,7 @@ if (ret) goto trap_signal; return; } + print_traces(tsk); } #define DO_ERROR(trapnr, signr, str, name) \ @@ -703,10 +716,11 @@ crash_kexec(regs); } + nmi_exit(); do_exit(SIGSEGV); } -static void default_do_nmi(struct pt_regs * regs) +static void notrace default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; @@ -715,9 +729,6 @@ reason = get_nmi_reason(); if (!(reason & 0xc0)) { - if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; #ifdef CONFIG_X86_LOCAL_APIC /* * Ok, so this is none of the documented NMI sources, @@ -725,9 +736,13 @@ */ if (nmi_watchdog) { nmi_watchdog_tick(regs); +// trace_special(6, 1, 0); return; } #endif + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; unknown_nmi_error(reason, regs); return; } @@ -744,18 +759,19 @@ reassert_nmi(); } -static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +static notrace int dummy_nmi_callback(struct pt_regs * regs, int cpu) { return 0; } static nmi_callback_t nmi_callback = dummy_nmi_callback; -fastcall void do_nmi(struct pt_regs * regs, long error_code) +fastcall notrace void do_nmi(struct pt_regs * regs, long error_code) { int cpu; nmi_enter(); + nmi_trace((unsigned long)do_nmi, regs->eip, regs->eflags); cpu = smp_processor_id(); diff -urN ./linux-2.6.18.1/arch/i386/kernel/tsc.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/tsc.c --- ./linux-2.6.18.1/arch/i386/kernel/tsc.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/tsc.c 2007-05-19 23:58:35.000000000 +0900 @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -333,6 +334,16 @@ return ret; } + +static cycle_t __vsyscall_fn vread_tsc(void) +{ + cycle_t ret; + + rdtscll(ret); + + return ret; +} + static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, @@ -342,6 +353,7 @@ .shift = 22, .update_callback = tsc_update_callback, .is_continuous = 1, + .vread = vread_tsc, }; static int tsc_update_callback(void) diff -urN ./linux-2.6.18.1/arch/i386/kernel/vm86.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/vm86.c --- ./linux-2.6.18.1/arch/i386/kernel/vm86.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/vm86.c 2007-05-19 23:58:35.000000000 +0900 @@ -109,6 +109,7 @@ local_irq_enable(); if (!current->thread.vm86_info) { + local_irq_disable(); printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } diff -urN ./linux-2.6.18.1/arch/i386/kernel/vmlinux.lds.S 
linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/vmlinux.lds.S --- ./linux-2.6.18.1/arch/i386/kernel/vmlinux.lds.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/vmlinux.lds.S 2007-05-19 23:58:35.000000000 +0900 @@ -8,6 +8,8 @@ #include #include #include +#include +#include OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) @@ -71,6 +73,51 @@ .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } _edata = .; /* End of data section */ +/* VSYSCALL_GTOD data */ +#ifdef CONFIG_GENERIC_TIME_VSYSCALL +#undef VSYSCALL_ADDR +#define VSYSCALL_ADDR VSYSCALL_GTOD_START +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) + +#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) + +#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } + __vsyscall_0 = VSYSCALL_VIRT_ADDR; + + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } + .vsyscall_data : AT(VLOAD(.vsyscall_data)) { *(.vsyscall_data) } + + . = ALIGN(32); + .vsyscall_gtod_data : AT (VLOAD(.vsyscall_gtod_data)) { *(.vsyscall_gtod_data) } + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + + . = ALIGN(32); + .vsyscall_gtod_lock : AT (VLOAD(.vsyscall_gtod_lock)) { *(.vsyscall_gtod_lock) } + vsyscall_gtod_lock = VVIRT(.vsyscall_gtod_lock); + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } + + . = VSYSCALL_VIRT_ADDR + 4096; + +#undef VSYSCALL_ADDR +#undef VSYSCALL_PHYS_ADDR +#undef VSYSCALL_VIRT_ADDR +#undef VLOAD_OFFSET +#undef VLOAD +#undef VVIRT_OFFSET +#undef VVIRT + +#endif +/* END of VSYSCALL_GTOD data*/ + #ifdef CONFIG_STACK_UNWIND . 
= ALIGN(4); .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) { diff -urN ./linux-2.6.18.1/arch/i386/kernel/vsyscall-gtod.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/vsyscall-gtod.c --- ./linux-2.6.18.1/arch/i386/kernel/vsyscall-gtod.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/kernel/vsyscall-gtod.c 2007-05-19 23:58:35.000000000 +0900 @@ -0,0 +1,179 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct vsyscall_gtod_data_t { + struct timeval wall_time_tv; + struct timezone sys_tz; + struct clocksource clock; +}; + +struct vsyscall_gtod_data_t vsyscall_gtod_data; +struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data; + +seqlock_t vsyscall_gtod_lock = SEQLOCK_UNLOCKED; +seqlock_t __vsyscall_gtod_lock __section_vsyscall_gtod_lock = SEQLOCK_UNLOCKED; + +int errno; +static inline _syscall2(int,gettimeofday,struct timeval *,tv,struct timezone *,tz); + +static int vsyscall_mapped = 0; /* flag variable for remap_vsyscall() */ +extern struct timezone sys_tz; + +static inline void do_vgettimeofday(struct timeval* tv) +{ + cycle_t now, cycle_delta; + s64 nsec_delta; + + if (!__vsyscall_gtod_data.clock.vread) { + gettimeofday(tv, NULL); + return; + } + + /* read the clocksource and calc cycle_delta */ + now = __vsyscall_gtod_data.clock.vread(); + cycle_delta = (now - __vsyscall_gtod_data.clock.cycle_last) + & __vsyscall_gtod_data.clock.mask; + + /* convert cycles to nsecs */ + nsec_delta = cycle_delta * __vsyscall_gtod_data.clock.mult; + nsec_delta = nsec_delta >> __vsyscall_gtod_data.clock.shift; + + /* add the offset, converted to usecs, to wall_time_tv */ + *tv = __vsyscall_gtod_data.wall_time_tv; + do_div(nsec_delta, NSEC_PER_USEC); + tv->tv_usec += (unsigned long)nsec_delta; + while (tv->tv_usec >= USEC_PER_SEC) { + tv->tv_sec += 1; + tv->tv_usec -= USEC_PER_SEC; + } +} + +static inline void do_get_tz(struct timezone *tz) +{ + *tz = __vsyscall_gtod_data.sys_tz; +} + +static int __vsyscall(0) asmlinkage vgettimeofday(struct timeval *tv, struct timezone *tz) +{ + unsigned long seq; + do { + seq = read_seqbegin(&__vsyscall_gtod_lock); + + if (tv) + do_vgettimeofday(tv); + if (tz) + do_get_tz(tz); + + } while (read_seqretry(&__vsyscall_gtod_lock, seq)); + + return 0; +} + +static time_t __vsyscall(1) asmlinkage vtime(time_t * t) +{ + struct timeval tv; + vgettimeofday(&tv,NULL); + if (t) + *t = tv.tv_sec; + return tv.tv_sec; +} + +struct clocksource* curr_clock; + +void update_vsyscall(struct timespec *wall_time, + struct clocksource* clock) +{ + unsigned long flags; + + write_seqlock_irqsave(&vsyscall_gtod_lock, flags); + + /* XXX - hackitty hack hack. this is terrible!
*/ + if (curr_clock != clock) { + curr_clock = clock; + } + + /* save off wall time as timeval */ + vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000; + + /* copy current clocksource */ + vsyscall_gtod_data.clock = *clock; + + /* save off current timezone */ + vsyscall_gtod_data.sys_tz = sys_tz; + + write_sequnlock_irqrestore(&vsyscall_gtod_lock, flags); + +} +extern char __vsyscall_0; + +static void __init map_vsyscall(void) +{ + unsigned long physaddr_page0 = (unsigned long) &__vsyscall_0 - PAGE_OFFSET; + + /* Initially we map the VSYSCALL page w/ PAGE_KERNEL permissions to + * keep the alternate_instruction code from bombing out when it + * changes the seq_lock memory barriers in vgettimeofday() + */ + __set_fixmap(FIX_VSYSCALL_GTOD_FIRST_PAGE, physaddr_page0, PAGE_KERNEL); +} + +static int __init remap_vsyscall(void) +{ + unsigned long physaddr_page0 = (unsigned long) &__vsyscall_0 - PAGE_OFFSET; + + if (!vsyscall_mapped) + return 0; + + /* Remap the VSYSCALL page w/ PAGE_KERNEL_VSYSCALL permissions + * after the alternate_instruction code has run + */ + clear_fixmap(FIX_VSYSCALL_GTOD_FIRST_PAGE); + __set_fixmap(FIX_VSYSCALL_GTOD_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); + + return 0; +} + +int __init vsyscall_init(void) +{ + printk("VSYSCALL: consistency checks..."); + if ((unsigned long) &vgettimeofday != VSYSCALL_ADDR(__NR_vgettimeofday)) { + printk("vgettimeofday link addr broken\n"); + printk("VSYSCALL: vsyscall_init failed!\n"); + return -EFAULT; + } + if ((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)) { + printk("vtime link addr broken\n"); + printk("VSYSCALL: vsyscall_init failed!\n"); + return -EFAULT; + } + if (VSYSCALL_ADDR(0) != __fix_to_virt(FIX_VSYSCALL_GTOD_FIRST_PAGE)) { + printk("fixmap first vsyscall 0x%lx should be 0x%x\n", + __fix_to_virt(FIX_VSYSCALL_GTOD_FIRST_PAGE), + VSYSCALL_ADDR(0)); + printk("VSYSCALL: vsyscall_init failed!\n"); + return -EFAULT; + } + + + printk("passed...mapping..."); + map_vsyscall(); + printk("done.\n"); + vsyscall_mapped = 1; + printk("VSYSCALL: fixmap virt addr: 0x%lx\n", + __fix_to_virt(FIX_VSYSCALL_GTOD_FIRST_PAGE)); + + return 0; +} +__initcall(remap_vsyscall); diff -urN ./linux-2.6.18.1/arch/i386/lib/bitops.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/lib/bitops.c --- ./linux-2.6.18.1/arch/i386/lib/bitops.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/lib/bitops.c 2007-05-19 23:58:35.000000000 +0900 @@ -1,5 +1,11 @@ -#include + +#include +#include +#include #include +#include +#include +#include /** * find_next_bit - find the first set bit in a memory region @@ -68,3 +74,39 @@ return (offset + set + res); } EXPORT_SYMBOL(find_next_zero_bit); + + +/* + * rw spinlock fallbacks + */ +#ifdef CONFIG_SMP +asm( +".section .sched.text, \"ax\"\n" +".align 4\n" +".globl __write_lock_failed\n" +"__write_lock_failed:\n\t" + "lock; addl $" RW_LOCK_BIAS_STR ",(%eax)\n" +"1: rep; nop\n\t" + "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" + "jne 1b\n\t" + "lock; subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" + "jnz __write_lock_failed\n\t" + "ret\n" +".previous\n" +); + +asm( +".section .sched.text, \"ax\"\n" +".align 4\n" +".globl __read_lock_failed\n" +"__read_lock_failed:\n\t" + "lock; incl (%eax)\n" +"1: rep; nop\n\t" + "cmpl $1,(%eax)\n\t" + "js 1b\n\t" + "lock; decl (%eax)\n\t" + "js __read_lock_failed\n\t" + "ret\n" +".previous\n" +); +#endif diff -urN ./linux-2.6.18.1/arch/i386/mach-default/setup.c 
linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/mach-default/setup.c --- ./linux-2.6.18.1/arch/i386/mach-default/setup.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/mach-default/setup.c 2007-05-19 23:58:35.000000000 +0900 @@ -35,7 +35,7 @@ /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, IRQF_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; /** * intr_init_hook - post gate setup interrupt initialisation @@ -79,7 +79,7 @@ { } -static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL}; +static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED | IRQF_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL}; /** * time_init_hook - do any specific initialisations for the system timer. diff -urN ./linux-2.6.18.1/arch/i386/mach-visws/setup.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/mach-visws/setup.c --- ./linux-2.6.18.1/arch/i386/mach-visws/setup.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/mach-visws/setup.c 2007-05-19 23:58:35.000000000 +0900 @@ -115,7 +115,7 @@ static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NODELAY, .name = "timer", }; diff -urN ./linux-2.6.18.1/arch/i386/mach-visws/visws_apic.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/mach-visws/visws_apic.c --- ./linux-2.6.18.1/arch/i386/mach-visws/visws_apic.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/mach-visws/visws_apic.c 2007-05-19 23:58:35.000000000 +0900 @@ -259,11 +259,13 @@ static struct irqaction master_action = { .handler = piix4_master_intr, .name = "PIIX4-8259", + .flags = IRQF_NODELAY, }; static struct irqaction cascade_action = { .handler = no_action, .name = "cascade", + .flags = IRQF_NODELAY, }; diff -urN ./linux-2.6.18.1/arch/i386/mach-voyager/setup.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/mach-voyager/setup.c --- ./linux-2.6.18.1/arch/i386/mach-voyager/setup.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/mach-voyager/setup.c 2007-05-19 23:58:35.000000000 +0900 @@ -18,7 +18,7 @@ /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, IRQF_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; void __init intr_init_hook(void) { @@ -40,7 +40,7 @@ { } -static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL}; +static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED | IRQF_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL}; void __init time_init_hook(void) { diff -urN ./linux-2.6.18.1/arch/i386/mm/fault.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/mm/fault.c --- ./linux-2.6.18.1/arch/i386/mm/fault.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/mm/fault.c 2007-05-19 23:58:35.000000000 +0900 @@ -73,6 +73,9 @@ int loglevel_save = console_loglevel; if (yes) { + stop_trace(); + user_trace_stop(); + zap_rt_locks(); oops_in_progress = 1; return; } @@ -325,8 +328,8 @@ * bit 3 == 1 means use of reserved bit detected * bit 4 == 1 means fault was an instruction fetch */ -fastcall void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) +fastcall notrace void 
__kprobes do_page_fault(struct pt_regs *regs, + unsigned long error_code) { struct task_struct *tsk; struct mm_struct *mm; @@ -337,6 +340,7 @@ /* get the address */ address = read_cr2(); + trace_special(regs->eip, error_code, address); tsk = current; diff -urN ./linux-2.6.18.1/arch/i386/mm/highmem.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/mm/highmem.c --- ./linux-2.6.18.1/arch/i386/mm/highmem.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/mm/highmem.c 2007-05-19 23:58:35.000000000 +0900 @@ -18,6 +18,26 @@ kunmap_high(page); } +void kunmap_virt(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return; + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + kunmap(page); +} + +struct page *kmap_to_page(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return virt_to_page(ptr); + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + return page; +} + /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -26,7 +46,7 @@ * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; @@ -48,7 +68,7 @@ return (void*) vaddr; } -void kunmap_atomic(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr, enum km_type type) { #ifdef CONFIG_DEBUG_HIGHMEM unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; @@ -78,7 +98,7 @@ /* This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it.
*/ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; @@ -93,7 +113,7 @@ return (void*) vaddr; } -struct page *kmap_atomic_to_page(void *ptr) +struct page *__kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; @@ -108,6 +128,7 @@ EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); +EXPORT_SYMBOL(kunmap_virt); +EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(__kmap_atomic_to_page); diff -urN ./linux-2.6.18.1/arch/i386/mm/init.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/mm/init.c --- ./linux-2.6.18.1/arch/i386/mm/init.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/mm/init.c 2007-05-19 23:58:35.000000000 +0900 @@ -45,7 +45,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; static int noinline do_test_wp_bit(void); @@ -194,7 +194,7 @@ extern int is_available_memory(efi_memory_desc_t *); -int page_is_ram(unsigned long pagenr) +int notrace page_is_ram(unsigned long pagenr) { int i; unsigned long addr, end; diff -urN ./linux-2.6.18.1/arch/i386/mm/pgtable.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/mm/pgtable.c --- ./linux-2.6.18.1/arch/i386/mm/pgtable.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/mm/pgtable.c 2007-05-19 23:58:35.000000000 +0900 @@ -182,7 +182,7 @@ * recommendations and having no core impact whatsoever. * -- wli */ -DEFINE_SPINLOCK(pgd_lock); +DEFINE_RAW_SPINLOCK(pgd_lock); struct page *pgd_list; static inline void pgd_list_add(pgd_t *pgd) diff -urN ./linux-2.6.18.1/arch/i386/oprofile/Kconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/oprofile/Kconfig --- ./linux-2.6.18.1/arch/i386/oprofile/Kconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/oprofile/Kconfig 2007-05-19 23:58:35.000000000 +0900 @@ -15,3 +15,6 @@ If unsure, say N. 
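The kmap_atomic()/kunmap_atomic() entry points in the highmem.c hunks above are renamed to their double-underscore forms so that PREEMPT_RT can interpose wrappers: on RT the public names may take the sleepable kmap() path, while callers that really need the per-CPU fixmap slots use the __-prefixed versions. A minimal sketch of such a wrapper layer, assuming this split (the RT header itself is not part of the hunks shown here):

#ifdef CONFIG_PREEMPT_RT
/* Sketch: on RT an "atomic" kmap may simply take the sleepable path. */
static inline void *kmap_atomic(struct page *page, enum km_type type)
{
	return kmap(page);
}
static inline void kunmap_atomic(void *kvaddr, enum km_type type)
{
	kunmap_virt(kvaddr);	/* kunmap_virt() is added by this patch */
}
#else
/* Without RT, keep the classic fixmap-based atomic mappings. */
# define kmap_atomic(page, type)	__kmap_atomic(page, type)
# define kunmap_atomic(kvaddr, type)	__kunmap_atomic(kvaddr, type)
#endif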
+config PROFILE_NMI + bool + default y diff -urN ./linux-2.6.18.1/arch/i386/pci/Makefile linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/pci/Makefile --- ./linux-2.6.18.1/arch/i386/pci/Makefile 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/pci/Makefile 2007-05-19 23:58:35.000000000 +0900 @@ -4,8 +4,9 @@ obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o obj-$(CONFIG_PCI_DIRECT) += direct.o +obj-$(CONFIG_ACPI) += acpi.o + pci-y := fixup.o -pci-$(CONFIG_ACPI) += acpi.o pci-y += legacy.o irq.o pci-$(CONFIG_X86_VISWS) := visws.o fixup.o diff -urN ./linux-2.6.18.1/arch/i386/pci/direct.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/pci/direct.c --- ./linux-2.6.18.1/arch/i386/pci/direct.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/pci/direct.c 2007-05-19 23:58:35.000000000 +0900 @@ -220,16 +220,23 @@ unsigned int tmp; int works = 0; - local_irq_save(flags); + spin_lock_irqsave(&pci_config_lock, flags); outb(0x01, 0xCFB); tmp = inl(0xCF8); outl(0x80000000, 0xCF8); - if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) { - works = 1; + + if (inl(0xCF8) == 0x80000000) { + spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf1)) + works = 1; + + spin_lock_irqsave(&pci_config_lock, flags); } outl(tmp, 0xCF8); - local_irq_restore(flags); + + spin_unlock_irqrestore(&pci_config_lock, flags); return works; } @@ -239,17 +246,19 @@ unsigned long flags; int works = 0; - local_irq_save(flags); + spin_lock_irqsave(&pci_config_lock, flags); outb(0x00, 0xCFB); outb(0x00, 0xCF8); outb(0x00, 0xCFA); - if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 && - pci_sanity_check(&pci_direct_conf2)) { - works = 1; - } - local_irq_restore(flags); + if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00) { + spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf2)) + works = 1; + } else + spin_unlock_irqrestore(&pci_config_lock, flags); return works; } diff -urN ./linux-2.6.18.1/arch/i386/pci/irq.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/pci/irq.c --- ./linux-2.6.18.1/arch/i386/pci/irq.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/i386/pci/irq.c 2007-05-19 23:58:35.000000000 +0900 @@ -981,10 +981,6 @@ pci_name(bridge), 'A' + pin, irq); } if (irq >= 0) { - if (use_pci_vector() && - !platform_legacy_irq(irq)) - irq = IO_APIC_VECTOR(irq); - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", pci_name(dev), 'A' + pin, irq); dev->irq = irq; @@ -1169,33 +1165,3 @@ } return 0; } - -int pci_vector_resources(int last, int nr_released) -{ - int count = nr_released; - - int next = last; - int offset = (last % 8); - - while (next < FIRST_SYSTEM_VECTOR) { - next += 8; -#ifdef CONFIG_X86_64 - if (next == IA32_SYSCALL_VECTOR) - continue; -#else - if (next == SYSCALL_VECTOR) - continue; -#endif - count++; - if (next >= FIRST_SYSTEM_VECTOR) { - if (offset%8) { - next = FIRST_DEVICE_VECTOR + offset; - offset++; - continue; - } - count--; - } - } - - return count; -} diff -urN ./linux-2.6.18.1/arch/ia64/Kconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/Kconfig --- ./linux-2.6.18.1/arch/ia64/Kconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/Kconfig 2007-05-19 23:58:35.000000000 +0900 @@ -32,6 +32,7 @@ config RWSEM_XCHGADD_ALGORITHM bool + depends on !PREEMPT_RT default y config GENERIC_FIND_NEXT_BIT @@ -42,7 +43,11 @@ bool default y -config TIME_INTERPOLATION +config GENERIC_TIME + bool + 
default y + +config GENERIC_TIME_VSYSCALL bool default y @@ -258,6 +263,69 @@ If you don't know what to do here, say N. + +config GENERIC_TIME + bool + default y + +config HIGH_RES_TIMERS + bool "High-Resolution Timers" + help + + POSIX timers are available by default. This option enables + high-resolution POSIX timers. With this option the resolution + is at least 1 microsecond. High resolution is not free. If + enabled this option will add a small overhead each time a + timer expires that is not on a 1/HZ tick boundary. If no such + timers are used the overhead is nil. + + This option enables two additional POSIX CLOCKS, + CLOCK_REALTIME_HR and CLOCK_MONOTONIC_HR. Note that this + option does not change the resolution of CLOCK_REALTIME or + CLOCK_MONOTONIC which remain at 1/HZ resolution. + +config HIGH_RES_RESOLUTION + int "High-Resolution-Timer resolution (nanoseconds)" + depends on HIGH_RES_TIMERS + default 1000 + help + + This sets the resolution of timers accessed with + CLOCK_REALTIME_HR and CLOCK_MONOTONIC_HR. Too + fine a resolution (too small a number) will usually not + be observable due to normal system latencies. For an + 800 MHz processor about 10,000 is the recommended maximum + (smallest number). If you don't need that sort of resolution, + higher numbers may generate less overhead. + +choice + prompt "Clock source" + depends on HIGH_RES_TIMERS + default HIGH_RES_TIMER_ITC + help + This option allows you to choose the hardware source in charge + of generating high-precision interrupts on your system. + On IA-64 these are: + + + ITC Interval Time Counter 1/CPU clock + HPET High Precision Event Timer ~ (XXX:have to check the spec) + + The ITC timer is available on all the ia64 computers because + it is integrated directly into the processor. However it may not + give correct results on MP machines with processors running + at different clock rates. In this case you may want to use + the HPET if available on your machine. + + +config HIGH_RES_TIMER_ITC + bool "Interval Time Counter/ITC" + +config HIGH_RES_TIMER_HPET + bool "High Precision Event Timer/HPET" + +endchoice + config NR_CPUS int "Maximum number of CPUs (2-1024)" range 2 1024 @@ -310,17 +378,15 @@ This option is useful to enable this feature on older BIOS's as well. You can also enable this by using boot command line option force_cpei=1. -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. +source "kernel/Kconfig.preempt" - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure.
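To see what the 1/HZ versus high-resolution distinction in the help text above means in practice, a small user-space probe can report the advertised granularity of the standard clocks. The CLOCK_*_HR IDs are kernel extensions and are not assumed to exist in libc, so this sketch sticks to portable POSIX calls (build with e.g. gcc -o clockres clockres.c -lrt):

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec res;

	/* With plain 1/HZ timekeeping both clocks report 1000000 ns at
	 * HZ=1000; a high-resolution clock reports a finer granularity. */
	if (clock_getres(CLOCK_REALTIME, &res) == 0)
		printf("CLOCK_REALTIME resolution:  %ld ns\n", res.tv_nsec);
	if (clock_getres(CLOCK_MONOTONIC, &res) == 0)
		printf("CLOCK_MONOTONIC resolution: %ld ns\n", res.tv_nsec);
	return 0;
}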
+config RWSEM_GENERIC_SPINLOCK + bool + depends on PREEMPT_RT + default y + +config PREEMPT + def_bool y if (PREEMPT_RT || PREEMPT_SOFTIRQS || PREEMPT_HARDIRQS || PREEMPT_VOLUNTARY || PREEMPT_DESKTOP) source "mm/Kconfig" diff -urN ./linux-2.6.18.1/arch/ia64/configs/bigsur_defconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/configs/bigsur_defconfig --- ./linux-2.6.18.1/arch/ia64/configs/bigsur_defconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/configs/bigsur_defconfig 2007-05-19 23:58:35.000000000 +0900 @@ -85,7 +85,7 @@ CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y diff -urN ./linux-2.6.18.1/arch/ia64/configs/gensparse_defconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/configs/gensparse_defconfig --- ./linux-2.6.18.1/arch/ia64/configs/gensparse_defconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/configs/gensparse_defconfig 2007-05-19 23:58:35.000000000 +0900 @@ -86,7 +86,7 @@ CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y diff -urN ./linux-2.6.18.1/arch/ia64/configs/sim_defconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/configs/sim_defconfig --- ./linux-2.6.18.1/arch/ia64/configs/sim_defconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/configs/sim_defconfig 2007-05-19 23:58:35.000000000 +0900 @@ -86,7 +86,7 @@ CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y diff -urN ./linux-2.6.18.1/arch/ia64/configs/sn2_defconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/configs/sn2_defconfig --- ./linux-2.6.18.1/arch/ia64/configs/sn2_defconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/configs/sn2_defconfig 2007-05-19 23:58:35.000000000 +0900 @@ -83,7 +83,7 @@ CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_FIND_NEXT_BIT=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_DMI=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y diff -urN ./linux-2.6.18.1/arch/ia64/configs/tiger_defconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/configs/tiger_defconfig --- ./linux-2.6.18.1/arch/ia64/configs/tiger_defconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/configs/tiger_defconfig 2007-05-19 23:58:35.000000000 +0900 @@ -86,7 +86,7 @@ CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y diff -urN ./linux-2.6.18.1/arch/ia64/configs/zx1_defconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/configs/zx1_defconfig --- ./linux-2.6.18.1/arch/ia64/configs/zx1_defconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/configs/zx1_defconfig 2007-05-19 23:58:35.000000000 +0900 @@ -84,7 +84,7 @@ CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y 
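The defconfig flips from CONFIG_TIME_INTERPOLATION=y to CONFIG_GENERIC_TIME=y here track the replacement of the ia64 time interpolators with GENERIC_TIME clocksources, which the cyclone and ITC conversions below carry out. For reference, a minimal clocksource in this kernel's API looks roughly like the following sketch (the field set mirrors the tsc and cyclone hunks in this patch; the hardware read and the 1 MHz rate are placeholders):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/clocksource.h>

static cycle_t read_mycounter(void)
{
	return (cycle_t)0;	/* placeholder: read the hardware counter here */
}

static struct clocksource clocksource_mycounter = {
	.name		= "mycounter",
	.rating		= 200,
	.read		= read_mycounter,
	.mask		= (cycle_t)-1,
	.mult		= 0,	/* computed from the counter frequency below */
	.shift		= 16,
	.is_continuous	= 1,
};

static int __init mycounter_clocksource_init(void)
{
	/* assumed rate: 1 MHz; a real driver uses its measured frequency */
	clocksource_mycounter.mult =
		clocksource_hz2mult(1000000, clocksource_mycounter.shift);
	return clocksource_register(&clocksource_mycounter);
}
module_init(mycounter_clocksource_init);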
CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y diff -urN ./linux-2.6.18.1/arch/ia64/defconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/defconfig --- ./linux-2.6.18.1/arch/ia64/defconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/defconfig 2007-05-19 23:58:35.000000000 +0900 @@ -86,7 +86,7 @@ CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y diff -urN ./linux-2.6.18.1/arch/ia64/kernel/asm-offsets.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/asm-offsets.c --- ./linux-2.6.18.1/arch/ia64/kernel/asm-offsets.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/asm-offsets.c 2007-05-19 23:58:35.000000000 +0900 @@ -7,6 +7,7 @@ #define ASM_OFFSETS_C 1 #include +#include #include #include @@ -254,18 +255,13 @@ offsetof (struct pal_min_state_area_s, pmsa_xip)); BLANK(); +#ifdef CONFIG_TIME_INTERPOLATION /* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */ - DEFINE(IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET, offsetof (struct time_interpolator, addr)); - DEFINE(IA64_TIME_INTERPOLATOR_SOURCE_OFFSET, offsetof (struct time_interpolator, source)); - DEFINE(IA64_TIME_INTERPOLATOR_SHIFT_OFFSET, offsetof (struct time_interpolator, shift)); - DEFINE(IA64_TIME_INTERPOLATOR_NSEC_OFFSET, offsetof (struct time_interpolator, nsec_per_cyc)); - DEFINE(IA64_TIME_INTERPOLATOR_OFFSET_OFFSET, offsetof (struct time_interpolator, offset)); - DEFINE(IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET, offsetof (struct time_interpolator, last_cycle)); - DEFINE(IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET, offsetof (struct time_interpolator, last_counter)); - DEFINE(IA64_TIME_INTERPOLATOR_JITTER_OFFSET, offsetof (struct time_interpolator, jitter)); - DEFINE(IA64_TIME_INTERPOLATOR_MASK_OFFSET, offsetof (struct time_interpolator, mask)); - DEFINE(IA64_TIME_SOURCE_CPU, TIME_SOURCE_CPU); - DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64); - DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32); DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec)); + DEFINE(IA64_CLOCKSOURCE_MASK_OFFSET, offsetof (struct clocksource, mask)); + DEFINE(IA64_CLOCKSOURCE_MULT_OFFSET, offsetof (struct clocksource, mult)); + DEFINE(IA64_CLOCKSOURCE_SHIFT_OFFSET, offsetof (struct clocksource, shift)); + DEFINE(IA64_CLOCKSOURCE_MMIO_PTR_OFFSET, offsetof (struct clocksource, fsys_mmio_ptr)); + DEFINE(IA64_CLOCKSOURCE_CYCLE_LAST_OFFSET, offsetof (struct clocksource, cycle_last)); +#endif } diff -urN ./linux-2.6.18.1/arch/ia64/kernel/cyclone.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/cyclone.c --- ./linux-2.6.18.1/arch/ia64/kernel/cyclone.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/cyclone.c 2007-05-19 23:58:35.000000000 +0900 @@ -3,6 +3,7 @@ #include #include #include +#include #include /* IBM Summit (EXA) Cyclone counter code*/ @@ -18,13 +19,21 @@ use_cyclone = 1; } +static void __iomem *cyclone_mc_ptr; -struct time_interpolator cyclone_interpolator = { - .source = TIME_SOURCE_MMIO64, - .shift = 16, - .frequency = CYCLONE_TIMER_FREQ, - .drift = -100, - .mask = (1LL << 40) - 1 +static cycle_t read_cyclone(void) +{ + return (cycle_t)readq((void __iomem *)cyclone_mc_ptr); +} + +static struct clocksource clocksource_cyclone = { + .name = "cyclone", + .rating = 300, + .read = read_cyclone, + .mask = (1LL << 40) - 1, + .mult = 0, /*to be 
calculated*/ + .shift = 16, + .is_continuous = 1, }; int __init init_cyclone_clock(void) { @@ -101,8 +110,10 @@ } } /* initialize last tick */ - cyclone_interpolator.addr = cyclone_timer; - register_time_interpolator(&cyclone_interpolator); + clocksource_cyclone.fsys_mmio_ptr = cyclone_mc_ptr = cyclone_timer; + clocksource_cyclone.mult = clocksource_hz2mult(CYCLONE_TIMER_FREQ, + clocksource_cyclone.shift); + clocksource_register(&clocksource_cyclone); return 0; } diff -urN ./linux-2.6.18.1/arch/ia64/kernel/entry.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/entry.S --- ./linux-2.6.18.1/arch/ia64/kernel/entry.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/entry.S 2007-05-19 23:58:35.000000000 +0900 @@ -1101,23 +1101,24 @@ st8 [r2]=r8 st8 [r3]=r10 .work_pending: - tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? + tbit.nz p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched != 0? +(p6) br.cond.sptk.few .needresched + tbit.z p6,p0=r31,TIF_NEED_RESCHED_DELAYED // current_thread_info()->need_resched_delayed==0? (p6) br.cond.sptk.few .notify -#ifdef CONFIG_PREEMPT -(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1 + +.needresched: + +(pKStk) br.cond.sptk.many .fromkernel ;; -(pKStk) st4 [r20]=r21 ssm psr.i // enable interrupts -#endif br.call.spnt.many rp=schedule -.ret9: cmp.eq p6,p0=r0,r0 // p6 <- 1 - rsm psr.i // disable interrupts - ;; -#ifdef CONFIG_PREEMPT -(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13 - ;; -(pKStk) st4 [r20]=r0 // preempt_count() <- 0 -#endif +.ret9a: rsm psr.i // disable interrupts + ;; + br.cond.sptk.many .endpreemptdep +.fromkernel: + br.call.spnt.many rp=preempt_schedule_irq +.ret9b: rsm psr.i // disable interrupts +.endpreemptdep: (pLvSys)br.cond.sptk.few .work_pending_syscall_end br.cond.sptk.many .work_processed_kernel // re-check diff -urN ./linux-2.6.18.1/arch/ia64/kernel/fsys.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/fsys.S --- ./linux-2.6.18.1/arch/ia64/kernel/fsys.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/fsys.S 2007-05-19 23:58:35.000000000 +0900 @@ -24,6 +24,7 @@ #include "entry.h" +#ifdef CONFIG_TIME_INTERPOLATION /* * See Documentation/ia64/fsys.txt for details on fsyscalls.
* @@ -145,13 +146,6 @@ FSYS_RETURN END(fsys_set_tid_address) -/* - * Ensure that the time interpolator structure is compatible with the asm code -#if IA64_TIME_INTERPOLATOR_SOURCE_OFFSET !=0 || IA64_TIME_INTERPOLATOR_SHIFT_OFFSET != 2 \ - || IA64_TIME_INTERPOLATOR_JITTER_OFFSET != 3 || IA64_TIME_INTERPOLATOR_NSEC_OFFSET != 4 -#error fsys_gettimeofday incompatible with changes to struct time_interpolator -#endif #define CLOCK_REALTIME 0 #define CLOCK_MONOTONIC 1 #define CLOCK_DIVIDE_BY_1000 0x4000 @@ -177,19 +171,18 @@ // r11 = preserved: saved ar.pfs // r12 = preserved: memory stack // r13 = preserved: thread pointer - // r14 = address of mask / mask + // r14 = address of mask / mask value // r15 = preserved: system call number // r16 = preserved: current task pointer // r17 = wall to monotonic use - // r18 = time_interpolator->offset - // r19 = address of wall_to_monotonic - // r20 = pointer to struct time_interpolator / pointer to time_interpolator->address - // r21 = shift factor - // r22 = address of time interpolator->last_counter - // r23 = address of time_interpolator->last_cycle - // r24 = adress of time_interpolator->offset - // r25 = last_cycle value - // r26 = last_counter value + // r19 = address of itc_lastcycle + // r20 = struct clocksource / address of first element + // r21 = shift value + // r22 = address of itc_jitter / wall_to_monotonic + // r23 = address of shift + // r24 = address of mult factor / cycle_last value + // r25 = itc_lastcycle value + // r26 = address of clocksource cycle_last // r27 = pointer to xtime // r28 = sequence number at the beginning of critical section // r29 = address of seqlock @@ -199,9 +192,9 @@ // p6,p7 short term use // p8 = timesource ar.itc // p9 = timesource mmio64 - // p10 = timesource mmio32 + // p10 = timesource mmio32 - not used // p11 = timesource not to be handled by asm code - // p12 = memory time source ( = p9 | p10) + // p12 = memory time source ( = p9 | p10) - not used // p13 = do cmpxchg with time_interpolator_last_cycle // p14 = Divide by 1000 // p15 = Add monotonic @@ -212,61 +205,55 @@ tnat.nz p6,p0 = r31 // branch deferred since it does not fit into bundle structure mov pr = r30,0xc000 // Set predicates according to function add r2 = TI_FLAGS+IA64_TASK_SIZE,r16 - movl r20 = time_interpolator + movl r20 = fsyscall_clock // load fsyscall clocksource address ;; - ld8 r20 = [r20] // get pointer to time_interpolator structure + add r10 = IA64_CLOCKSOURCE_MMIO_PTR_OFFSET,r20 movl r29 = xtime_lock ld4 r2 = [r2] // process work pending flags movl r27 = xtime ;; // only one bundle here - ld8 r21 = [r20] // first quad with control information + add r14 = IA64_CLOCKSOURCE_MASK_OFFSET,r20 + movl r22 = itc_jitter + add r24 = IA64_CLOCKSOURCE_MULT_OFFSET,r20 and r2 = TIF_ALLWORK_MASK,r2 (p6) br.cond.spnt.few .fail_einval // deferred branch ;; - add r10 = IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET,r20 - extr r3 = r21,32,32 // time_interpolator->nsec_per_cyc - extr r8 = r21,0,16 // time_interpolator->source + ld8 r30 = [r10] // clocksource->mmio_ptr + movl r19 = itc_lastcycle + add r23 = IA64_CLOCKSOURCE_SHIFT_OFFSET,r20 cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled (p6) br.cond.spnt.many fsys_fallback_syscall ;; - cmp.eq p8,p12 = 0,r8 // Check for cpu timer - cmp.eq p9,p0 = 1,r8 // MMIO64 ? - extr r2 = r21,24,8 // time_interpolator->jitter - cmp.eq p10,p0 = 2,r8 // MMIO32 ?
- cmp.ltu p11,p0 = 2,r8 // function or other clock -(p11) br.cond.spnt.many fsys_fallback_syscall - ;; - setf.sig f7 = r3 // Setup for scaling of counter -(p15) movl r19 = wall_to_monotonic -(p12) ld8 r30 = [r10] - cmp.ne p13,p0 = r2,r0 // need jitter compensation? - extr r21 = r21,16,8 // shift factor + ld8 r14 = [r14] // clocksource mask value + ld4 r2 = [r22] // itc_jitter value + add r26 = IA64_CLOCKSOURCE_CYCLE_LAST_OFFSET,r20 // clock fsyscall_cycle_last + ld4 r3 = [r24] // clocksource->mult value + cmp.eq p8,p9 = 0,r30 // Check for cpu timer, no mmio_ptr, set p8, clear p9 + ;; + setf.sig f7 = r3 // Setup for mult scaling of counter +(p15) movl r22 = wall_to_monotonic + ld4 r21 = [r23] // shift value +(p8) cmp.ne p13,p0 = r2,r0 // need jitter compensation, set p13 +(p9) cmp.eq p13,p0 = 0,r30 // if mmio_ptr, clear p13 jitter control ;; .time_redo: .pred.rel.mutex p8,p9,p10 ld4.acq r28 = [r29] // xtime_lock.sequence. Must come first for locking purposes (p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!! - add r22 = IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET,r20 (p9) ld8 r2 = [r30] // readq(ti->address). Could also have latency issues.. -(p10) ld4 r2 = [r30] // readw(ti->address) -(p13) add r23 = IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET,r20 +(p13) ld8 r25 = [r19] // get itc_lastcycle value ;; // could be removed by moving the last add upward - ld8 r26 = [r22] // time_interpolator->last_counter -(p13) ld8 r25 = [r23] // time interpolator->last_cycle - add r24 = IA64_TIME_INTERPOLATOR_OFFSET_OFFSET,r20 -(p15) ld8 r17 = [r19],IA64_TIMESPEC_TV_NSEC_OFFSET ld8 r9 = [r27],IA64_TIMESPEC_TV_NSEC_OFFSET - add r14 = IA64_TIME_INTERPOLATOR_MASK_OFFSET, r20 + ld8 r24 = [r26] // get fsyscall_cycle_last value +(p15) ld8 r17 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET ;; - ld8 r18 = [r24] // time_interpolator->offset ld8 r8 = [r27],-IA64_TIMESPEC_TV_NSEC_OFFSET // xtime.tv_nsec -(p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm) +(p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm) ;; - ld8 r14 = [r14] // time_interpolator->mask -(p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared - sub r10 = r2,r26 // current_counter - last_counter +(p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared + sub r10 = r2,r24 // current_counter - last_counter ;; -(p6) sub r10 = r25,r26 // time we got was less than last_cycle +(p6) sub r10 = r25,r24 // time we got was less than last_cycle (p7) mov ar.ccv = r25 // more than last_cycle. Prep for cmpxchg ;; and r10 = r10,r14 // Apply mask @@ -274,22 +261,21 @@ setf.sig f8 = r10 nop.i 123 ;; -(p7) cmpxchg8.rel r3 = [r23],r2,ar.ccv +(p7) cmpxchg8.rel r3 = [r19],r2,ar.ccv EX(.fail_efault, probe.w.fault r31, 3) // This takes 5 cycles and we have spare time xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter) (p15) add r9 = r9,r17 // Add wall to monotonic.secs to result secs ;; -(p15) ld8 r17 = [r19],-IA64_TIMESPEC_TV_NSEC_OFFSET +(p15) ld8 r17 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET (p7) cmp.ne p7,p0 = r25,r3 // if cmpxchg not successful redo // simulate tbit.nz.or p7,p0 = r28,0 and r28 = ~1,r28 // Make sequence even to force retry if odd getf.sig r2 = f8 mf - add r8 = r8,r18 // Add time interpolator offset ;; ld4 r10 = [r29] // xtime_lock.sequence (p15) add r8 = r8, r17 // Add monotonic.nsecs to nsecs - shr.u r2 = r2,r21 + shr.u r2 = r2,r21 // shift by factor ;; // overloaded 3 bundles! // End critical section. 
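The predicate juggling above implements the usual seqlock-protected clocksource read; stripped of the ITC-jitter cmpxchg handling, the fast path is roughly this C sketch (names simplified, not the kernel's literal helpers):

/* Sketch of the fsys_gettimeofday fast path shown above in assembly. */
static struct clocksource *clock;	/* the registered fsyscall clocksource */

static void vgettime_sketch(struct timespec *ts)
{
	unsigned long seq;
	cycle_t delta;
	u64 nsec;

	do {
		seq   = read_seqbegin(&xtime_lock);
		/* ar.itc or the MMIO counter, minus the last update point */
		delta = (clock->read() - clock->cycle_last) & clock->mask;
		*ts   = xtime;
		/* scale cycles to nanoseconds: (delta * mult) >> shift */
		nsec  = ((u64)delta * clock->mult) >> clock->shift;
	} while (read_seqretry(&xtime_lock, seq));

	ts->tv_nsec += nsec;
	while (ts->tv_nsec >= NSEC_PER_SEC) {
		ts->tv_nsec -= NSEC_PER_SEC;
		ts->tv_sec++;
	}
}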
add r8 = r8,r2 // Add xtime.nsecs @@ -348,6 +334,26 @@ br.many .gettime END(fsys_clock_gettime) + +#else // !CONFIG_TIME_INTERPOLATION + +# define fsys_gettimeofday 0 +# define fsys_clock_gettime 0 + +.fail_einval: + mov r8 = EINVAL + mov r10 = -1 + FSYS_RETURN + +.fail_efault: + mov r8 = EFAULT + mov r10 = -1 + FSYS_RETURN + +#endif + + + /* * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize). */ diff -urN ./linux-2.6.18.1/arch/ia64/kernel/iosapic.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/iosapic.c --- ./linux-2.6.18.1/arch/ia64/kernel/iosapic.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/iosapic.c 2007-05-19 23:58:35.000000000 +0900 @@ -112,7 +112,7 @@ (PAGE_SIZE / sizeof(struct iosapic_rte_info)) #define RTE_PREALLOCATED (1) -static DEFINE_SPINLOCK(iosapic_lock); +static DEFINE_RAW_SPINLOCK(iosapic_lock); /* * These tables map IA-64 vectors to the IOSAPIC pin that generates this @@ -409,6 +409,34 @@ return 0; } +/* + * In the preemptible case mask the IRQ first then handle it and ack it. + */ +#ifdef CONFIG_PREEMPT_HARDIRQS + +static void +iosapic_ack_level_irq (unsigned int irq) +{ + ia64_vector vec = irq_to_vector(irq); + struct iosapic_rte_info *rte; + + move_irq(irq); + mask_irq(irq); + list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list) + iosapic_eoi(rte->addr, vec); +} + +static void +iosapic_end_level_irq (unsigned int irq) +{ + if (!(irq_desc[irq].status & IRQ_INPROGRESS)) + unmask_irq(irq); +} + +#else /* !CONFIG_PREEMPT_HARDIRQS */ + +#define iosapic_ack_level_irq nop + static void iosapic_end_level_irq (unsigned int irq) { @@ -420,10 +448,12 @@ iosapic_eoi(rte->addr, vec); } + +#endif + #define iosapic_shutdown_level_irq mask_irq #define iosapic_enable_level_irq unmask_irq #define iosapic_disable_level_irq mask_irq -#define iosapic_ack_level_irq nop struct hw_interrupt_type irq_type_iosapic_level = { .typename = "IO-SAPIC-level", diff -urN ./linux-2.6.18.1/arch/ia64/kernel/irq_ia64.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/irq_ia64.c --- ./linux-2.6.18.1/arch/ia64/kernel/irq_ia64.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/irq_ia64.c 2007-05-19 23:58:35.000000000 +0900 @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -105,6 +106,25 @@ return test_and_set_bit(pos, ia64_vector_mask); } +/* + * Dynamic irq allocate and deallocation for MSI + */ +int create_irq(void) +{ + int vector = assign_irq_vector(AUTO_ASSIGN); + + if (vector >= 0) + dynamic_irq_init(vector); + + return vector; +} + +void destroy_irq(unsigned int irq) +{ + dynamic_irq_cleanup(irq); + free_irq_vector(irq); +} + #ifdef CONFIG_SMP # define IS_RESCHEDULE(vec) (vec == IA64_IPI_RESCHEDULE) #else diff -urN ./linux-2.6.18.1/arch/ia64/kernel/mca.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/mca.c --- ./linux-2.6.18.1/arch/ia64/kernel/mca.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/mca.c 2007-05-19 23:58:35.000000000 +0900 @@ -152,7 +152,7 @@ typedef struct ia64_state_log_s { - spinlock_t isl_lock; + raw_spinlock_t isl_lock; int isl_index; unsigned long isl_count; ia64_err_rec_t *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */ diff -urN ./linux-2.6.18.1/arch/ia64/kernel/perfmon.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/perfmon.c --- ./linux-2.6.18.1/arch/ia64/kernel/perfmon.c 2006-10-14 12:34:03.000000000 +0900 +++ 
linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/perfmon.c 2007-05-19 23:58:35.000000000 +0900 @@ -277,7 +277,7 @@ */ typedef struct pfm_context { - spinlock_t ctx_lock; /* context protection */ + raw_spinlock_t ctx_lock; /* context protection */ pfm_context_flags_t ctx_flags; /* bitmask of flags (block reason incl.) */ unsigned int ctx_state; /* state: active/inactive (no bitfield) */ @@ -363,7 +363,7 @@ * mostly used to synchronize between system wide and per-process */ typedef struct { - spinlock_t pfs_lock; /* lock the structure */ + raw_spinlock_t pfs_lock; /* lock the structure */ unsigned int pfs_task_sessions; /* number of per task sessions */ unsigned int pfs_sys_sessions; /* number of per system wide sessions */ @@ -504,7 +504,7 @@ static struct proc_dir_entry *perfmon_dir; static pfm_uuid_t pfm_null_uuid = {0,}; -static spinlock_t pfm_buffer_fmt_lock; +static raw_spinlock_t pfm_buffer_fmt_lock; static LIST_HEAD(pfm_buffer_fmt_list); static pmu_config_t *pmu_conf; diff -urN ./linux-2.6.18.1/arch/ia64/kernel/process.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/process.c --- ./linux-2.6.18.1/arch/ia64/kernel/process.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/process.c 2007-05-19 23:58:35.000000000 +0900 @@ -96,6 +96,9 @@ void dump_stack (void) { + if (irqs_disabled()) { + printk("Uh oh.. entering dump_stack() with irqs disabled.\n"); + } show_stack(NULL, NULL); } @@ -199,7 +202,7 @@ default_idle (void) { local_irq_enable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { if (can_do_pal_halt) safe_halt(); else @@ -275,7 +278,7 @@ else current_thread_info()->status |= TS_POLLING; - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { void (*idle)(void); #ifdef CONFIG_SMP min_xtp(); @@ -297,10 +300,11 @@ normal_xtp(); #endif } - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); + preempt_disable(); - check_pgt_cache(); + if (cpu_is_offline(cpu)) play_dead(); } diff -urN ./linux-2.6.18.1/arch/ia64/kernel/sal.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/sal.c --- ./linux-2.6.18.1/arch/ia64/kernel/sal.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/sal.c 2007-05-19 23:58:35.000000000 +0900 @@ -18,7 +18,7 @@ #include #include - __cacheline_aligned DEFINE_SPINLOCK(sal_lock); + __cacheline_aligned DEFINE_RAW_SPINLOCK(sal_lock); unsigned long sal_platform_features; unsigned short sal_revision; diff -urN ./linux-2.6.18.1/arch/ia64/kernel/salinfo.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/salinfo.c --- ./linux-2.6.18.1/arch/ia64/kernel/salinfo.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/salinfo.c 2007-05-19 23:58:35.000000000 +0900 @@ -141,7 +141,7 @@ struct salinfo_data { cpumask_t cpu_event; /* which cpus have outstanding events */ - struct semaphore mutex; + struct compat_semaphore mutex; u8 *log_buffer; u64 log_size; u8 *oemdata; /* decoded oem data */ @@ -157,8 +157,8 @@ static struct salinfo_data salinfo_data[ARRAY_SIZE(salinfo_log_name)]; -static DEFINE_SPINLOCK(data_lock); -static DEFINE_SPINLOCK(data_saved_lock); +static DEFINE_RAW_SPINLOCK(data_lock); +static DEFINE_RAW_SPINLOCK(data_saved_lock); /** salinfo_platform_oemdata - optional callback to decode oemdata from an error * record. 
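A note on the lock conversions running through the ia64 hunks above (iosapic_lock, isl_lock, the perfmon ctx/pfs/buffer-fmt locks, sal_lock, data_lock, data_saved_lock): under PREEMPT_RT a plain spinlock_t becomes a sleeping, priority-inheriting lock, so any lock that is taken from hard interrupt context, MCA/SAL error handling, or other code that cannot schedule must be declared raw to keep the traditional spin-with-interrupts-off behaviour. The following is a minimal sketch of the distinction, not part of the patch; all names are invented, and it assumes the 2.6.18-rt convention that the ordinary spin_lock_irqsave() family dispatches on the declared lock type:

/* Minimal sketch (not part of the patch): lock-type selection on -rt.
 * All names here are invented for illustration. */
#include <linux/spinlock.h>
#include <linux/interrupt.h>

static DEFINE_RAW_SPINLOCK(hw_lock);	/* hard-IRQ path: must truly spin */
static DEFINE_SPINLOCK(stats_lock);	/* may sleep under PREEMPT_RT */

static unsigned long hw_word, stats_word;

static irqreturn_t sample_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	unsigned long flags;

	/* Raw locks keep IRQs off and never schedule, so this is safe
	 * even when the handler runs in true hard-IRQ context. */
	spin_lock_irqsave(&hw_lock, flags);
	hw_word++;
	spin_unlock_irqrestore(&hw_lock, flags);
	return IRQ_HANDLED;
}

static void sample_account(void)
{
	/* Under PREEMPT_RT this acquires a sleeping lock; the critical
	 * section stays preemptible, which is fine in process context. */
	spin_lock(&stats_lock);
	stats_word++;
	spin_unlock(&stats_lock);
}

Declaring a lock raw is not free: every raw critical section is a hard latency source, which is why the patch converts only the handful of locks that genuinely cannot sleep.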
diff -urN ./linux-2.6.18.1/arch/ia64/kernel/semaphore.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/semaphore.c --- ./linux-2.6.18.1/arch/ia64/kernel/semaphore.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/semaphore.c 2007-05-19 23:58:35.000000000 +0900 @@ -40,12 +40,12 @@ */ void -__up (struct semaphore *sem) +__up (struct compat_semaphore *sem) { wake_up(&sem->wait); } -void __sched __down (struct semaphore *sem) +void __sched __down (struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -82,7 +82,7 @@ tsk->state = TASK_RUNNING; } -int __sched __down_interruptible (struct semaphore * sem) +int __sched __down_interruptible (struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -142,7 +142,7 @@ * count. */ int -__down_trylock (struct semaphore *sem) +__down_trylock (struct compat_semaphore *sem) { unsigned long flags; int sleepers; diff -urN ./linux-2.6.18.1/arch/ia64/kernel/signal.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/signal.c --- ./linux-2.6.18.1/arch/ia64/kernel/signal.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/signal.c 2007-05-19 23:58:35.000000000 +0900 @@ -487,6 +487,14 @@ long errno = scr->pt.r8; # define ERR_CODE(c) (IS_IA32_PROCESS(&scr->pt) ? -(c) : (c)) +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif + /* * In the ia64_leave_kernel code path, we want the common case to go fast, which * is why we may in certain cases get here from kernel mode. Just return without diff -urN ./linux-2.6.18.1/arch/ia64/kernel/smp.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/smp.c --- ./linux-2.6.18.1/arch/ia64/kernel/smp.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/smp.c 2007-05-19 23:58:35.000000000 +0900 @@ -222,6 +222,22 @@ platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (cpu != smp_processor_id()) + platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0); + } +} + + void smp_flush_tlb_all (void) { diff -urN ./linux-2.6.18.1/arch/ia64/kernel/smpboot.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/smpboot.c --- ./linux-2.6.18.1/arch/ia64/kernel/smpboot.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/smpboot.c 2007-05-19 23:58:35.000000000 +0900 @@ -371,6 +371,8 @@ { } +extern void register_itc_clockevent(void); + static void __devinit smp_callin (void) { @@ -430,6 +432,7 @@ #ifdef CONFIG_IA32_SUPPORT ia32_gdt_init(); #endif + register_itc_clockevent(); /* * Allow the master to continue. 
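The semaphore rework just above is the companion piece on the sleeping-lock side: 2.6.18-rt reimplements struct semaphore on top of the rtmutex, and users that depend on the classic counting/wake-up semantics (the ia64 __up/__down/__down_interruptible/__down_trylock primitives here, salinfo's log mutex earlier) are moved to struct compat_semaphore, which keeps the old wait-queue implementation. A rough usage sketch follows; it is not from the patch, the producer/consumer names are invented, and it assumes the -rt series' type-dispatching sema_init()/up()/down_interruptible() macros:

/* Minimal sketch (not part of the patch): keeping classic semaphore
 * semantics on -rt by declaring the object compat. */
#include <asm/semaphore.h>
#include <linux/errno.h>

static struct compat_semaphore log_ready;

static void sample_init(void)
{
	sema_init(&log_ready, 0);	/* start empty; consumers block */
}

/* May be called from interrupt context: up() never sleeps. */
static void sample_record_event(void)
{
	up(&log_ready);
}

/* Process context: sleeps until sample_record_event() posts an event. */
static int sample_consume_one(void)
{
	if (down_interruptible(&log_ready))
		return -EINTR;
	/* ... drain one record ... */
	return 0;
}

The practical rule the patch applies: anything signalled from interrupt context or used as a cross-task completion stays compat, while plain mutual exclusion migrates to the rtmutex-backed semaphore and gains priority inheritance.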
diff -urN ./linux-2.6.18.1/arch/ia64/kernel/time.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/time.c --- ./linux-2.6.18.1/arch/ia64/kernel/time.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/time.c 2007-05-19 23:58:35.000000000 +0900 @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,10 @@ extern unsigned long wall_jiffies; +static cycle_t itc_get_cycles(void); +cycle_t itc_lastcycle __attribute__((aligned(L1_CACHE_BYTES))); +int itc_jitter __attribute__((aligned(L1_CACHE_BYTES))); + volatile int time_keeper_id = 0; /* smp_processor_id() of time-keeper */ #ifdef CONFIG_IA64_DEBUG_IRQ @@ -40,11 +45,16 @@ #endif -static struct time_interpolator itc_interpolator = { - .shift = 16, - .mask = 0xffffffffffffffffLL, - .source = TIME_SOURCE_CPU +static struct clocksource clocksource_itc = { + .name = "itc", + .rating = 350, + .read = itc_get_cycles, + .mask = 0xffffffffffffffffLL, + .mult = 0, /*to be calculated*/ + .shift = 16, + .is_continuous = 1, }; +static struct clocksource *clocksource_itc_p; static irqreturn_t timer_interrupt (int irq, void *dev_id, struct pt_regs *regs) @@ -57,38 +67,57 @@ platform_timer_interrupt(irq, dev_id, regs); +#if 0 new_itm = local_cpu_data->itm_next; if (!time_after(ia64_get_itc(), new_itm)) printk(KERN_ERR "Oops: timer tick before it's due (itc=%lx,itm=%lx)\n", ia64_get_itc(), new_itm); - profile_tick(CPU_PROFILING, regs); +#endif - while (1) { - update_process_times(user_mode(regs)); + if (time_after(ia64_get_itc(), local_cpu_data->itm_tick_next)) { - new_itm += local_cpu_data->itm_delta; + unsigned long new_tick_itm; + new_tick_itm = local_cpu_data->itm_tick_next; - if (smp_processor_id() == time_keeper_id) { - /* - * Here we are in the timer irq handler. We have irqs locally - * disabled, but we don't know if the timer_bh is running on - * another CPU. We need to avoid to SMP race by acquiring the - * xtime_lock. - */ - write_seqlock(&xtime_lock); - do_timer(regs); - local_cpu_data->itm_next = new_itm; - write_sequnlock(&xtime_lock); - } else - local_cpu_data->itm_next = new_itm; + profile_tick(CPU_PROFILING, regs); + + while (1) { + update_process_times(user_mode(regs)); + + new_tick_itm += local_cpu_data->itm_tick_delta; + + if (smp_processor_id() == time_keeper_id) { + /* + * Here we are in the timer irq handler. We have irqs locally + * disabled, but we don't know if the timer_bh is running on + * another CPU. We need to avoid an SMP race by acquiring the + * xtime_lock. + */ + write_seqlock(&xtime_lock); + do_timer(regs); + local_cpu_data->itm_tick_next = new_tick_itm; + write_sequnlock(&xtime_lock); + } else + local_cpu_data->itm_tick_next = new_tick_itm; + + if (time_after(new_tick_itm, ia64_get_itc())) + break; + } + } - if (time_after(new_itm, ia64_get_itc())) - break; + if (time_after(ia64_get_itc(), local_cpu_data->itm_timer_next)) { + if (itc_clockevent.event_handler) + itc_clockevent.event_handler(regs); } do { + // FIXME, really, please + new_itm = local_cpu_data->itm_tick_next; + + if (time_after(new_itm, local_cpu_data->itm_timer_next)) + new_itm = local_cpu_data->itm_timer_next; /* * If we're too close to the next clock tick for * comfort, we increase the safety margin by @@ -98,8 +127,8 @@ * too fast (with the potentially devastating effect * of losing monotony of time). 
*/ - while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_delta/2)) - new_itm += local_cpu_data->itm_delta; + while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_tick_delta/2)) + new_itm += local_cpu_data->itm_tick_delta; ia64_set_itm(new_itm); /* double check, in case we got hit by a (slow) PMI: */ } while (time_after_eq(ia64_get_itc(), new_itm)); @@ -118,7 +147,7 @@ /* arrange for the cycle counter to generate a timer interrupt: */ ia64_set_itv(IA64_TIMER_VECTOR); - delta = local_cpu_data->itm_delta; + delta = local_cpu_data->itm_tick_delta; /* * Stagger the timer tick for each CPU so they don't occur all at (almost) the * same time: @@ -127,8 +156,8 @@ unsigned long hi = 1UL << ia64_fls(cpu); shift = (2*(cpu - hi) + 1) * delta/hi/2; } - local_cpu_data->itm_next = ia64_get_itc() + delta + shift; - ia64_set_itm(local_cpu_data->itm_next); + local_cpu_data->itm_tick_next = ia64_get_itc() + delta + shift; + ia64_set_itm(local_cpu_data->itm_tick_next); } static int nojitter; @@ -186,7 +215,7 @@ itc_freq = (platform_base_freq*itc_ratio.num)/itc_ratio.den; - local_cpu_data->itm_delta = (itc_freq + HZ/2) / HZ; + local_cpu_data->itm_tick_delta = (itc_freq + HZ/2) / HZ; printk(KERN_DEBUG "CPU %d: base freq=%lu.%03luMHz, ITC ratio=%u/%u, " "ITC freq=%lu.%03luMHz", smp_processor_id(), platform_base_freq / 1000000, (platform_base_freq / 1000) % 1000, @@ -206,9 +235,8 @@ local_cpu_data->nsec_per_cyc = ((NSEC_PER_SEC<itc_freq; - itc_interpolator.drift = itc_drift; #ifdef CONFIG_SMP /* On IA64 in an SMP configuration ITCs are never accurately synchronized. * Jitter compensation requires a cmpxchg which may limit * the amount of jitter. Still, this approach may cause time to flow * even going backward) if the ITC offsets between the individual CPUs * are too large. */ - if (!nojitter) itc_interpolator.jitter = 1; + if (!nojitter) itc_jitter = 1; #endif - register_time_interpolator(&itc_interpolator); } +#endif /* Setup the CPU local timer tick */ ia64_cpu_local_tick(); + + if (!clocksource_itc_p) { + /* Sort out mult/shift values: */ + clocksource_itc.mult = clocksource_hz2mult(local_cpu_data->itc_freq, + clocksource_itc.shift); + clocksource_register(&clocksource_itc); + clocksource_itc_p = &clocksource_itc; + } } + +static cycle_t itc_get_cycles() +{ + if (itc_jitter) { + u64 lcycle; + u64 now; + + do { + lcycle = itc_lastcycle; + now = get_cycles(); + if (lcycle && time_after(lcycle, now)) + return lcycle; + + /* When holding the xtime write lock, there's no need + * to add the overhead of the cmpxchg. Readers are + * forced to retry until the write lock is released. + */ + if (spin_is_locked(&xtime_lock.lock)) { + itc_lastcycle = now; + return now; + } + /* Keep track of the last timer value returned. The use of cmpxchg here + * will cause contention in an SMP environment. + */ + } while (unlikely(cmpxchg(&itc_lastcycle, lcycle, now) != lcycle)); + return now; + } else + return get_cycles(); +} + + static struct irqaction timer_irqaction = { .handler = timer_interrupt, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NODELAY, .name = "timer" }; @@ -252,6 +319,8 @@ * tv_nsec field must be normalized (i.e., 0 <= nsec < NSEC_PER_SEC). 
*/ set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); + register_itc_clocksource(); + register_itc_clockevent(); } /* @@ -304,3 +373,10 @@ if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) ia64_printk_clock = ia64_itc_printk_clock; } + +struct clocksource fsyscall_clock __attribute__((aligned(L1_CACHE_BYTES))); + +void update_vsyscall(struct timespec *wall, struct clocksource *c) +{ + fsyscall_clock = *c; +} diff -urN ./linux-2.6.18.1/arch/ia64/kernel/traps.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/traps.c --- ./linux-2.6.18.1/arch/ia64/kernel/traps.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/traps.c 2007-05-19 23:58:35.000000000 +0900 @@ -24,7 +24,7 @@ #include #include -extern spinlock_t timerlist_lock; +extern raw_spinlock_t timerlist_lock; fpswa_interface_t *fpswa_interface; EXPORT_SYMBOL(fpswa_interface); @@ -85,11 +85,11 @@ die (const char *str, struct pt_regs *regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .lock_owner = -1, .lock_owner_depth = 0 }; @@ -226,7 +226,7 @@ * access to fph by the time we get here, as the IVT's "Disabled FP-Register" handler takes * care of clearing psr.dfh. */ -static inline void +void disabled_fph_fault (struct pt_regs *regs) { struct ia64_psr *psr = ia64_psr(regs); @@ -245,7 +245,7 @@ = (struct task_struct *)ia64_get_kr(IA64_KR_FPU_OWNER); if (ia64_is_local_fpu_owner(current)) { - preempt_enable_no_resched(); + __preempt_enable_no_resched(); return; } @@ -265,7 +265,7 @@ */ psr->mfh = 1; } - preempt_enable_no_resched(); + __preempt_enable_no_resched(); } static inline int diff -urN ./linux-2.6.18.1/arch/ia64/kernel/unwind.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/unwind.c --- ./linux-2.6.18.1/arch/ia64/kernel/unwind.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/unwind.c 2007-05-19 23:58:35.000000000 +0900 @@ -81,7 +81,7 @@ typedef unsigned char unw_hash_index_t; static struct { - spinlock_t lock; /* spinlock for unwind data */ + raw_spinlock_t lock; /* spinlock for unwind data */ /* list of unwind tables (one per load-module) */ struct unw_table *tables; @@ -145,7 +145,7 @@ # endif } unw = { .tables = &unw.kernel_table, - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .save_order = { UNW_REG_RP, UNW_REG_PFS, UNW_REG_PSP, UNW_REG_PR, UNW_REG_UNAT, UNW_REG_LC, UNW_REG_FPSR, UNW_REG_PRI_UNAT_GR diff -urN ./linux-2.6.18.1/arch/ia64/kernel/unwind_i.h linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/unwind_i.h --- ./linux-2.6.18.1/arch/ia64/kernel/unwind_i.h 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/kernel/unwind_i.h 2007-05-19 23:58:35.000000000 +0900 @@ -154,7 +154,7 @@ unsigned long ip; /* ip this script is for */ unsigned long pr_mask; /* mask of predicates script depends on */ unsigned long pr_val; /* predicate values this script is for */ - rwlock_t lock; + raw_rwlock_t lock; unsigned int flags; /* see UNW_FLAG_* in unwind.h */ unsigned short lru_chain; /* used for least-recently-used chain */ unsigned short coll_chain; /* used for hash collisions */ diff -urN ./linux-2.6.18.1/arch/ia64/mm/init.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/mm/init.c --- ./linux-2.6.18.1/arch/ia64/mm/init.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/mm/init.c 
2007-05-19 23:58:35.000000000 +0900 @@ -36,7 +36,7 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); DEFINE_PER_CPU(unsigned long *, __pgtable_quicklist); DEFINE_PER_CPU(long, __pgtable_quicklist_size); @@ -92,15 +92,11 @@ if (unlikely(pgtable_quicklist_size <= MIN_PGT_PAGES)) return; - preempt_disable(); while (unlikely((pages_to_free = min_pages_to_free()) > 0)) { while (pages_to_free--) { free_page((unsigned long)pgtable_quicklist_alloc()); } - preempt_enable(); - preempt_disable(); } - preempt_enable(); } void diff -urN ./linux-2.6.18.1/arch/ia64/mm/tlb.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/mm/tlb.c --- ./linux-2.6.18.1/arch/ia64/mm/tlb.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/mm/tlb.c 2007-05-19 23:58:35.000000000 +0900 @@ -32,7 +32,7 @@ } purge; struct ia64_ctx ia64_ctx = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .next = 1, .max_ctx = ~0U }; diff -urN ./linux-2.6.18.1/arch/ia64/pci/pci.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/pci/pci.c --- ./linux-2.6.18.1/arch/ia64/pci/pci.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/pci/pci.c 2007-05-19 23:58:35.000000000 +0900 @@ -809,12 +809,3 @@ } return rc; } - -int pci_vector_resources(int last, int nr_released) -{ - int count = nr_released; - - count += (IA64_LAST_DEVICE_VECTOR - last); - - return count; -} diff -urN ./linux-2.6.18.1/arch/ia64/sn/kernel/sn2/timer.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/sn/kernel/sn2/timer.c --- ./linux-2.6.18.1/arch/ia64/sn/kernel/sn2/timer.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ia64/sn/kernel/sn2/timer.c 2007-05-19 23:58:35.000000000 +0900 @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -22,11 +23,21 @@ extern unsigned long sn_rtc_cycles_per_second; -static struct time_interpolator sn2_interpolator = { - .drift = -1, - .shift = 10, - .mask = (1LL << 55) - 1, - .source = TIME_SOURCE_MMIO64 +static void __iomem *sn2_mc_ptr; + +static cycle_t read_sn2(void) +{ + return (cycle_t)readq(sn2_mc_ptr); +} + +static struct clocksource clocksource_sn2 = { + .name = "sn2_rtc", + .rating = 300, + .read = read_sn2, + .mask = (1LL << 55) - 1, + .mult = 0, + .shift = 10, + .is_continuous = 1, }; /* @@ -47,9 +58,10 @@ void __init sn_timer_init(void) { - sn2_interpolator.frequency = sn_rtc_cycles_per_second; - sn2_interpolator.addr = RTC_COUNTER_ADDR; - register_time_interpolator(&sn2_interpolator); + clocksource_sn2.fsys_mmio_ptr = sn2_mc_ptr = RTC_COUNTER_ADDR; + clocksource_sn2.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, + clocksource_sn2.shift); + clocksource_register(&clocksource_sn2); ia64_udelay = &ia64_sn_udelay; } diff -urN ./linux-2.6.18.1/arch/mips/Kconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/Kconfig --- ./linux-2.6.18.1/arch/mips/Kconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/Kconfig 2007-05-19 23:58:35.000000000 +0900 @@ -417,6 +417,7 @@ config MOMENCO_OCELOT bool "Momentum Ocelot board" select DMA_NONCOHERENT + select NO_SPINLOCK select HW_HAS_PCI select IRQ_CPU select IRQ_CPU_RM7K @@ -837,6 +838,7 @@ endmenu + config RWSEM_GENERIC_SPINLOCK bool default y @@ -844,6 +846,10 @@ config RWSEM_XCHGADD_ALGORITHM bool +config ASM_SEMAPHORES + bool + default y + config GENERIC_FIND_NEXT_BIT bool default y @@ -889,6 +895,9 @@ config OWN_DMA bool +config NO_SPINLOCK + bool + 
config EARLY_PRINTK bool @@ -1843,12 +1852,17 @@ This will result in additional memory usage, so it is not recommended for normal users. -endmenu - -config RWSEM_GENERIC_SPINLOCK +config GENERIC_TIME bool default y +source "kernel/time/Kconfig" + +config CPU_SPEED + int "CPU speed used for clocksource/clockevent calculations" + default 600 +endmenu + source "init/Kconfig" menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)" diff -urN ./linux-2.6.18.1/arch/mips/kernel/Makefile linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/Makefile --- ./linux-2.6.18.1/arch/mips/kernel/Makefile 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/Makefile 2007-05-19 23:58:35.000000000 +0900 @@ -5,7 +5,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y += cpu-probe.o branch.o entry.o genex.o irq.o process.o \ - ptrace.o reset.o semaphore.o setup.o signal.o syscall.o \ + ptrace.o reset.o setup.o signal.o syscall.o \ time.o traps.o unaligned.o binfmt_irix-objs := irixelf.o irixinv.o irixioctl.o irixsig.o \ @@ -15,6 +15,8 @@ obj-$(CONFIG_APM) += apm.o +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o + obj-$(CONFIG_CPU_R3000) += r2300_fpu.o r2300_switch.o obj-$(CONFIG_CPU_TX39XX) += r2300_fpu.o r2300_switch.o obj-$(CONFIG_CPU_TX49XX) += r4k_fpu.o r4k_switch.o diff -urN ./linux-2.6.18.1/arch/mips/kernel/asm-offsets.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/asm-offsets.c --- ./linux-2.6.18.1/arch/mips/kernel/asm-offsets.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/asm-offsets.c 2007-05-19 23:58:35.000000000 +0900 @@ -10,9 +10,11 @@ */ #include #include +#include #include #include #include +#include #include #include diff -urN ./linux-2.6.18.1/arch/mips/kernel/entry.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/entry.S --- ./linux-2.6.18.1/arch/mips/kernel/entry.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/entry.S 2007-05-19 23:58:35.000000000 +0900 @@ -25,7 +25,7 @@ .endm #else .macro preempt_stop - local_irq_disable + raw_local_irq_disable .endm #define resume_kernel restore_all #endif @@ -40,7 +40,7 @@ beqz t0, resume_kernel resume_userspace: - local_irq_disable # make sure we dont miss an + raw_local_irq_disable # make sure we dont miss an # interrupt setting need_resched # between sampling and return LONG_L a2, TI_FLAGS($28) # current->work @@ -50,7 +50,9 @@ #ifdef CONFIG_PREEMPT resume_kernel: - local_irq_disable + raw_local_irq_disable + lw t0, kernel_preemption + beqz t0, restore_all lw t0, TI_PRE_COUNT($28) bnez t0, restore_all need_resched: @@ -60,7 +62,9 @@ LONG_L t0, PT_STATUS(sp) # Interrupts off? 
andi t0, 1 beqz t0, restore_all + raw_local_irq_disable jal preempt_schedule_irq + sw zero, TI_PRE_COUNT($28) b need_resched #endif @@ -68,7 +72,7 @@ jal schedule_tail # a0 = struct task_struct *prev FEXPORT(syscall_exit) - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work @@ -133,19 +137,21 @@ .set at work_pending: - andi t0, a2, _TIF_NEED_RESCHED # a2 is preloaded with TI_FLAGS + # a2 is preloaded with TI_FLAGS + andi t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beqz t0, work_notifysig work_resched: + raw_local_irq_enable t0 jal schedule - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) andi t0, a2, _TIF_WORK_MASK # is there any work to be done # other than syscall tracing? beqz t0, restore_all - andi t0, a2, _TIF_NEED_RESCHED + andi t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bnez t0, work_resched work_notifysig: # deal with pending signals and @@ -161,7 +167,7 @@ li t0, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT and t0, a2 # a2 is preloaded with TI_FLAGS beqz t0, work_pending # trace bit set? - local_irq_enable # could let do_syscall_trace() + raw_local_irq_enable # could let do_syscall_trace() # call schedule() instead move a0, sp li a1, 1 diff -urN ./linux-2.6.18.1/arch/mips/kernel/i8259.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/i8259.c --- ./linux-2.6.18.1/arch/mips/kernel/i8259.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/i8259.c 2007-05-19 23:58:35.000000000 +0900 @@ -31,7 +31,7 @@ * moves to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void end_8259A_irq (unsigned int irq) { diff -urN ./linux-2.6.18.1/arch/mips/kernel/irq.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/irq.c --- ./linux-2.6.18.1/arch/mips/kernel/irq.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/irq.c 2007-05-19 23:58:35.000000000 +0900 @@ -137,7 +137,10 @@ irq_desc[i].action = NULL; irq_desc[i].depth = 1; irq_desc[i].chip = &no_irq_chip; - spin_lock_init(&irq_desc[i].lock); + _raw_spin_lock_init(&irq_desc[i].lock); +#ifdef CONFIG_PREEMPT_HARDIRQS + irq_desc[i].thread = NULL; +#endif #ifdef CONFIG_MIPS_MT_SMTC irq_hwmask[i] = 0; #endif /* CONFIG_MIPS_MT_SMTC */ diff -urN ./linux-2.6.18.1/arch/mips/kernel/module.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/module.c --- ./linux-2.6.18.1/arch/mips/kernel/module.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/module.c 2007-05-19 23:58:35.000000000 +0900 @@ -39,7 +39,7 @@ static struct mips_hi16 *mips_hi16_list; static LIST_HEAD(dbe_list); -static DEFINE_SPINLOCK(dbe_lock); +static DEFINE_RAW_SPINLOCK(dbe_lock); void *module_alloc(unsigned long size) { diff -urN ./linux-2.6.18.1/arch/mips/kernel/process.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/process.c --- ./linux-2.6.18.1/arch/mips/kernel/process.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/process.c 2007-05-19 23:58:35.000000000 +0900 @@ -54,16 +54,18 @@ { /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { #ifdef CONFIG_MIPS_MT_SMTC smtc_idle_loop_hook(); 
#endif /* CONFIG_MIPS_MT_SMTC */ if (cpu_wait) (*cpu_wait)(); } - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } diff -urN ./linux-2.6.18.1/arch/mips/kernel/scall32-o32.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/scall32-o32.S --- ./linux-2.6.18.1/arch/mips/kernel/scall32-o32.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/scall32-o32.S 2007-05-19 23:58:35.000000000 +0900 @@ -84,7 +84,7 @@ 1: sw v0, PT_R2(sp) # result o32_syscall_exit: - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return lw a2, TI_FLAGS($28) # current->work diff -urN ./linux-2.6.18.1/arch/mips/kernel/scall64-64.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/scall64-64.S --- ./linux-2.6.18.1/arch/mips/kernel/scall64-64.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/scall64-64.S 2007-05-19 23:58:35.000000000 +0900 @@ -72,7 +72,7 @@ 1: sd v0, PT_R2(sp) # result n64_syscall_exit: - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work diff -urN ./linux-2.6.18.1/arch/mips/kernel/scall64-n32.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/scall64-n32.S --- ./linux-2.6.18.1/arch/mips/kernel/scall64-n32.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/scall64-n32.S 2007-05-19 23:58:35.000000000 +0900 @@ -69,7 +69,7 @@ sd v0, PT_R0(sp) # set flag for syscall restarting 1: sd v0, PT_R2(sp) # result - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work diff -urN ./linux-2.6.18.1/arch/mips/kernel/scall64-o32.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/scall64-o32.S --- ./linux-2.6.18.1/arch/mips/kernel/scall64-o32.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/scall64-o32.S 2007-05-19 23:58:35.000000000 +0900 @@ -98,7 +98,7 @@ 1: sd v0, PT_R2(sp) # result o32_syscall_exit: - local_irq_disable # make need_resched and + raw_local_irq_disable # make need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) diff -urN ./linux-2.6.18.1/arch/mips/kernel/semaphore.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/semaphore.c --- ./linux-2.6.18.1/arch/mips/kernel/semaphore.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/semaphore.c 2007-05-19 23:58:35.000000000 +0900 @@ -36,7 +36,7 @@ * sem->count and sem->waking atomic. Scalability isn't an issue because * this lock is used on UP only so it's just an empty variable. 
*/ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -67,7 +67,7 @@ : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count) : "r" (incr), "m" (sem->count)); } else { - static DEFINE_SPINLOCK(semaphore_lock); + static DEFINE_RAW_SPINLOCK(semaphore_lock); unsigned long flags; spin_lock_irqsave(&semaphore_lock, flags); @@ -80,7 +80,7 @@ return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -94,7 +94,7 @@ wake_up(&sem->wait); } -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(__compat_up); /* * Note that when we come in to __down or __down_interruptible, @@ -104,7 +104,7 @@ * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. */ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -133,9 +133,9 @@ wake_up(&sem->wait); } -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__compat_down); -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -165,4 +165,10 @@ return retval; } -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_down_interruptible); + +int fastcall compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} +EXPORT_SYMBOL(compat_sem_is_locked); diff -urN ./linux-2.6.18.1/arch/mips/kernel/signal.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/signal.c --- ./linux-2.6.18.1/arch/mips/kernel/signal.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/signal.c 2007-05-19 23:58:35.000000000 +0900 @@ -416,6 +416,10 @@ siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which is why we may in certain * cases get here from kernel mode. Just return without doing anything diff -urN ./linux-2.6.18.1/arch/mips/kernel/signal32.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/signal32.c --- ./linux-2.6.18.1/arch/mips/kernel/signal32.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/signal32.c 2007-05-19 23:58:35.000000000 +0900 @@ -807,6 +807,10 @@ siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which is why we may in certain * cases get here from kernel mode. Just return without doing anything diff -urN ./linux-2.6.18.1/arch/mips/kernel/smp.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/smp.c --- ./linux-2.6.18.1/arch/mips/kernel/smp.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/smp.c 2007-05-19 23:58:35.000000000 +0900 @@ -115,7 +115,22 @@ cpu_idle(); } -DEFINE_SPINLOCK(smp_call_lock); +DEFINE_RAW_SPINLOCK(smp_call_lock); + +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them. 
+ */ +void smp_send_reschedule_allbutself(void) +{ + int cpu = smp_processor_id(); + int i; + + for (i = 0; i < NR_CPUS; i++) + if (cpu_online(i) && i != cpu) + core_send_ipi(i, SMP_RESCHEDULE_YOURSELF); +} struct call_data_struct *call_data; @@ -303,6 +318,8 @@ return 0; } +static DEFINE_RAW_SPINLOCK(tlbstate_lock); + static void flush_tlb_all_ipi(void *info) { local_flush_tlb_all(); @@ -360,6 +377,7 @@ void flush_tlb_mm(struct mm_struct *mm) { preempt_disable(); + spin_lock(&tlbstate_lock); if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { smp_on_other_tlbs(flush_tlb_mm_ipi, (void *)mm); @@ -369,6 +387,7 @@ if (smp_processor_id() != i) cpu_context(i, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_mm(mm); preempt_enable(); @@ -392,6 +411,8 @@ struct mm_struct *mm = vma->vm_mm; preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { struct flush_tlb_data fd; @@ -405,6 +426,7 @@ if (smp_processor_id() != i) cpu_context(i, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_range(vma, start, end); preempt_enable(); } @@ -435,6 +457,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long page) { preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&vma->vm_mm->mm_users) != 1) || (current->mm != vma->vm_mm)) { struct flush_tlb_data fd; @@ -447,6 +471,7 @@ if (smp_processor_id() != i) cpu_context(i, vma->vm_mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_page(vma, page); preempt_enable(); } diff -urN ./linux-2.6.18.1/arch/mips/kernel/time.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/time.c --- ./linux-2.6.18.1/arch/mips/kernel/time.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/time.c 2007-05-19 23:58:35.000000000 +0900 @@ -10,7 +10,13 @@ * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. + * + * This implementation of High Res Timers uses two timers. One is the system + * timer. The second is used for the high res timers. The high res timers + * require the CPU to have count/compare registers. The mips_set_next_event() + * function schedules the next high res timer interrupt. */ +#include #include #include #include @@ -23,6 +29,7 @@ #include #include #include +#include #include #include @@ -49,7 +56,27 @@ */ extern volatile unsigned long wall_jiffies; -DEFINE_SPINLOCK(rtc_lock); +/* any missed timer interrupts */ +int missed_timer_count; + +DEFINE_RAW_SPINLOCK(rtc_lock); + +#ifdef CONFIG_HIGH_RES_TIMERS +static void mips_set_next_event(unsigned long evt); +static void mips_set_mode(int mode, void *priv); + +static struct clock_event lapic_clockevent = { + .name = "mips clockevent interface", + .capabilities = CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE | + CLOCK_HAS_IRQHANDLER +#ifdef CONFIG_SMP + | CLOCK_CAP_UPDATE +#endif + , + .shift = 32, + .set_next_event = mips_set_next_event, +}; +#endif /* * By default we provide the null RTC ops @@ -68,25 +95,36 @@ int (*rtc_mips_set_time)(unsigned long) = null_rtc_set_time; int (*rtc_mips_set_mmss)(unsigned long); - /* usecs per counter cycle, shifted to left by 32 bits */ static unsigned int sll32_usecs_per_cycle; /* how many counter cycles in a jiffy */ static unsigned long cycles_per_jiffy __read_mostly; +static unsigned long hrt_cycles_per_jiffy __read_mostly; + + /* Cycle counter value at the previous timer interrupt.. 
*/ static unsigned int timerhi, timerlo; /* expirelo is the count value for next CPU timer interrupt */ static unsigned int expirelo; - /* * Null timer ack for systems not needing one (e.g. i8254). */ static void null_timer_ack(void) { /* nothing */ } +#ifdef CONFIG_HIGH_RES_TIMERS +/* + * Set the next event + */ +static void mips_set_next_event(unsigned long evt) +{ + write_c0_compare(read_c0_count() + evt); +} +#endif + /* * Null high precision timer functions for systems lacking one. */ @@ -100,7 +138,6 @@ /* nothing */ } - /* * Timer ack for an R4k-compatible timer of a known frequency. */ @@ -110,14 +147,15 @@ #ifndef CONFIG_SOC_PNX8550 /* pnx8550 resets to zero */ /* Ack this timer interrupt and set the next one. */ - expirelo += cycles_per_jiffy; + expirelo += hrt_cycles_per_jiffy; #endif write_c0_compare(expirelo); /* Check to see if we have missed any timer interrupts. */ - while (((count = read_c0_count()) - expirelo) < 0x7fffffff) { - /* missed_timer_count++; */ - expirelo = count + cycles_per_jiffy; + count = read_c0_count(); + if ((count - expirelo) < 0x7fffffff) { + /* missed_timer_count++; */ + expirelo = count + hrt_cycles_per_jiffy; write_c0_compare(expirelo); } } @@ -146,92 +184,39 @@ write_c0_count(count); } -int (*mips_timer_state)(void); -void (*mips_timer_ack)(void); -unsigned int (*mips_hpt_read)(void); -void (*mips_hpt_init)(unsigned int); - - -/* - * This version of gettimeofday has microsecond resolution and better than - * microsecond precision on fast machines with cycle counter. - */ -void do_gettimeofday(struct timeval *tv) +static cycle_t read_mips_hpt(void) { - unsigned long seq; - unsigned long lost; - unsigned long usec, sec; - unsigned long max_ntp_tick; - - do { - seq = read_seqbegin(&xtime_lock); - - usec = do_gettimeoffset(); - - lost = jiffies - wall_jiffies; - - /* - * If time_adjust is negative then NTP is slowing the clock - * so make sure not to go into next possible interval. - * Better to lose some accuracy than have time go backwards.. - */ - if (unlikely(time_adjust < 0)) { - max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj; - usec = min(usec, max_ntp_tick); - - if (lost) - usec += lost * max_ntp_tick; - } else if (unlikely(lost)) - usec += lost * (USEC_PER_SEC / HZ); - - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - - } while (read_seqretry(&xtime_lock, seq)); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; + cycle_t ret; + ret = (cycle_t)mips_hpt_read(); + return ret; } -EXPORT_SYMBOL(do_gettimeofday); +static struct clocksource clocksource_tsc = { + .name = "MIPS", + .rating = 250, + .read = read_mips_hpt, + .mask = 0xffffffff, + .mult = 0, + .shift = 24, + .is_continuous = 1, +}; -int do_settimeofday(struct timespec *tv) +static int __init init_tsc_clocksource(void) { - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); + u64 temp; - /* - * This is revolting. We need to set "xtime" correctly. However, - * the value in this location is the value at the most recent update - * of wall time. Discover what correction gettimeofday() would have - * made, and then undo it! 
- */ - nsec -= do_gettimeoffset() * NSEC_PER_USEC; - nsec -= (jiffies - wall_jiffies) * tick_nsec; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + temp = (u64) 1000000000 << clocksource_tsc.shift; + do_div(temp, mips_hpt_frequency); + clocksource_tsc.mult = (unsigned)temp; - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; + return clocksource_register(&clocksource_tsc); } +module_init(init_tsc_clocksource); -EXPORT_SYMBOL(do_settimeofday); +int (*mips_timer_state)(void); +void (*mips_timer_ack)(void); +unsigned int (*mips_hpt_read)(void); +void (*mips_hpt_init)(unsigned int); /* * Gettimeoffset routines. These routines returns the time duration @@ -250,11 +235,9 @@ return 0; } - /* The function pointer to one of the gettimeoffset funcs. */ unsigned long (*do_gettimeoffset)(void) = null_gettimeoffset; - static unsigned long fixed_rate_gettimeoffset(void) { u32 count; @@ -396,6 +379,29 @@ /* last time when xtime and rtc are sync'ed up */ static long last_rtc_update; +unsigned long read_persistent_clock(void) +{ + unsigned long sec; + sec = rtc_mips_get_time(); + return sec; +} + +void sync_persistent_clock(struct timespec ts) +{ + if (ntp_synced() && + xtime.tv_sec > last_rtc_update + 660 && + (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 && + (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) { + if (rtc_mips_set_mmss(xtime.tv_sec) == 0) { + last_rtc_update = xtime.tv_sec; + } + else { + /* do it again in 60 s */ + last_rtc_update = xtime.tv_sec - 600; + } + } +} + /* * local_timer_interrupt() does profiling and process accounting * on a per-CPU basis. @@ -410,6 +416,7 @@ { if (current->pid) profile_tick(CPU_PROFILING, regs); + update_process_times(user_mode(regs)); } @@ -438,7 +445,7 @@ /* * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. rtc_mips_set_time() has to be + * CMOS clock accordingly every ~11 minutes. rtc_set_time() has to be * called as close as possible to 500 ms before the new second starts. */ if (ntp_synced() && @@ -518,6 +525,15 @@ EXPORT_SYMBOL(null_perf_irq); EXPORT_SYMBOL(perf_irq); +#ifdef CONFIG_HIGH_RES_TIMERS +void event_timer_handler(struct pt_regs *regs) +{ + c0_timer_ack(); + if (lapic_clockevent.event_handler) + lapic_clockevent.event_handler(regs,NULL); +} +#endif + asmlinkage void ll_timer_interrupt(int irq, struct pt_regs *regs) { int r2 = cpu_has_mips_r2; @@ -531,6 +547,15 @@ * performance counter interrupt was pending, so we have to run the * performance counter interrupt handler anyway. */ +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * Run the event handler + */ + if (!r2 || (read_c0_cause() & (1 << 26))) + if (lapic_clockevent.event_handler) + lapic_clockevent.event_handler(regs,NULL); +#endif + if (!r2 || (read_c0_cause() & (1 << 26))) if (perf_irq(regs)) goto out; @@ -563,7 +588,7 @@ * b) (optional) calibrate and set the mips_hpt_frequency * (only needed if you intended to use fixed_rate_gettimeoffset * or use cpu counter as timer interrupt source) - * 2) setup xtime based on rtc_mips_get_time(). + * 2) setup xtime based on rtc_get_time(). * 3) choose a appropriate gettimeoffset routine. 
* 4) calculate a couple of cached variables for later usage * 5) plat_timer_setup() - @@ -578,7 +603,7 @@ static struct irqaction timer_irqaction = { .handler = timer_interrupt, - .flags = IRQF_DISABLED, + .flags = IRQF_NODELAY | IRQF_DISABLED, .name = "timer", }; @@ -627,6 +652,9 @@ void __init time_init(void) { +#ifdef CONFIG_HIGH_RES_TIMERS + u64 temp; +#endif if (board_time_init) board_time_init(); @@ -688,6 +716,12 @@ /* Calculate cache parameters. */ cycles_per_jiffy = (mips_hpt_frequency + HZ / 2) / HZ; +#ifdef CONFIG_HIGH_RES_TIMERS + hrt_cycles_per_jiffy = ( (CONFIG_CPU_SPEED * 1000000) + HZ / 2) / HZ; +#else + hrt_cycles_per_jiffy = cycles_per_jiffy; +#endif + /* sll32_usecs_per_cycle = 10^6 * 2^32 / mips_counter_freq */ do_div64_32(sll32_usecs_per_cycle, 1000000, mips_hpt_frequency / 2, @@ -776,3 +810,128 @@ { return (unsigned long long)jiffies*(1000000000/HZ); } + + +#ifdef CONFIG_SMP +/* + * We have to synchronize the master CPU with all the slave CPUs + */ +static atomic_t cpus_started; +static atomic_t cpus_ready; +static atomic_t cpus_count; +/* + * Master processor inits + */ +static void sync_cpus_init(int v) +{ + atomic_set(&cpus_count, 0); + mb(); + atomic_set(&cpus_started, v); + mb(); + atomic_set(&cpus_ready, v); + mb(); +} + +/* + * Called by the master processor + */ +static void sync_cpus_master(int v) +{ + atomic_set(&cpus_count, 0); + mb(); + atomic_set(&cpus_started, v); + mb(); + /* Wait here till all other CPUs are now ready */ + while (atomic_read(&cpus_count) != (num_online_cpus() -1) ) + mb(); + atomic_set(&cpus_ready, v); + mb(); +} +/* + * Called by the slave processors + */ +static void sync_cpus_slave(int v) +{ + /* Check if the master has been through this */ + while (atomic_read(&cpus_started) != v) + mb(); + atomic_inc(&cpus_count); + mb(); + while (atomic_read(&cpus_ready) != v) + mb(); +} +/* + * Called by the slave CPUs when done syncing the count register + * with the master processor + */ +static void sync_cpus_slave_exit(int v) +{ + while (atomic_read(&cpus_started) != v) + mb(); + atomic_inc(&cpus_count); + mb(); +} + +#define LOOPS 100 +static u32 c0_count[NR_CPUS]; /* Count register per CPU */ +static u32 c[NR_CPUS][LOOPS + 1]; /* Count register per CPU per loop for syncing */ + +/* + * Slave processors execute this via IPI + */ +static void sync_c0_count_slave(void *info) +{ + int cpus = 1, loop, prev_count = 0, cpu = smp_processor_id(); + unsigned long flags; + u32 diff_count; /* CPU count registers are 32-bit */ + local_irq_save(flags); + + for(loop = 0; loop <= LOOPS; loop++) { + /* Sync with the Master processor */ + sync_cpus_slave(cpus++); + c[cpu][loop] = c0_count[cpu] = read_c0_count(); + mb(); + sync_cpus_slave(cpus++); + diff_count = c0_count[0] - c0_count[cpu]; + diff_count += prev_count; + diff_count += read_c0_count(); + write_c0_count(diff_count); + prev_count = (prev_count >> 1) + + ((int)(c0_count[0] - c0_count[cpu]) >> 1); + } + + /* Slave processor is done syncing count register with Master */ + sync_cpus_slave_exit(cpus++); + printk("SMP: Slave processor %d done syncing count \n", cpu); + local_irq_restore(flags); +} + +/* + * Master kicks off the syncing process + */ +void sync_c0_count_master(void) +{ + int cpus = 0, loop, cpu = smp_processor_id(); + unsigned long flags; + + printk("SMP: Starting to sync the c0 count register ... 
\n"); + sync_cpus_init(cpus++); + + /* Kick off the slave processors to also start the syncing process */ + smp_call_function(sync_c0_count_slave, NULL, 0, 0); + local_irq_save(flags); + + for (loop = 0; loop <= LOOPS; loop++) { + /* Wait for all the CPUs here */ + sync_cpus_master(cpus++); + c[cpu][loop] = c0_count[cpu] = read_c0_count(); + mb(); + /* Do syncing once more */ + sync_cpus_master(cpus++); + } + sync_cpus_master(cpus++); + local_irq_restore(flags); + + printk("SMP: Syncing process completed accross CPUs ... \n"); +} +#endif /* CONFIG_SMP */ diff -urN ./linux-2.6.18.1/arch/mips/kernel/traps.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/traps.c --- ./linux-2.6.18.1/arch/mips/kernel/traps.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/kernel/traps.c 2007-05-19 23:58:35.000000000 +0900 @@ -274,7 +274,7 @@ printk("\n"); } -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); NORET_TYPE void ATTRIB_NORET die(const char * str, struct pt_regs * regs) { diff -urN ./linux-2.6.18.1/arch/mips/mm/init.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/mm/init.c --- ./linux-2.6.18.1/arch/mips/mm/init.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/mm/init.c 2007-05-19 23:58:35.000000000 +0900 @@ -36,7 +36,7 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; diff -urN ./linux-2.6.18.1/arch/mips/sibyte/cfe/smp.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/sibyte/cfe/smp.c --- ./linux-2.6.18.1/arch/mips/sibyte/cfe/smp.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/sibyte/cfe/smp.c 2007-05-19 23:58:35.000000000 +0900 @@ -107,4 +107,8 @@ */ void prom_cpus_done(void) { +#ifdef CONFIG_HIGH_RES_TIMERS + extern void sync_c0_count_master(void); + sync_c0_count_master(); +#endif } diff -urN ./linux-2.6.18.1/arch/mips/sibyte/sb1250/irq.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/sibyte/sb1250/irq.c --- ./linux-2.6.18.1/arch/mips/sibyte/sb1250/irq.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/sibyte/sb1250/irq.c 2007-05-19 23:58:35.000000000 +0900 @@ -85,7 +85,7 @@ /* Store the CPU id (not the logical number) */ int sb1250_irq_owner[SB1250_NR_IRQS]; -DEFINE_SPINLOCK(sb1250_imr_lock); +DEFINE_RAW_SPINLOCK(sb1250_imr_lock); void sb1250_mask_irq(int cpu, int irq) { @@ -262,7 +262,7 @@ static struct irqaction sb1250_dummy_action = { .handler = sb1250_dummy_handler, - .flags = 0, + .flags = IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "sb1250-private", .next = NULL, @@ -372,6 +372,10 @@ #ifdef CONFIG_KGDB imask |= STATUSF_IP6; #endif + +#ifdef CONFIG_HIGH_RES_TIMERS + imask |= STATUSF_IP7; +#endif /* Enable necessary IPs, disable the rest */ change_c0_status(ST0_IM, imask); @@ -465,6 +469,10 @@ else #endif +#ifdef CONFIG_HIGH_RES_TIMERS + if (pending & CAUSEF_IP7) + event_timer_handler(regs); +#endif if (pending & CAUSEF_IP4) sb1250_timer_interrupt(regs); diff -urN ./linux-2.6.18.1/arch/mips/sibyte/sb1250/smp.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/sibyte/sb1250/smp.c --- ./linux-2.6.18.1/arch/mips/sibyte/sb1250/smp.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/sibyte/sb1250/smp.c 2007-05-19 23:58:35.000000000 +0900 @@ -59,7 +59,7 @@ { extern void sb1250_time_init(void); sb1250_time_init(); - local_irq_enable(); + raw_local_irq_enable(); } /* diff 
-urN ./linux-2.6.18.1/arch/mips/sibyte/swarm/setup.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/sibyte/swarm/setup.c --- ./linux-2.6.18.1/arch/mips/sibyte/swarm/setup.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/mips/sibyte/swarm/setup.c 2007-05-19 23:58:35.000000000 +0900 @@ -131,6 +131,12 @@ rtc_mips_set_time = m41t81_set_time; } +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * set the mips_hpt_frequency here + */ + mips_hpt_frequency = CONFIG_CPU_SPEED * 1000000; +#endif printk("This kernel optimized for " #ifdef CONFIG_SIMULATION "simulation" diff -urN ./linux-2.6.18.1/arch/powerpc/Kconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/Kconfig --- ./linux-2.6.18.1/arch/powerpc/Kconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/Kconfig 2007-05-19 23:58:35.000000000 +0900 @@ -26,18 +26,15 @@ bool default y -config GENERIC_HARDIRQS +config GENERIC_TIME bool default y -config IRQ_PER_CPU +config GENERIC_HARDIRQS bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM +config IRQ_PER_CPU bool default y @@ -596,6 +593,18 @@ source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "fs/Kconfig.binfmt" # We optimistically allocate largepages from the VM, so make the limit diff -urN ./linux-2.6.18.1/arch/powerpc/boot/Makefile linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/boot/Makefile --- ./linux-2.6.18.1/arch/powerpc/boot/Makefile 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/boot/Makefile 2007-05-19 23:58:35.000000000 +0900 @@ -29,6 +29,14 @@ OBJCOPY_COFF_ARGS := -O aixcoff-rs6000 --set-start 0x500000 OBJCOPY_MIB_ARGS := -O aixcoff-rs6000 -R .stab -R .stabstr -R .comment +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +CFLAGS := $(subst ${pg_flag},${space},${CFLAGS}) +endif + zlib := inffast.c inflate.c inftrees.c zlibheader := inffast.h inffixed.h inflate.h inftrees.h infutil.h zliblinuxheader := zlib.h zconf.h zutil.h @@ -44,7 +52,7 @@ BOOTCFLAGS += -I$(obj) -I$(srctree)/$(obj) quiet_cmd_copy_zlib = COPY $@ - cmd_copy_zlib = sed "s@__attribute_used__@@;s@]\+\).*@\"\1\"@" $< > $@ + cmd_copy_zlib = sed "s@__attribute_used__@@;s@.include.@@;s@.include.@@;s@.*spin.*lock.*@@;s@.*SPINLOCK.*@@;s@]\+\).*@\"\1\"@" $< > $@ quiet_cmd_copy_zlibheader = COPY $@ cmd_copy_zlibheader = sed "s@]\+\).*@\"\1\"@" $< > $@ diff -urN ./linux-2.6.18.1/arch/powerpc/kernel/Makefile linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/kernel/Makefile --- ./linux-2.6.18.1/arch/powerpc/kernel/Makefile 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/kernel/Makefile 2007-05-19 23:58:35.000000000 +0900 @@ -10,10 +10,11 @@ CFLAGS_btext.o += -fPIC endif -obj-y := semaphore.o cputable.o ptrace.o syscalls.o \ +obj-y := cputable.o ptrace.o syscalls.o \ irq.o align.o signal_32.o pmc.o vdso.o \ init_task.o process.o systbl.o idle.o obj-y += vdso32/ +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-$(CONFIG_PPC64) += setup_64.o binfmt_elf32.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o cpu_setup_power4.o \ diff -urN ./linux-2.6.18.1/arch/powerpc/kernel/entry_32.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/kernel/entry_32.S --- ./linux-2.6.18.1/arch/powerpc/kernel/entry_32.S 2006-10-14 
12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/kernel/entry_32.S 2007-05-19 23:58:35.000000000 +0900 @@ -638,7 +638,7 @@ /* Check current_thread_info()->flags */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED) + andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne do_work restore_user: @@ -856,7 +856,7 @@ #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -870,7 +870,7 @@ MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. r0,r9,_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK beq restore_user @@ -978,3 +978,85 @@ /* XXX load up BATs and panic */ #endif /* CONFIG_PPC_RTAS */ + +#ifdef CONFIG_MCOUNT +/* + * mcount() is not the same as _mcount(). The callers of mcount() have a + * normal context. The callers of _mcount() do not have a stack frame and + * have not saved the "caller saves" registers. + */ +_GLOBAL(mcount) + stwu r1,-16(r1) + mflr r3 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + stw r3,20(r1) + cmpwi r5,0 + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,16(r1) + lwz r4,4(r4) + bl __trace +1: + lwz r0,20(r1) + mtlr r0 + addi r1,r1,16 + blr + +/* + * The -pg flag, which is specified in the case of CONFIG_MCOUNT, causes the + * C compiler to add a call to _mcount() at the start of each function + * preamble, before the stack frame is created. An example of this preamble + * code is: + * + * mflr r0 + * lis r12,-16354 + * stw r0,4(r1) + * addi r0,r12,-19652 + * bl 0xc00034c8 <_mcount> + * mflr r0 + * stwu r1,-16(r1) + */ +_GLOBAL(_mcount) +#define M_STK_SIZE 48 + /* Would not expect to need to save cr, but glibc version of */ + /* _mcount() does, so cautiously saving it here too. 
*/ + stwu r1,-M_STK_SIZE(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 /* will use as first arg to __trace() */ + mfcr r4 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + cmpwi r5,0 + stw r3, 44(r1) /* lr */ + stw r4, 8(r1) /* cr */ + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,M_STK_SIZE+4(r1) + bl __trace +1: + lwz r8, 8(r1) /* cr */ + lwz r9, 44(r1) /* lr */ + lwz r3, 12(r1) + lwz r4, 16(r1) + lwz r5, 20(r1) + mtcrf 0xff,r8 + mtctr r9 + lwz r0, 52(r1) + lwz r6, 24(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1,r1,M_STK_SIZE + mtlr r0 + bctr + +#endif /* CONFIG_MCOUNT */ diff -urN ./linux-2.6.18.1/arch/powerpc/kernel/irq.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/kernel/irq.c --- ./linux-2.6.18.1/arch/powerpc/kernel/irq.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/kernel/irq.c 2007-05-19 23:58:35.000000000 +0900 @@ -91,8 +91,6 @@ #endif #ifdef CONFIG_PPC64 -EXPORT_SYMBOL(irq_desc); - int distribute_irqs = 1; #endif /* CONFIG_PPC64 */ diff -urN ./linux-2.6.18.1/arch/powerpc/kernel/ppc_ksyms.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/kernel/ppc_ksyms.c --- ./linux-2.6.18.1/arch/powerpc/kernel/ppc_ksyms.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/kernel/ppc_ksyms.c 2007-05-19 23:58:35.000000000 +0900 @@ -16,7 +16,6 @@ #include #include -#include #include #include #include @@ -189,7 +188,6 @@ #ifdef CONFIG_PPC32 EXPORT_SYMBOL(timer_interrupt); -EXPORT_SYMBOL(irq_desc); EXPORT_SYMBOL(tb_ticks_per_jiffy); EXPORT_SYMBOL(console_drivers); EXPORT_SYMBOL(cacheable_memcpy); diff -urN ./linux-2.6.18.1/arch/powerpc/kernel/semaphore.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/kernel/semaphore.c --- ./linux-2.6.18.1/arch/powerpc/kernel/semaphore.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/kernel/semaphore.c 2007-05-19 23:58:35.000000000 +0900 @@ -31,7 +31,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -50,7 +50,7 @@ return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -63,7 +63,7 @@ __sem_update_count(sem, 1); wake_up(&sem->wait); } -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(__compat_up); /* * Note that when we come in to __down or __down_interruptible, @@ -73,7 +73,7 @@ * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. 
*/ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -101,9 +101,9 @@ */ wake_up(&sem->wait); } -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__compat_down); -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore *sem) { int retval = 0; struct task_struct *tsk = current; @@ -132,4 +132,10 @@ wake_up(&sem->wait); return retval; } -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_down_interruptible); + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} +EXPORT_SYMBOL(compat_sem_is_locked); diff -urN ./linux-2.6.18.1/arch/powerpc/kernel/smp.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/kernel/smp.c --- ./linux-2.6.18.1/arch/powerpc/kernel/smp.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/kernel/smp.c 2007-05-19 23:58:35.000000000 +0900 @@ -148,6 +148,16 @@ smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + smp_ops->message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE); +} + #ifdef CONFIG_DEBUGGER void smp_send_debugger_break(int cpu) { @@ -184,7 +194,7 @@ * static memory requirements. It also looks cleaner. * Stolen from the i386 version. */ -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock); +static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(call_lock); static struct call_data_struct { void (*func) (void *info); diff -urN ./linux-2.6.18.1/arch/powerpc/kernel/time.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/kernel/time.c --- ./linux-2.6.18.1/arch/powerpc/kernel/time.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/kernel/time.c 2007-05-19 23:58:35.000000000 +0900 @@ -73,6 +73,9 @@ #endif #include +unsigned long cpu_khz; /* Detected as we calibrate the TSC */ +EXPORT_SYMBOL(cpu_khz); + /* keep track of when we need to update the rtc */ time_t last_rtc_update; #ifdef CONFIG_PPC_ISERIES @@ -115,8 +118,6 @@ u64 tb_to_ns_scale; unsigned tb_to_ns_shift; -struct gettimeofday_struct do_gtod; - extern unsigned long wall_jiffies; extern struct timezone sys_tz; @@ -407,162 +408,8 @@ } } -/* - * This version of gettimeofday has microsecond resolution. - */ -static inline void __do_gettimeofday(struct timeval *tv) -{ - unsigned long sec, usec; - u64 tb_ticks, xsec; - struct gettimeofday_vars *temp_varp; - u64 temp_tb_to_xs, temp_stamp_xsec; - - /* - * These calculations are faster (gets rid of divides) - * if done in units of 1/2^20 rather than microseconds. - * The conversion to microseconds at the end is done - * without a divide (and in fact, without a multiply) - */ - temp_varp = do_gtod.varp; - - /* Sampling the time base must be done after loading - * do_gtod.varp in order to avoid racing with update_gtod. 
- */ - data_barrier(temp_varp); - tb_ticks = get_tb() - temp_varp->tb_orig_stamp; - temp_tb_to_xs = temp_varp->tb_to_xs; - temp_stamp_xsec = temp_varp->stamp_xsec; - xsec = temp_stamp_xsec + mulhdu(tb_ticks, temp_tb_to_xs); - sec = xsec / XSEC_PER_SEC; - usec = (unsigned long)xsec & (XSEC_PER_SEC - 1); - usec = SCALE_XSEC(usec, 1000000); - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -void do_gettimeofday(struct timeval *tv) -{ - if (__USE_RTC()) { - /* do this the old way */ - unsigned long flags, seq; - unsigned int sec, nsec, usec; - - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - sec = xtime.tv_sec; - nsec = xtime.tv_nsec + tb_ticks_since(tb_last_jiffy); - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - usec = nsec / 1000; - while (usec >= 1000000) { - usec -= 1000000; - ++sec; - } - tv->tv_sec = sec; - tv->tv_usec = usec; - return; - } - __do_gettimeofday(tv); -} - -EXPORT_SYMBOL(do_gettimeofday); - -/* - * There are two copies of tb_to_xs and stamp_xsec so that no - * lock is needed to access and use these values in - * do_gettimeofday. We alternate the copies and as long as a - * reasonable time elapses between changes, there will never - * be inconsistent values. ntpd has a minimum of one minute - * between updates. - */ -static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec, - u64 new_tb_to_xs) -{ - unsigned temp_idx; - struct gettimeofday_vars *temp_varp; - - temp_idx = (do_gtod.var_idx == 0); - temp_varp = &do_gtod.vars[temp_idx]; - - temp_varp->tb_to_xs = new_tb_to_xs; - temp_varp->tb_orig_stamp = new_tb_stamp; - temp_varp->stamp_xsec = new_stamp_xsec; - smp_mb(); - do_gtod.varp = temp_varp; - do_gtod.var_idx = temp_idx; - - /* - * tb_update_count is used to allow the userspace gettimeofday code - * to assure itself that it sees a consistent view of the tb_to_xs and - * stamp_xsec variables. It reads the tb_update_count, then reads - * tb_to_xs and stamp_xsec and then reads tb_update_count again. If - * the two values of tb_update_count match and are even then the - * tb_to_xs and stamp_xsec values are consistent. If not, then it - * loops back and reads them again until this criteria is met. - * We expect the caller to have done the first increment of - * vdso_data->tb_update_count already. - */ - vdso_data->tb_orig_stamp = new_tb_stamp; - vdso_data->stamp_xsec = new_stamp_xsec; - vdso_data->tb_to_xs = new_tb_to_xs; - vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec; - vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec; - smp_wmb(); - ++(vdso_data->tb_update_count); -} - -/* - * When the timebase - tb_orig_stamp gets too big, we do a manipulation - * between tb_orig_stamp and stamp_xsec. The goal here is to keep the - * difference tb - tb_orig_stamp small enough to always fit inside a - * 32 bits number. This is a requirement of our fast 32 bits userland - * implementation in the vdso. If we "miss" a call to this function - * (interrupt latency, CPU locked in a spinlock, ...) 
and we end up - * with a too big difference, then the vdso will fallback to calling - * the syscall - */ -static __inline__ void timer_recalc_offset(u64 cur_tb) -{ - unsigned long offset; - u64 new_stamp_xsec; - u64 tlen, t2x; - u64 tb, xsec_old, xsec_new; - struct gettimeofday_vars *varp; - - if (__USE_RTC()) - return; - tlen = current_tick_length(); - offset = cur_tb - do_gtod.varp->tb_orig_stamp; - if (tlen == last_tick_len && offset < 0x80000000u) - return; - if (tlen != last_tick_len) { - t2x = mulhdu(tlen << TICKLEN_SHIFT, ticklen_to_xs); - last_tick_len = tlen; - } else - t2x = do_gtod.varp->tb_to_xs; - new_stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC; - do_div(new_stamp_xsec, 1000000000); - new_stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC; - - ++vdso_data->tb_update_count; - smp_mb(); - - /* - * Make sure time doesn't go backwards for userspace gettimeofday. - */ - tb = get_tb(); - varp = do_gtod.varp; - xsec_old = mulhdu(tb - varp->tb_orig_stamp, varp->tb_to_xs) - + varp->stamp_xsec; - xsec_new = mulhdu(tb - cur_tb, t2x) + new_stamp_xsec; - if (xsec_new < xsec_old) - new_stamp_xsec += xsec_old - xsec_new; - - update_gtod(cur_tb, new_stamp_xsec, t2x); -} - #ifdef CONFIG_SMP -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -610,11 +457,7 @@ tb_ticks_per_sec = new_tb_ticks_per_sec; calc_cputime_factors(); div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres ); - do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; tb_to_xs = divres.result_low; - do_gtod.varp->tb_to_xs = tb_to_xs; - vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; - vdso_data->tb_to_xs = tb_to_xs; } else { printk( "Titan recalibrate: FAILED (difference > 4 percent)\n" @@ -781,81 +624,6 @@ return mulhdu(get_tb(), tb_to_ns_scale) << tb_to_ns_shift; } -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, new_sec = tv->tv_sec; - long wtm_nsec, new_nsec = tv->tv_nsec; - unsigned long flags; - u64 new_xsec; - unsigned long tb_delta; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irqsave(&xtime_lock, flags); - - /* - * Updating the RTC is not the job of this code. If the time is - * stepped under NTP, the RTC will be updated after STA_UNSYNC - * is cleared. Tools like clock/hwclock either copy the RTC - * to the system time, in which case there is no point in writing - * to the RTC again, or write to the RTC but then they don't call - * settimeofday to perform this operation. - */ -#ifdef CONFIG_PPC_ISERIES - if (first_settimeofday) { - iSeries_tb_recal(); - first_settimeofday = 0; - } -#endif - - /* Make userspace gettimeofday spin until we're done. */ - ++vdso_data->tb_update_count; - smp_mb(); - - /* - * Subtract off the number of nanoseconds since the - * beginning of the last tick. - * Note that since we don't increment jiffies_64 anywhere other - * than in do_timer (since we don't have a lost tick problem), - * wall_jiffies will always be the same as jiffies, - * and therefore the (jiffies - wall_jiffies) computation - * has been removed. 
- */ - tb_delta = tb_ticks_since(tb_last_jiffy); - tb_delta = mulhdu(tb_delta, do_gtod.varp->tb_to_xs); /* in xsec */ - new_nsec -= SCALE_XSEC(tb_delta, 1000000000); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - new_sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - new_nsec); - - set_normalized_timespec(&xtime, new_sec, new_nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - /* In case of a large backwards jump in time with NTP, we want the - * clock to be updated as soon as the PLL is again in lock. - */ - last_rtc_update = new_sec - 658; - - ntp_clear(); - - new_xsec = xtime.tv_nsec; - if (new_xsec != 0) { - new_xsec *= XSEC_PER_SEC; - do_div(new_xsec, NSEC_PER_SEC); - } - new_xsec += (u64)xtime.tv_sec * XSEC_PER_SEC; - update_gtod(tb_last_jiffy, new_xsec, do_gtod.varp->tb_to_xs); - - vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; - vdso_data->tz_dsttime = sys_tz.tz_dsttime; - - write_sequnlock_irqrestore(&xtime_lock, flags); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); static int __init get_freq(char *name, int cells, unsigned long *val) { @@ -1024,20 +792,6 @@ xtime.tv_sec = tm; xtime.tv_nsec = 0; - do_gtod.varp = &do_gtod.vars[0]; - do_gtod.var_idx = 0; - do_gtod.varp->tb_orig_stamp = tb_last_jiffy; - __get_cpu_var(last_jiffy) = tb_last_jiffy; - do_gtod.varp->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC; - do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; - do_gtod.varp->tb_to_xs = tb_to_xs; - do_gtod.tb_to_us = tb_to_us; - - vdso_data->tb_orig_stamp = tb_last_jiffy; - vdso_data->tb_update_count = 0; - vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; - vdso_data->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC; - vdso_data->tb_to_xs = tb_to_xs; time_freq = 0; @@ -1050,7 +804,6 @@ set_dec(tb_ticks_per_jiffy); } - #define FEBRUARY 2 #define STARTOFTIME 1970 #define SECDAY 86400L @@ -1195,3 +948,36 @@ dr->result_low = ((u64)y << 32) + z; } + + +/* powerpc clocksource code */ + +#include +static cycle_t timebase_read(void) +{ + return (cycle_t)get_tb(); +} + +struct clocksource clocksource_timebase = { + .name = "timebase", + .rating = 200, + .read = timebase_read, + .mask = (cycle_t)-1, + .mult = 0, + .shift = 22, +}; + + +/* XXX - this should be calculated or properly externed! 
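+ * As a rough sketch of what "calculated" would mean here:
+ * clocksource_hz2mult(hz, shift) evaluates approximately
+ *
+ *	mult = (NSEC_PER_SEC << shift) / hz
+ *
+ * so the timekeeping core converts timebase deltas with
+ * ns = (cycles * mult) >> shift; the shift of 22 above is presumably
+ * the hard-coded guess being complained about.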
*/ +static int __init init_timebase_clocksource(void) +{ + if (__USE_RTC()) + return -ENODEV; + + clocksource_timebase.mult = clocksource_hz2mult(tb_ticks_per_sec, + clocksource_timebase.shift); + return clocksource_register(&clocksource_timebase); +} + +module_init(init_timebase_clocksource); + diff -urN ./linux-2.6.18.1/arch/powerpc/kernel/traps.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/kernel/traps.c --- ./linux-2.6.18.1/arch/powerpc/kernel/traps.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/kernel/traps.c 2007-05-19 23:58:35.000000000 +0900 @@ -93,7 +93,7 @@ * Trap & Exception support */ -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); int die(const char *str, struct pt_regs *regs, long err) { @@ -164,6 +164,11 @@ return; } +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + memset(&info, 0, sizeof(info)); info.si_signo = signr; info.si_code = code; diff -urN ./linux-2.6.18.1/arch/powerpc/lib/locks.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/lib/locks.c --- ./linux-2.6.18.1/arch/powerpc/lib/locks.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/lib/locks.c 2007-05-19 23:58:35.000000000 +0900 @@ -24,7 +24,7 @@ #include #include -void __spin_yield(raw_spinlock_t *lock) +void __spin_yield(__raw_spinlock_t *lock) { unsigned int lock_value, holder_cpu, yield_count; @@ -79,7 +79,7 @@ } #endif -void __raw_spin_unlock_wait(raw_spinlock_t *lock) +void __raw_spin_unlock_wait(__raw_spinlock_t *lock) { while (lock->slock) { HMT_low(); diff -urN ./linux-2.6.18.1/arch/powerpc/mm/fault.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/mm/fault.c --- ./linux-2.6.18.1/arch/powerpc/mm/fault.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/mm/fault.c 2007-05-19 23:58:35.000000000 +0900 @@ -149,8 +149,8 @@ * The return value is 0 if the fault was handled, or the signal * number if this is a kernel fault that can't be handled here. 
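 * (The notrace annotation added below is presumably needed because,
 * with CONFIG_MCOUNT, an instrumented fault handler could recurse into
 * the tracer while the tracer itself touches pageable data.)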
*/ -int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) +int __kprobes notrace do_page_fault(struct pt_regs *regs, + unsigned long address, unsigned long error_code) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; diff -urN ./linux-2.6.18.1/arch/powerpc/mm/init_32.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/mm/init_32.c --- ./linux-2.6.18.1/arch/powerpc/mm/init_32.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/mm/init_32.c 2007-05-19 23:58:35.000000000 +0900 @@ -56,7 +56,7 @@ #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long total_memory; unsigned long total_lowmem; diff -urN ./linux-2.6.18.1/arch/powerpc/mm/tlb_64.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/mm/tlb_64.c --- ./linux-2.6.18.1/arch/powerpc/mm/tlb_64.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/mm/tlb_64.c 2007-05-19 23:58:35.000000000 +0900 @@ -37,7 +37,7 @@ /* This is declared as we are using the more or less generic * include/asm-powerpc/tlb.h file -- tgall */ -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); unsigned long pte_freelist_forced_free; diff -urN ./linux-2.6.18.1/arch/powerpc/platforms/cell/smp.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/cell/smp.c --- ./linux-2.6.18.1/arch/powerpc/platforms/cell/smp.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/cell/smp.c 2007-05-19 23:58:35.000000000 +0900 @@ -133,7 +133,7 @@ iic_setup_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned long timebase = 0; static void __devinit cell_give_timebase(void) diff -urN ./linux-2.6.18.1/arch/powerpc/platforms/chrp/smp.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/chrp/smp.c --- ./linux-2.6.18.1/arch/powerpc/platforms/chrp/smp.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/chrp/smp.c 2007-05-19 23:58:35.000000000 +0900 @@ -45,7 +45,7 @@ mpic_setup_this_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; void __devinit smp_chrp_give_timebase(void) diff -urN ./linux-2.6.18.1/arch/powerpc/platforms/chrp/time.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/chrp/time.c --- ./linux-2.6.18.1/arch/powerpc/platforms/chrp/time.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/chrp/time.c 2007-05-19 23:58:35.000000000 +0900 @@ -27,7 +27,7 @@ #include #include -extern spinlock_t rtc_lock; +extern raw_spinlock_t rtc_lock; static int nvram_as1 = NVRAM_AS1; static int nvram_as0 = NVRAM_AS0; diff -urN ./linux-2.6.18.1/arch/powerpc/platforms/iseries/setup.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/iseries/setup.c --- ./linux-2.6.18.1/arch/powerpc/platforms/iseries/setup.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/iseries/setup.c 2007-05-19 23:58:35.000000000 +0900 @@ -594,12 +594,14 @@ static void iseries_shared_idle(void) { while (1) { - while (!need_resched() && !hvlpevent_is_pending()) { + while (!need_resched() && 
!need_resched_delayed() + && !hvlpevent_is_pending()) { local_irq_disable(); ppc64_runlatch_off(); /* Recheck with irqs off */ - if (!need_resched() && !hvlpevent_is_pending()) + if (!need_resched() && !need_resched_delayed() + && !hvlpevent_is_pending()) yield_shared_processor(); HMT_medium(); diff -urN ./linux-2.6.18.1/arch/powerpc/platforms/powermac/feature.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/powermac/feature.c --- ./linux-2.6.18.1/arch/powerpc/platforms/powermac/feature.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/powermac/feature.c 2007-05-19 23:58:35.000000000 +0900 @@ -59,7 +59,7 @@ * We use a single global lock to protect accesses. Each driver has * to take care of its own locking */ -DEFINE_SPINLOCK(feature_lock); +DEFINE_RAW_SPINLOCK(feature_lock); #define LOCK(flags) spin_lock_irqsave(&feature_lock, flags); #define UNLOCK(flags) spin_unlock_irqrestore(&feature_lock, flags); diff -urN ./linux-2.6.18.1/arch/powerpc/platforms/powermac/nvram.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/powermac/nvram.c --- ./linux-2.6.18.1/arch/powerpc/platforms/powermac/nvram.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/powermac/nvram.c 2007-05-19 23:58:35.000000000 +0900 @@ -80,7 +80,7 @@ static int core99_bank = 0; static int nvram_partitions[3]; // XXX Turn that into a sem -static DEFINE_SPINLOCK(nv_lock); +static DEFINE_RAW_SPINLOCK(nv_lock); static int (*core99_write_bank)(int bank, u8* datas); static int (*core99_erase_bank)(int bank); diff -urN ./linux-2.6.18.1/arch/powerpc/platforms/powermac/pic.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/powermac/pic.c --- ./linux-2.6.18.1/arch/powerpc/platforms/powermac/pic.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/powermac/pic.c 2007-05-19 23:58:35.000000000 +0900 @@ -63,7 +63,7 @@ static int max_real_irqs; static u32 level_mask[4]; -static DEFINE_SPINLOCK(pmac_pic_lock); +static DEFINE_RAW_SPINLOCK(pmac_pic_lock); #define NR_MASK_WORDS ((NR_IRQS + 31) / 32) static unsigned long ppc_lost_interrupts[NR_MASK_WORDS]; diff -urN ./linux-2.6.18.1/arch/powerpc/platforms/pseries/setup.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/pseries/setup.c --- ./linux-2.6.18.1/arch/powerpc/platforms/pseries/setup.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/pseries/setup.c 2007-05-19 23:58:35.000000000 +0900 @@ -483,7 +483,8 @@ set_thread_flag(TIF_POLLING_NRFLAG); while (get_tb() < start_snooze) { - if (need_resched() || cpu_is_offline(cpu)) + if (need_resched() || need_resched_delayed() || + cpu_is_offline(cpu)) goto out; ppc64_runlatch_off(); HMT_low(); @@ -494,7 +495,8 @@ clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb(); local_irq_disable(); - if (need_resched() || cpu_is_offline(cpu)) + if (need_resched() || need_resched_delayed() || + cpu_is_offline(cpu)) goto out; } diff -urN ./linux-2.6.18.1/arch/powerpc/platforms/pseries/smp.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/pseries/smp.c --- ./linux-2.6.18.1/arch/powerpc/platforms/pseries/smp.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/powerpc/platforms/pseries/smp.c 2007-05-19 23:58:35.000000000 +0900 @@ -344,7 +344,7 @@ } #endif /* CONFIG_XICS */ -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned long timebase = 0; static 
void __devinit pSeries_give_timebase(void) diff -urN ./linux-2.6.18.1/arch/ppc/8260_io/enet.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/8260_io/enet.c --- ./linux-2.6.18.1/arch/ppc/8260_io/enet.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/8260_io/enet.c 2007-05-19 23:58:35.000000000 +0900 @@ -116,7 +116,7 @@ scc_t *sccp; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); diff -urN ./linux-2.6.18.1/arch/ppc/8260_io/fcc_enet.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/8260_io/fcc_enet.c --- ./linux-2.6.18.1/arch/ppc/8260_io/fcc_enet.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/8260_io/fcc_enet.c 2007-05-19 23:58:35.000000000 +0900 @@ -376,7 +376,7 @@ volatile fcc_enet_t *ep; struct net_device_stats stats; uint tx_free; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; diff -urN ./linux-2.6.18.1/arch/ppc/8xx_io/commproc.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/8xx_io/commproc.c --- ./linux-2.6.18.1/arch/ppc/8xx_io/commproc.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/8xx_io/commproc.c 2007-05-19 23:58:35.000000000 +0900 @@ -356,7 +356,7 @@ /* * dpalloc / dpfree bits. */ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* * 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... diff -urN ./linux-2.6.18.1/arch/ppc/8xx_io/enet.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/8xx_io/enet.c --- ./linux-2.6.18.1/arch/ppc/8xx_io/enet.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/8xx_io/enet.c 2007-05-19 23:58:35.000000000 +0900 @@ -143,7 +143,7 @@ unsigned char *rx_vaddr[RX_RING_SIZE]; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); diff -urN ./linux-2.6.18.1/arch/ppc/8xx_io/fec.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/8xx_io/fec.c --- ./linux-2.6.18.1/arch/ppc/8xx_io/fec.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/8xx_io/fec.c 2007-05-19 23:58:35.000000000 +0900 @@ -164,7 +164,7 @@ struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; diff -urN ./linux-2.6.18.1/arch/ppc/Kconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/Kconfig --- ./linux-2.6.18.1/arch/ppc/Kconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/Kconfig 2007-05-19 23:58:35.000000000 +0900 @@ -12,13 +12,6 @@ bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config GENERIC_HWEIGHT bool default y @@ -955,6 +948,18 @@ source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "mm/Kconfig" source "fs/Kconfig.binfmt" diff -urN ./linux-2.6.18.1/arch/ppc/boot/Makefile linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/boot/Makefile --- ./linux-2.6.18.1/arch/ppc/boot/Makefile 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/boot/Makefile 2007-05-19 23:58:35.000000000 +0900 @@ -14,6 +14,15 @@ # CFLAGS += -fno-builtin -D__BOOTER__ -Iarch/$(ARCH)/boot/include + +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring 
:= +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +CFLAGS := $(subst ${pg_flag},${space},${CFLAGS}) +endif + HOSTCFLAGS += -Iarch/$(ARCH)/boot/include BOOT_TARGETS = zImage zImage.initrd znetboot znetboot.initrd diff -urN ./linux-2.6.18.1/arch/ppc/kernel/dma-mapping.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/kernel/dma-mapping.c --- ./linux-2.6.18.1/arch/ppc/kernel/dma-mapping.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/kernel/dma-mapping.c 2007-05-19 23:58:35.000000000 +0900 @@ -70,7 +70,7 @@ * This is the page table (2MB) covering uncached, DMA consistent allocations */ static pte_t *consistent_pte; -static DEFINE_SPINLOCK(consistent_lock); +static DEFINE_RAW_SPINLOCK(consistent_lock); /* * VM region handling support. diff -urN ./linux-2.6.18.1/arch/ppc/kernel/entry.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/kernel/entry.S --- ./linux-2.6.18.1/arch/ppc/kernel/entry.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/kernel/entry.S 2007-05-19 23:58:35.000000000 +0900 @@ -856,7 +856,7 @@ #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -870,7 +870,7 @@ MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,18 lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. r0,r9,_TIF_SIGPENDING beq restore_user diff -urN ./linux-2.6.18.1/arch/ppc/kernel/semaphore.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/kernel/semaphore.c --- ./linux-2.6.18.1/arch/ppc/kernel/semaphore.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/kernel/semaphore.c 2007-05-19 23:58:35.000000000 +0900 @@ -29,7 +29,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -48,7 +48,7 @@ return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -70,7 +70,7 @@ * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. */ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -100,7 +100,7 @@ wake_up(&sem->wait); } -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -129,3 +129,8 @@ wake_up(&sem->wait); return retval; } + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} diff -urN ./linux-2.6.18.1/arch/ppc/kernel/smp.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/kernel/smp.c --- ./linux-2.6.18.1/arch/ppc/kernel/smp.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/kernel/smp.c 2007-05-19 23:58:35.000000000 +0900 @@ -137,6 +137,16 @@ smp_message_pass(cpu, PPC_MSG_RESCHEDULE); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. 
+ * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + smp_message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE, 0, 0); +} + #ifdef CONFIG_XMON void smp_send_xmon_break(int cpu) { @@ -161,7 +171,7 @@ * static memory requirements. It also looks cleaner. * Stolen from the i386 version. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); static struct call_data_struct { void (*func) (void *info); diff -urN ./linux-2.6.18.1/arch/ppc/kernel/time.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/kernel/time.c --- ./linux-2.6.18.1/arch/ppc/kernel/time.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/kernel/time.c 2007-05-19 23:58:35.000000000 +0900 @@ -65,6 +65,9 @@ #include +unsigned long cpu_khz; /* Detected as we calibrate the TSC */ +EXPORT_SYMBOL(cpu_khz); + unsigned long disarm_decr[NR_CPUS]; extern struct timezone sys_tz; @@ -103,7 +106,7 @@ } #ifdef CONFIG_SMP -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); diff -urN ./linux-2.6.18.1/arch/ppc/kernel/traps.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/kernel/traps.c --- ./linux-2.6.18.1/arch/ppc/kernel/traps.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/kernel/traps.c 2007-05-19 23:58:35.000000000 +0900 @@ -71,7 +71,7 @@ * Trap & Exception support */ -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); int die(const char * str, struct pt_regs * fp, long err) { @@ -106,6 +106,10 @@ debugger(regs); die("Exception in kernel mode", regs, signr); } +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif info.si_signo = signr; info.si_errno = 0; info.si_code = code; diff -urN ./linux-2.6.18.1/arch/ppc/lib/locks.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/lib/locks.c --- ./linux-2.6.18.1/arch/ppc/lib/locks.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/lib/locks.c 2007-05-19 23:58:35.000000000 +0900 @@ -42,7 +42,7 @@ return ret; } -void _raw_spin_lock(spinlock_t *lock) +void __raw_spin_lock(raw_spinlock_t *lock) { int cpu = smp_processor_id(); unsigned int stuck = INIT_STUCK; @@ -62,9 +62,9 @@ lock->owner_pc = (unsigned long)__builtin_return_address(0); lock->owner_cpu = cpu; } -EXPORT_SYMBOL(_raw_spin_lock); +EXPORT_SYMBOL(__raw_spin_lock); -int _raw_spin_trylock(spinlock_t *lock) +int __raw_spin_trylock(raw_spinlock_t *lock) { if (__spin_trylock(&lock->lock)) return 0; @@ -72,9 +72,9 @@ lock->owner_pc = (unsigned long)__builtin_return_address(0); return 1; } -EXPORT_SYMBOL(_raw_spin_trylock); +EXPORT_SYMBOL(__raw_spin_trylock); -void _raw_spin_unlock(spinlock_t *lp) +void __raw_spin_unlock(raw_spinlock_t *lp) { if ( !lp->lock ) printk("_spin_unlock(%p): no lock cpu %d curr PC %p %s/%d\n", @@ -88,13 +88,13 @@ wmb(); lp->lock = 0; } -EXPORT_SYMBOL(_raw_spin_unlock); +EXPORT_SYMBOL(__raw_spin_unlock); /* * For rwlocks, zero is unlocked, -1 is write-locked, * positive is read-locked. 
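 * For example, two concurrent readers leave lock == 2; a writer claims
 * the lock by cmpxchg()ing 0 -> -1 (see __raw_write_trylock below), and
 * __read_trylock() keeps returning a non-positive value until the
 * writer stores 0 again.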
 */
-static __inline__ int __read_trylock(rwlock_t *rw)
+static __inline__ int __read_trylock(raw_rwlock_t *rw)
 {
 	signed int tmp;

@@ -114,13 +114,13 @@
 	return tmp;
 }

-int _raw_read_trylock(rwlock_t *rw)
+int __raw_read_trylock(raw_rwlock_t *rw)
 {
 	return __read_trylock(rw) > 0;
 }
-EXPORT_SYMBOL(_raw_read_trylock);
+EXPORT_SYMBOL(__raw_read_trylock);

-void _raw_read_lock(rwlock_t *rw)
+void __raw_read_lock(raw_rwlock_t *rw)
 {
 	unsigned int stuck;

@@ -135,9 +135,9 @@
 		}
 	}
 }
-EXPORT_SYMBOL(_raw_read_lock);
+EXPORT_SYMBOL(__raw_read_lock);

-void _raw_read_unlock(rwlock_t *rw)
+void __raw_read_unlock(raw_rwlock_t *rw)
 {
 	if ( rw->lock == 0 )
 		printk("_read_unlock(): %s/%d (nip %08lX) lock %d\n",
@@ -146,9 +146,9 @@
 	wmb();
 	atomic_dec((atomic_t *) &(rw)->lock);
 }
-EXPORT_SYMBOL(_raw_read_unlock);
+EXPORT_SYMBOL(__raw_read_unlock);

-void _raw_write_lock(rwlock_t *rw)
+void __raw_write_lock(raw_rwlock_t *rw)
 {
 	unsigned int stuck;

@@ -164,18 +164,18 @@
 	}
 	wmb();
 }
-EXPORT_SYMBOL(_raw_write_lock);
+EXPORT_SYMBOL(__raw_write_lock);

-int _raw_write_trylock(rwlock_t *rw)
+int __raw_write_trylock(raw_rwlock_t *rw)
 {
 	if (cmpxchg(&rw->lock, 0, -1) != 0)
 		return 0;
 	wmb();
 	return 1;
 }
-EXPORT_SYMBOL(_raw_write_trylock);
+EXPORT_SYMBOL(__raw_write_trylock);

-void _raw_write_unlock(rwlock_t *rw)
+void __raw_write_unlock(raw_rwlock_t *rw)
 {
 	if (rw->lock >= 0)
 		printk("_write_lock(): %s/%d (nip %08lX) lock %d\n",
@@ -184,6 +184,6 @@
 	wmb();
 	rw->lock = 0;
 }
-EXPORT_SYMBOL(_raw_write_unlock);
+EXPORT_SYMBOL(__raw_write_unlock);

 #endif
diff -urN ./linux-2.6.18.1/arch/ppc/mm/fault.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/mm/fault.c
--- ./linux-2.6.18.1/arch/ppc/mm/fault.c	2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/mm/fault.c	2007-05-19 23:58:35.000000000 +0900
@@ -89,7 +89,7 @@
 * the error_code parameter is ESR for a data fault, 0 for an instruction
 * fault.
*/ -int do_page_fault(struct pt_regs *regs, unsigned long address, +int notrace do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code) { struct vm_area_struct * vma; diff -urN ./linux-2.6.18.1/arch/ppc/mm/init.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/mm/init.c --- ./linux-2.6.18.1/arch/ppc/mm/init.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/mm/init.c 2007-05-19 23:58:35.000000000 +0900 @@ -55,7 +55,7 @@ #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long total_memory; unsigned long total_lowmem; diff -urN ./linux-2.6.18.1/arch/ppc/platforms/apus_setup.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/apus_setup.c --- ./linux-2.6.18.1/arch/ppc/platforms/apus_setup.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/apus_setup.c 2007-05-19 23:58:35.000000000 +0900 @@ -275,6 +275,7 @@ freq/1000000, freq%1000000); tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; __bus_speed = bus_speed; __speed_test_failed = speed_test_failed; diff -urN ./linux-2.6.18.1/arch/ppc/platforms/ev64260.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/ev64260.c --- ./linux-2.6.18.1/arch/ppc/platforms/ev64260.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/ev64260.c 2007-05-19 23:58:35.000000000 +0900 @@ -550,6 +550,7 @@ tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; return; } diff -urN ./linux-2.6.18.1/arch/ppc/platforms/gemini_setup.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/gemini_setup.c --- ./linux-2.6.18.1/arch/ppc/platforms/gemini_setup.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/gemini_setup.c 2007-05-19 23:58:35.000000000 +0900 @@ -459,6 +459,7 @@ divisor = 4; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } unsigned long __init gemini_find_end_of_memory(void) diff -urN ./linux-2.6.18.1/arch/ppc/platforms/hdpu.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/hdpu.c --- ./linux-2.6.18.1/arch/ppc/platforms/hdpu.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/hdpu.c 2007-05-19 23:58:35.000000000 +0900 @@ -55,7 +55,7 @@ static void hdpu_set_l1pe(void); static void hdpu_cpustate_set(unsigned char new_state); #ifdef CONFIG_SMP -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; extern int smp_tb_synchronized; diff -urN ./linux-2.6.18.1/arch/ppc/platforms/powerpmc250.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/powerpmc250.c --- ./linux-2.6.18.1/arch/ppc/platforms/powerpmc250.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/powerpmc250.c 2007-05-19 23:58:35.000000000 +0900 @@ -163,6 +163,7 @@ tb_ticks_per_jiffy = freq / (HZ * divisor); tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static void diff -urN ./linux-2.6.18.1/arch/ppc/platforms/prep_setup.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/prep_setup.c --- ./linux-2.6.18.1/arch/ppc/platforms/prep_setup.c 2006-10-14 12:34:03.000000000 +0900 +++ 
linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/prep_setup.c 2007-05-19 23:58:35.000000000 +0900 @@ -940,6 +940,7 @@ (freq/divisor)/1000000, (freq/divisor)%1000000); tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; tb_ticks_per_jiffy = freq / HZ / divisor; } } diff -urN ./linux-2.6.18.1/arch/ppc/platforms/prpmc750.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/prpmc750.c --- ./linux-2.6.18.1/arch/ppc/platforms/prpmc750.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/prpmc750.c 2007-05-19 23:58:35.000000000 +0900 @@ -268,6 +268,7 @@ tb_ticks_per_jiffy = freq / (HZ * divisor); tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static void prpmc750_restart(char *cmd) diff -urN ./linux-2.6.18.1/arch/ppc/platforms/prpmc800.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/prpmc800.c --- ./linux-2.6.18.1/arch/ppc/platforms/prpmc800.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/prpmc800.c 2007-05-19 23:58:35.000000000 +0900 @@ -327,6 +327,7 @@ tb_ticks_per_second = 100000000 / 4; tb_ticks_per_jiffy = tb_ticks_per_second / HZ; tb_to_us = mulhwu_scale_factor(tb_ticks_per_second, 1000000); + cpu_khz = tb_ticks_per_second / 1000; return; } @@ -367,6 +368,7 @@ tb_ticks_per_second = (tbl_end - tbl_start) * 2; tb_ticks_per_jiffy = tb_ticks_per_second / HZ; tb_to_us = mulhwu_scale_factor(tb_ticks_per_second, 1000000); + cpu_khz = tb_ticks_per_second / 1000; } static void prpmc800_restart(char *cmd) diff -urN ./linux-2.6.18.1/arch/ppc/platforms/sbc82xx.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/sbc82xx.c --- ./linux-2.6.18.1/arch/ppc/platforms/sbc82xx.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/sbc82xx.c 2007-05-19 23:58:35.000000000 +0900 @@ -65,7 +65,7 @@ static volatile char *sbc82xx_i8259_map; static char sbc82xx_i8259_mask = 0xff; -static DEFINE_SPINLOCK(sbc82xx_i8259_lock); +static DEFINE_RAW_SPINLOCK(sbc82xx_i8259_lock); static void sbc82xx_i8259_mask_and_ack_irq(unsigned int irq_nr) { diff -urN ./linux-2.6.18.1/arch/ppc/platforms/spruce.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/spruce.c --- ./linux-2.6.18.1/arch/ppc/platforms/spruce.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/platforms/spruce.c 2007-05-19 23:58:35.000000000 +0900 @@ -147,6 +147,7 @@ freq = SPRUCE_BUS_SPEED; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static int diff -urN ./linux-2.6.18.1/arch/ppc/syslib/cpm2_common.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/cpm2_common.c --- ./linux-2.6.18.1/arch/ppc/syslib/cpm2_common.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/cpm2_common.c 2007-05-19 23:58:35.000000000 +0900 @@ -114,7 +114,7 @@ /* * dpalloc / dpfree bits. */ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... 
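 * (The switch to raw_spinlock_t above is presumably because dpalloc /
 * dpfree can run in contexts that must not sleep, and with PREEMPT_RT
 * a plain spinlock_t becomes a sleeping lock.)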
*/ static rh_block_t cpm_boot_dpmem_rh_block[16]; diff -urN ./linux-2.6.18.1/arch/ppc/syslib/ibm44x_common.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/ibm44x_common.c --- ./linux-2.6.18.1/arch/ppc/syslib/ibm44x_common.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/ibm44x_common.c 2007-05-19 23:58:35.000000000 +0900 @@ -63,6 +63,7 @@ { tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; /* Set the time base to zero */ mtspr(SPRN_TBWL, 0); diff -urN ./linux-2.6.18.1/arch/ppc/syslib/m8260_setup.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/m8260_setup.c --- ./linux-2.6.18.1/arch/ppc/syslib/m8260_setup.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/m8260_setup.c 2007-05-19 23:58:35.000000000 +0900 @@ -79,6 +79,7 @@ divisor = 4; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } /* The 8260 has an internal 1-second timer update register that diff -urN ./linux-2.6.18.1/arch/ppc/syslib/m8xx_setup.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/m8xx_setup.c --- ./linux-2.6.18.1/arch/ppc/syslib/m8xx_setup.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/m8xx_setup.c 2007-05-19 23:58:35.000000000 +0900 @@ -218,6 +218,7 @@ printk("Decrementer Frequency = %d/%d\n", freq, divisor); tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; /* Perform some more timer/timebase initialization. This used * to be done elsewhere, but other changes caused it to get diff -urN ./linux-2.6.18.1/arch/ppc/syslib/mpc52xx_setup.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/mpc52xx_setup.c --- ./linux-2.6.18.1/arch/ppc/syslib/mpc52xx_setup.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/mpc52xx_setup.c 2007-05-19 23:58:35.000000000 +0900 @@ -215,6 +215,7 @@ tb_ticks_per_jiffy = xlbfreq / HZ / divisor; tb_to_us = mulhwu_scale_factor(xlbfreq / divisor, 1000000); + cpu_khz = (xlbfreq / divisor) / 1000; } diff -urN ./linux-2.6.18.1/arch/ppc/syslib/ocp.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/ocp.c --- ./linux-2.6.18.1/arch/ppc/syslib/ocp.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/ocp.c 2007-05-19 23:58:35.000000000 +0900 @@ -44,11 +44,11 @@ #include #include #include +#include #include #include #include -#include #include //#define DBG(x) printk x diff -urN ./linux-2.6.18.1/arch/ppc/syslib/open_pic.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/open_pic.c --- ./linux-2.6.18.1/arch/ppc/syslib/open_pic.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/open_pic.c 2007-05-19 23:58:35.000000000 +0900 @@ -526,7 +526,7 @@ } #if defined(CONFIG_SMP) || defined(CONFIG_PM) -static DEFINE_SPINLOCK(openpic_setup_lock); +static DEFINE_RAW_SPINLOCK(openpic_setup_lock); #endif #ifdef CONFIG_SMP diff -urN ./linux-2.6.18.1/arch/ppc/syslib/open_pic2.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/open_pic2.c --- ./linux-2.6.18.1/arch/ppc/syslib/open_pic2.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/open_pic2.c 2007-05-19 23:58:35.000000000 +0900 @@ -380,7 +380,7 @@ vec); } -static DEFINE_SPINLOCK(openpic2_setup_lock); +static 
DEFINE_RAW_SPINLOCK(openpic2_setup_lock);

 /*
  * Initialize a timer interrupt (and disable it)
diff -urN ./linux-2.6.18.1/arch/ppc/syslib/ppc4xx_setup.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/ppc4xx_setup.c
--- ./linux-2.6.18.1/arch/ppc/syslib/ppc4xx_setup.c	2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/ppc4xx_setup.c	2007-05-19 23:58:35.000000000 +0900
@@ -172,6 +172,7 @@
 	freq = bip->bi_tbfreq;
 	tb_ticks_per_jiffy = freq / HZ;
 	tb_to_us = mulhwu_scale_factor(freq, 1000000);
+	cpu_khz = freq / 1000;

 	/* Set the time base to zero.
 	** At 200 Mhz, time base will rollover in ~2925 years.
diff -urN ./linux-2.6.18.1/arch/ppc/syslib/ppc85xx_setup.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/ppc85xx_setup.c
--- ./linux-2.6.18.1/arch/ppc/syslib/ppc85xx_setup.c	2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/ppc85xx_setup.c	2007-05-19 23:58:35.000000000 +0900
@@ -57,6 +57,7 @@
 	divisor = 8;
 	tb_ticks_per_jiffy = freq / divisor / HZ;
 	tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000);
+	cpu_khz = (freq / divisor) / 1000;

 	/* Set the time base to zero */
 	mtspr(SPRN_TBWL, 0);
diff -urN ./linux-2.6.18.1/arch/ppc/syslib/todc_time.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/todc_time.c
--- ./linux-2.6.18.1/arch/ppc/syslib/todc_time.c	2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/ppc/syslib/todc_time.c	2007-05-19 23:58:35.000000000 +0900
@@ -506,6 +506,7 @@
 	tb_ticks_per_jiffy = freq / HZ;
 	tb_to_us = mulhwu_scale_factor(freq, 1000000);
+	cpu_khz = freq / 1000;

 	return;
 }
diff -urN ./linux-2.6.18.1/arch/sparc64/Kconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/sparc64/Kconfig
--- ./linux-2.6.18.1/arch/sparc64/Kconfig	2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/sparc64/Kconfig	2007-05-19 23:58:35.000000000 +0900
@@ -26,7 +26,7 @@
 	bool
 	default y

-config TIME_INTERPOLATION
+config GENERIC_TIME
 	bool
 	default y

diff -urN ./linux-2.6.18.1/arch/sparc64/defconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/sparc64/defconfig
--- ./linux-2.6.18.1/arch/sparc64/defconfig	2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/sparc64/defconfig	2007-05-19 23:58:35.000000000 +0900
@@ -7,7 +7,7 @@
 CONFIG_SPARC64=y
 CONFIG_64BIT=y
 CONFIG_MMU=y
-CONFIG_TIME_INTERPOLATION=y
+CONFIG_GENERIC_TIME=y
 CONFIG_ARCH_MAY_HAVE_PC_FDC=y
 CONFIG_SPARC64_PAGE_SIZE_8KB=y
 # CONFIG_SPARC64_PAGE_SIZE_64KB is not set
diff -urN ./linux-2.6.18.1/arch/sparc64/kernel/time.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/sparc64/kernel/time.c
--- ./linux-2.6.18.1/arch/sparc64/kernel/time.c	2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/sparc64/kernel/time.c	2007-05-20 00:11:12.000000000 +0900
@@ -31,6 +31,7 @@
 #include
 #include
 #include
+#include
 #include
 #include

@@ -621,7 +622,7 @@
 	if (!mregs && !dregs) {
 		prom_printf("Something wrong, clock regs not mapped yet.\n");
 		prom_halt();
-	}
+	}

 	if (mregs) {
 		spin_lock_irq(&mostek_lock);
@@ -821,7 +822,7 @@
 	}

 	set_system_time();
-	
+
 	local_irq_restore(flags);

 	return 0;
@@ -976,22 +977,33 @@

 #endif /* CONFIG_CPU_FREQ */

-static struct time_interpolator sparc64_cpu_interpolator = {
-	.source		= TIME_SOURCE_CPU,
-	.shift		= 16,
-	.mask		= 0xffffffffffffffffLL
+static cycle_t read_itc(void)
+{
+	return (cycle_t)get_cycles();
+}
+
+static struct clocksource clocksource_sparc64_itc = {
+	.name		= "sparc64_itc",
+	.rating		= 300,
+	.read		= read_itc,
+	.mask		= 0xffffffffffffffffLL,
+	.mult		= 0, /* to be calculated */
+	.shift		= 16,
+	.is_continuous	= 1,
 };
+

 /* The quotient formula is taken from the IA64 port. */
 #define SPARC64_NSEC_PER_CYC_SHIFT	10UL

 void __init time_init(void)
 {
 	unsigned long clock = sparc64_init_timers();

-	sparc64_cpu_interpolator.frequency = clock;
-	register_time_interpolator(&sparc64_cpu_interpolator);
+	clocksource_sparc64_itc.mult = clocksource_hz2mult(clock,
+		clocksource_sparc64_itc.shift);
+	clocksource_register(&clocksource_sparc64_itc);

-	/* Now that the interpolator is registered, it is
+	/* Now that the clocksource is registered, it is
 	 * safe to start the timer ticking.
 	 */
 	sparc64_start_timers();
@@ -1026,11 +1038,11 @@
 	unsigned long flags;
 	u8 tmp;

-	/* 
+	/*
 	 * Not having a register set can lead to trouble.
 	 * Also starfire doesn't have a tod clock.
 	 */
-	if (!mregs && !dregs) 
+	if (!mregs && !dregs)
 		return -1;

 	if (mregs) {
diff -urN ./linux-2.6.18.1/arch/sparc64/kernel/time.c.orig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/sparc64/kernel/time.c.orig
--- ./linux-2.6.18.1/arch/sparc64/kernel/time.c.orig	1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/sparc64/kernel/time.c.orig	2007-05-19 23:58:35.000000000 +0900
@@ -0,0 +1,1380 @@
+/* $Id: time.c,v 1.42 2002/01/23 14:33:55 davem Exp $
+ * time.c: UltraSparc timer and TOD clock support.
+ *
+ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be)
+ *
+ * Based largely on code which is:
+ *
+ * Copyright (C) 1996 Thomas K. Dyas (tdyas@eden.rutgers.edu)
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+DEFINE_SPINLOCK(mostek_lock);
+DEFINE_SPINLOCK(rtc_lock);
+void __iomem *mstk48t02_regs = NULL;
+#ifdef CONFIG_PCI
+unsigned long ds1287_regs = 0UL;
+#endif
+
+extern unsigned long wall_jiffies;
+
+static void __iomem *mstk48t08_regs;
+static void __iomem *mstk48t59_regs;
+
+static int set_rtc_mmss(unsigned long);
+
+#define TICK_PRIV_BIT	(1UL << 63)
+
+#ifdef CONFIG_SMP
+unsigned long profile_pc(struct pt_regs *regs)
+{
+	unsigned long pc = instruction_pointer(regs);
+
+	if (in_lock_functions(pc))
+		return regs->u_regs[UREG_RETPC];
+	return pc;
+}
+EXPORT_SYMBOL(profile_pc);
+#endif
+
+static void tick_disable_protection(void)
+{
+	/* Set things up so user can access tick register for profiling
+	 * purposes.  Also workaround BB_ERRATA_1 by doing a dummy
+	 * read back of %tick after writing it.
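+	 * TICK_PRIV_BIT is bit 63 of %tick: clearing it lets user mode
+	 * read the register without trapping, and the trailing read of
+	 * %tick is the dummy read the errata calls for.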
+ */ + __asm__ __volatile__( + " ba,pt %%xcc, 1f\n" + " nop\n" + " .align 64\n" + "1: rd %%tick, %%g2\n" + " add %%g2, 6, %%g2\n" + " andn %%g2, %0, %%g2\n" + " wrpr %%g2, 0, %%tick\n" + " rdpr %%tick, %%g0" + : /* no outputs */ + : "r" (TICK_PRIV_BIT) + : "g2"); +} + +static void tick_init_tick(unsigned long offset) +{ + tick_disable_protection(); + + __asm__ __volatile__( + " rd %%tick, %%g1\n" + " andn %%g1, %1, %%g1\n" + " ba,pt %%xcc, 1f\n" + " add %%g1, %0, %%g1\n" + " .align 64\n" + "1: wr %%g1, 0x0, %%tick_cmpr\n" + " rd %%tick_cmpr, %%g0" + : /* no outputs */ + : "r" (offset), "r" (TICK_PRIV_BIT) + : "g1"); +} + +static unsigned long tick_get_tick(void) +{ + unsigned long ret; + + __asm__ __volatile__("rd %%tick, %0\n\t" + "mov %0, %0" + : "=r" (ret)); + + return ret & ~TICK_PRIV_BIT; +} + +static unsigned long tick_get_compare(void) +{ + unsigned long ret; + + __asm__ __volatile__("rd %%tick_cmpr, %0\n\t" + "mov %0, %0" + : "=r" (ret)); + + return ret; +} + +static unsigned long tick_add_compare(unsigned long adj) +{ + unsigned long new_compare; + + /* Workaround for Spitfire Errata (#54 I think??), I discovered + * this via Sun BugID 4008234, mentioned in Solaris-2.5.1 patch + * number 103640. + * + * On Blackbird writes to %tick_cmpr can fail, the + * workaround seems to be to execute the wr instruction + * at the start of an I-cache line, and perform a dummy + * read back from %tick_cmpr right after writing to it. -DaveM + */ + __asm__ __volatile__("rd %%tick_cmpr, %0\n\t" + "ba,pt %%xcc, 1f\n\t" + " add %0, %1, %0\n\t" + ".align 64\n" + "1:\n\t" + "wr %0, 0, %%tick_cmpr\n\t" + "rd %%tick_cmpr, %%g0" + : "=&r" (new_compare) + : "r" (adj)); + + return new_compare; +} + +static unsigned long tick_add_tick(unsigned long adj, unsigned long offset) +{ + unsigned long new_tick, tmp; + + /* Also need to handle Blackbird bug here too. */ + __asm__ __volatile__("rd %%tick, %0\n\t" + "add %0, %2, %0\n\t" + "wrpr %0, 0, %%tick\n\t" + "andn %0, %4, %1\n\t" + "ba,pt %%xcc, 1f\n\t" + " add %1, %3, %1\n\t" + ".align 64\n" + "1:\n\t" + "wr %1, 0, %%tick_cmpr\n\t" + "rd %%tick_cmpr, %%g0" + : "=&r" (new_tick), "=&r" (tmp) + : "r" (adj), "r" (offset), "r" (TICK_PRIV_BIT)); + + return new_tick; +} + +static struct sparc64_tick_ops tick_operations __read_mostly = { + .init_tick = tick_init_tick, + .get_tick = tick_get_tick, + .get_compare = tick_get_compare, + .add_tick = tick_add_tick, + .add_compare = tick_add_compare, + .softint_mask = 1UL << 0, +}; + +struct sparc64_tick_ops *tick_ops __read_mostly = &tick_operations; + +static void stick_init_tick(unsigned long offset) +{ + /* Writes to the %tick and %stick register are not + * allowed on sun4v. The Hypervisor controls that + * bit, per-strand. + */ + if (tlb_type != hypervisor) { + tick_disable_protection(); + + /* Let the user get at STICK too. 
*/ + __asm__ __volatile__( + " rd %%asr24, %%g2\n" + " andn %%g2, %0, %%g2\n" + " wr %%g2, 0, %%asr24" + : /* no outputs */ + : "r" (TICK_PRIV_BIT) + : "g1", "g2"); + } + + __asm__ __volatile__( + " rd %%asr24, %%g1\n" + " andn %%g1, %1, %%g1\n" + " add %%g1, %0, %%g1\n" + " wr %%g1, 0x0, %%asr25" + : /* no outputs */ + : "r" (offset), "r" (TICK_PRIV_BIT) + : "g1"); +} + +static unsigned long stick_get_tick(void) +{ + unsigned long ret; + + __asm__ __volatile__("rd %%asr24, %0" + : "=r" (ret)); + + return ret & ~TICK_PRIV_BIT; +} + +static unsigned long stick_get_compare(void) +{ + unsigned long ret; + + __asm__ __volatile__("rd %%asr25, %0" + : "=r" (ret)); + + return ret; +} + +static unsigned long stick_add_tick(unsigned long adj, unsigned long offset) +{ + unsigned long new_tick, tmp; + + __asm__ __volatile__("rd %%asr24, %0\n\t" + "add %0, %2, %0\n\t" + "wr %0, 0, %%asr24\n\t" + "andn %0, %4, %1\n\t" + "add %1, %3, %1\n\t" + "wr %1, 0, %%asr25" + : "=&r" (new_tick), "=&r" (tmp) + : "r" (adj), "r" (offset), "r" (TICK_PRIV_BIT)); + + return new_tick; +} + +static unsigned long stick_add_compare(unsigned long adj) +{ + unsigned long new_compare; + + __asm__ __volatile__("rd %%asr25, %0\n\t" + "add %0, %1, %0\n\t" + "wr %0, 0, %%asr25" + : "=&r" (new_compare) + : "r" (adj)); + + return new_compare; +} + +static struct sparc64_tick_ops stick_operations __read_mostly = { + .init_tick = stick_init_tick, + .get_tick = stick_get_tick, + .get_compare = stick_get_compare, + .add_tick = stick_add_tick, + .add_compare = stick_add_compare, + .softint_mask = 1UL << 16, +}; + +/* On Hummingbird the STICK/STICK_CMPR register is implemented + * in I/O space. There are two 64-bit registers each, the + * first holds the low 32-bits of the value and the second holds + * the high 32-bits. + * + * Since STICK is constantly updating, we have to access it carefully. + * + * The sequence we use to read is: + * 1) read high + * 2) read low + * 3) read high again, if it rolled re-read both low and high again. 
+ * + * Writing STICK safely is also tricky: + * 1) write low to zero + * 2) write high + * 3) write low + */ +#define HBIRD_STICKCMP_ADDR 0x1fe0000f060UL +#define HBIRD_STICK_ADDR 0x1fe0000f070UL + +static unsigned long __hbird_read_stick(void) +{ + unsigned long ret, tmp1, tmp2, tmp3; + unsigned long addr = HBIRD_STICK_ADDR+8; + + __asm__ __volatile__("ldxa [%1] %5, %2\n" + "1:\n\t" + "sub %1, 0x8, %1\n\t" + "ldxa [%1] %5, %3\n\t" + "add %1, 0x8, %1\n\t" + "ldxa [%1] %5, %4\n\t" + "cmp %4, %2\n\t" + "bne,a,pn %%xcc, 1b\n\t" + " mov %4, %2\n\t" + "sllx %4, 32, %4\n\t" + "or %3, %4, %0\n\t" + : "=&r" (ret), "=&r" (addr), + "=&r" (tmp1), "=&r" (tmp2), "=&r" (tmp3) + : "i" (ASI_PHYS_BYPASS_EC_E), "1" (addr)); + + return ret; +} + +static unsigned long __hbird_read_compare(void) +{ + unsigned long low, high; + unsigned long addr = HBIRD_STICKCMP_ADDR; + + __asm__ __volatile__("ldxa [%2] %3, %0\n\t" + "add %2, 0x8, %2\n\t" + "ldxa [%2] %3, %1" + : "=&r" (low), "=&r" (high), "=&r" (addr) + : "i" (ASI_PHYS_BYPASS_EC_E), "2" (addr)); + + return (high << 32UL) | low; +} + +static void __hbird_write_stick(unsigned long val) +{ + unsigned long low = (val & 0xffffffffUL); + unsigned long high = (val >> 32UL); + unsigned long addr = HBIRD_STICK_ADDR; + + __asm__ __volatile__("stxa %%g0, [%0] %4\n\t" + "add %0, 0x8, %0\n\t" + "stxa %3, [%0] %4\n\t" + "sub %0, 0x8, %0\n\t" + "stxa %2, [%0] %4" + : "=&r" (addr) + : "0" (addr), "r" (low), "r" (high), + "i" (ASI_PHYS_BYPASS_EC_E)); +} + +static void __hbird_write_compare(unsigned long val) +{ + unsigned long low = (val & 0xffffffffUL); + unsigned long high = (val >> 32UL); + unsigned long addr = HBIRD_STICKCMP_ADDR + 0x8UL; + + __asm__ __volatile__("stxa %3, [%0] %4\n\t" + "sub %0, 0x8, %0\n\t" + "stxa %2, [%0] %4" + : "=&r" (addr) + : "0" (addr), "r" (low), "r" (high), + "i" (ASI_PHYS_BYPASS_EC_E)); +} + +static void hbtick_init_tick(unsigned long offset) +{ + unsigned long val; + + tick_disable_protection(); + + /* XXX This seems to be necessary to 'jumpstart' Hummingbird + * XXX into actually sending STICK interrupts. I think because + * XXX of how we store %tick_cmpr in head.S this somehow resets the + * XXX {TICK + STICK} interrupt mux. -DaveM + */ + __hbird_write_stick(__hbird_read_stick()); + + val = __hbird_read_stick() & ~TICK_PRIV_BIT; + __hbird_write_compare(val + offset); +} + +static unsigned long hbtick_get_tick(void) +{ + return __hbird_read_stick() & ~TICK_PRIV_BIT; +} + +static unsigned long hbtick_get_compare(void) +{ + return __hbird_read_compare(); +} + +static unsigned long hbtick_add_tick(unsigned long adj, unsigned long offset) +{ + unsigned long val; + + val = __hbird_read_stick() + adj; + __hbird_write_stick(val); + + val &= ~TICK_PRIV_BIT; + __hbird_write_compare(val + offset); + + return val; +} + +static unsigned long hbtick_add_compare(unsigned long adj) +{ + unsigned long val = __hbird_read_compare() + adj; + + val &= ~TICK_PRIV_BIT; + __hbird_write_compare(val); + + return val; +} + +static struct sparc64_tick_ops hbtick_operations __read_mostly = { + .init_tick = hbtick_init_tick, + .get_tick = hbtick_get_tick, + .get_compare = hbtick_get_compare, + .add_tick = hbtick_add_tick, + .add_compare = hbtick_add_compare, + .softint_mask = 1UL << 0, +}; + +/* timer_interrupt() needs to keep up the real-time clock, + * as well as call the "do_timer()" routine every clocktick + * + * NOTE: On SUN5 systems the ticker interrupt comes in using 2 + * interrupts, one at level14 and one with softint bit 0. 
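+ *
+ * The do/while loop below re-arms the compare register and repeats
+ * while time_after_eq(ticks, compare), i.e. while the new compare
+ * value is already in the past, so overrun ticks are replayed rather
+ * than dropped.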
+ */ +unsigned long timer_tick_offset __read_mostly; + +static unsigned long timer_ticks_per_nsec_quotient __read_mostly; + +#define TICK_SIZE (tick_nsec / 1000) + +static inline void timer_check_rtc(void) +{ + /* last time the cmos clock got updated */ + static long last_rtc_update; + + /* Determine when to update the Mostek clock. */ + if (ntp_synced() && + xtime.tv_sec > last_rtc_update + 660 && + (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 && + (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) { + if (set_rtc_mmss(xtime.tv_sec) == 0) + last_rtc_update = xtime.tv_sec; + else + last_rtc_update = xtime.tv_sec - 600; + /* do it again in 60 s */ + } +} + +irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs * regs) +{ + unsigned long ticks, compare, pstate; + + write_seqlock(&xtime_lock); + + do { +#ifndef CONFIG_SMP + profile_tick(CPU_PROFILING, regs); + update_process_times(user_mode(regs)); +#endif + do_timer(regs); + + /* Guarantee that the following sequences execute + * uninterrupted. + */ + __asm__ __volatile__("rdpr %%pstate, %0\n\t" + "wrpr %0, %1, %%pstate" + : "=r" (pstate) + : "i" (PSTATE_IE)); + + compare = tick_ops->add_compare(timer_tick_offset); + ticks = tick_ops->get_tick(); + + /* Restore PSTATE_IE. */ + __asm__ __volatile__("wrpr %0, 0x0, %%pstate" + : /* no outputs */ + : "r" (pstate)); + } while (time_after_eq(ticks, compare)); + + timer_check_rtc(); + + write_sequnlock(&xtime_lock); + + return IRQ_HANDLED; +} + +#ifdef CONFIG_SMP +void timer_tick_interrupt(struct pt_regs *regs) +{ + write_seqlock(&xtime_lock); + + do_timer(regs); + + timer_check_rtc(); + + write_sequnlock(&xtime_lock); +} +#endif + +/* Kick start a stopped clock (procedure from the Sun NVRAM/hostid FAQ). */ +static void __init kick_start_clock(void) +{ + void __iomem *regs = mstk48t02_regs; + u8 sec, tmp; + int i, count; + + prom_printf("CLOCK: Clock was stopped. Kick start "); + + spin_lock_irq(&mostek_lock); + + /* Turn on the kick start bit to start the oscillator. */ + tmp = mostek_read(regs + MOSTEK_CREG); + tmp |= MSTK_CREG_WRITE; + mostek_write(regs + MOSTEK_CREG, tmp); + tmp = mostek_read(regs + MOSTEK_SEC); + tmp &= ~MSTK_STOP; + mostek_write(regs + MOSTEK_SEC, tmp); + tmp = mostek_read(regs + MOSTEK_HOUR); + tmp |= MSTK_KICK_START; + mostek_write(regs + MOSTEK_HOUR, tmp); + tmp = mostek_read(regs + MOSTEK_CREG); + tmp &= ~MSTK_CREG_WRITE; + mostek_write(regs + MOSTEK_CREG, tmp); + + spin_unlock_irq(&mostek_lock); + + /* Delay to allow the clock oscillator to start. */ + sec = MSTK_REG_SEC(regs); + for (i = 0; i < 3; i++) { + while (sec == MSTK_REG_SEC(regs)) + for (count = 0; count < 100000; count++) + /* nothing */ ; + prom_printf("."); + sec = MSTK_REG_SEC(regs); + } + prom_printf("\n"); + + spin_lock_irq(&mostek_lock); + + /* Turn off kick start and set a "valid" time and date. */ + tmp = mostek_read(regs + MOSTEK_CREG); + tmp |= MSTK_CREG_WRITE; + mostek_write(regs + MOSTEK_CREG, tmp); + tmp = mostek_read(regs + MOSTEK_HOUR); + tmp &= ~MSTK_KICK_START; + mostek_write(regs + MOSTEK_HOUR, tmp); + MSTK_SET_REG_SEC(regs,0); + MSTK_SET_REG_MIN(regs,0); + MSTK_SET_REG_HOUR(regs,0); + MSTK_SET_REG_DOW(regs,5); + MSTK_SET_REG_DOM(regs,1); + MSTK_SET_REG_MONTH(regs,8); + MSTK_SET_REG_YEAR(regs,1996 - MSTK_YEAR_ZERO); + tmp = mostek_read(regs + MOSTEK_CREG); + tmp &= ~MSTK_CREG_WRITE; + mostek_write(regs + MOSTEK_CREG, tmp); + + spin_unlock_irq(&mostek_lock); + + /* Ensure the kick start bit is off. If it isn't, turn it off. 
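+	 * Each retry below uses the Mostek write-enable protocol seen
+	 * throughout this file: set MSTK_CREG_WRITE in the control
+	 * register, modify the target register, then clear
+	 * MSTK_CREG_WRITE again, i.e.:
+	 *
+	 *	tmp = mostek_read(regs + MOSTEK_CREG);
+	 *	tmp |= MSTK_CREG_WRITE;
+	 *	mostek_write(regs + MOSTEK_CREG, tmp);
+	 *	... modify MOSTEK_HOUR etc. ...
+	 *	tmp &= ~MSTK_CREG_WRITE;
+	 *	mostek_write(regs + MOSTEK_CREG, tmp);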
*/ + while (mostek_read(regs + MOSTEK_HOUR) & MSTK_KICK_START) { + prom_printf("CLOCK: Kick start still on!\n"); + + spin_lock_irq(&mostek_lock); + + tmp = mostek_read(regs + MOSTEK_CREG); + tmp |= MSTK_CREG_WRITE; + mostek_write(regs + MOSTEK_CREG, tmp); + + tmp = mostek_read(regs + MOSTEK_HOUR); + tmp &= ~MSTK_KICK_START; + mostek_write(regs + MOSTEK_HOUR, tmp); + + tmp = mostek_read(regs + MOSTEK_CREG); + tmp &= ~MSTK_CREG_WRITE; + mostek_write(regs + MOSTEK_CREG, tmp); + + spin_unlock_irq(&mostek_lock); + } + + prom_printf("CLOCK: Kick start procedure successful.\n"); +} + +/* Return nonzero if the clock chip battery is low. */ +static int __init has_low_battery(void) +{ + void __iomem *regs = mstk48t02_regs; + u8 data1, data2; + + spin_lock_irq(&mostek_lock); + + data1 = mostek_read(regs + MOSTEK_EEPROM); /* Read some data. */ + mostek_write(regs + MOSTEK_EEPROM, ~data1); /* Write back the complement. */ + data2 = mostek_read(regs + MOSTEK_EEPROM); /* Read back the complement. */ + mostek_write(regs + MOSTEK_EEPROM, data1); /* Restore original value. */ + + spin_unlock_irq(&mostek_lock); + + return (data1 == data2); /* Was the write blocked? */ +} + +/* Probe for the real time clock chip. */ +static void __init set_system_time(void) +{ + unsigned int year, mon, day, hour, min, sec; + void __iomem *mregs = mstk48t02_regs; +#ifdef CONFIG_PCI + unsigned long dregs = ds1287_regs; +#else + unsigned long dregs = 0UL; +#endif + u8 tmp; + + if (!mregs && !dregs) { + prom_printf("Something wrong, clock regs not mapped yet.\n"); + prom_halt(); + } + + if (mregs) { + spin_lock_irq(&mostek_lock); + + /* Traditional Mostek chip. */ + tmp = mostek_read(mregs + MOSTEK_CREG); + tmp |= MSTK_CREG_READ; + mostek_write(mregs + MOSTEK_CREG, tmp); + + sec = MSTK_REG_SEC(mregs); + min = MSTK_REG_MIN(mregs); + hour = MSTK_REG_HOUR(mregs); + day = MSTK_REG_DOM(mregs); + mon = MSTK_REG_MONTH(mregs); + year = MSTK_CVT_YEAR( MSTK_REG_YEAR(mregs) ); + } else { + /* Dallas 12887 RTC chip. */ + + do { + sec = CMOS_READ(RTC_SECONDS); + min = CMOS_READ(RTC_MINUTES); + hour = CMOS_READ(RTC_HOURS); + day = CMOS_READ(RTC_DAY_OF_MONTH); + mon = CMOS_READ(RTC_MONTH); + year = CMOS_READ(RTC_YEAR); + } while (sec != CMOS_READ(RTC_SECONDS)); + + if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { + BCD_TO_BIN(sec); + BCD_TO_BIN(min); + BCD_TO_BIN(hour); + BCD_TO_BIN(day); + BCD_TO_BIN(mon); + BCD_TO_BIN(year); + } + if ((year += 1900) < 1970) + year += 100; + } + + xtime.tv_sec = mktime(year, mon, day, hour, min, sec); + xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + if (mregs) { + tmp = mostek_read(mregs + MOSTEK_CREG); + tmp &= ~MSTK_CREG_READ; + mostek_write(mregs + MOSTEK_CREG, tmp); + + spin_unlock_irq(&mostek_lock); + } +} + +/* davem suggests we keep this within the 4M locked kernel image */ +static u32 starfire_get_time(void) +{ + static char obp_gettod[32]; + static u32 unix_tod; + + sprintf(obp_gettod, "h# %08x unix-gettod", + (unsigned int) (long) &unix_tod); + prom_feval(obp_gettod); + + return unix_tod; +} + +static int starfire_set_time(u32 val) +{ + /* Do nothing, time is set using the service processor + * console on this platform. 
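+	 *
+	 * (For contrast, the sun4v routines below use the fast-trap
+	 * hypercall convention: function number in %o5, arguments in
+	 * %o0/%o1, trap via "ta HV_FAST_TRAP", with the error status
+	 * returned in %o0 and the result in %o1.)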
+ */ + return 0; +} + +static u32 hypervisor_get_time(void) +{ + register unsigned long func asm("%o5"); + register unsigned long arg0 asm("%o0"); + register unsigned long arg1 asm("%o1"); + int retries = 10000; + +retry: + func = HV_FAST_TOD_GET; + arg0 = 0; + arg1 = 0; + __asm__ __volatile__("ta %6" + : "=&r" (func), "=&r" (arg0), "=&r" (arg1) + : "0" (func), "1" (arg0), "2" (arg1), + "i" (HV_FAST_TRAP)); + if (arg0 == HV_EOK) + return arg1; + if (arg0 == HV_EWOULDBLOCK) { + if (--retries > 0) { + udelay(100); + goto retry; + } + printk(KERN_WARNING "SUN4V: tod_get() timed out.\n"); + return 0; + } + printk(KERN_WARNING "SUN4V: tod_get() not supported.\n"); + return 0; +} + +static int hypervisor_set_time(u32 secs) +{ + register unsigned long func asm("%o5"); + register unsigned long arg0 asm("%o0"); + int retries = 10000; + +retry: + func = HV_FAST_TOD_SET; + arg0 = secs; + __asm__ __volatile__("ta %4" + : "=&r" (func), "=&r" (arg0) + : "0" (func), "1" (arg0), + "i" (HV_FAST_TRAP)); + if (arg0 == HV_EOK) + return 0; + if (arg0 == HV_EWOULDBLOCK) { + if (--retries > 0) { + udelay(100); + goto retry; + } + printk(KERN_WARNING "SUN4V: tod_set() timed out.\n"); + return -EAGAIN; + } + printk(KERN_WARNING "SUN4V: tod_set() not supported.\n"); + return -EOPNOTSUPP; +} + +static int __init clock_model_matches(char *model) +{ + if (strcmp(model, "mk48t02") && + strcmp(model, "mk48t08") && + strcmp(model, "mk48t59") && + strcmp(model, "m5819") && + strcmp(model, "m5819p") && + strcmp(model, "m5823") && + strcmp(model, "ds1287")) + return 0; + + return 1; +} + +static int __devinit clock_probe(struct of_device *op, const struct of_device_id *match) +{ + struct device_node *dp = op->node; + char *model = of_get_property(dp, "model", NULL); + unsigned long size, flags; + void __iomem *regs; + + if (!model || !clock_model_matches(model)) + return -ENODEV; + + /* On an Enterprise system there can be multiple mostek clocks. + * We should only match the one that is on the central FHC bus. + */ + if (!strcmp(dp->parent->name, "fhc") && + strcmp(dp->parent->parent->name, "central") != 0) + return -ENODEV; + + size = (op->resource[0].end - op->resource[0].start) + 1; + regs = of_ioremap(&op->resource[0], 0, size, "clock"); + if (!regs) + return -ENOMEM; + +#ifdef CONFIG_PCI + if (!strcmp(model, "ds1287") || + !strcmp(model, "m5819") || + !strcmp(model, "m5819p") || + !strcmp(model, "m5823")) { + ds1287_regs = (unsigned long) regs; + } else +#endif + if (model[5] == '0' && model[6] == '2') { + mstk48t02_regs = regs; + } else if(model[5] == '0' && model[6] == '8') { + mstk48t08_regs = regs; + mstk48t02_regs = mstk48t08_regs + MOSTEK_48T08_48T02; + } else { + mstk48t59_regs = regs; + mstk48t02_regs = mstk48t59_regs + MOSTEK_48T59_48T02; + } + + printk(KERN_INFO "%s: Clock regs at %p\n", dp->full_name, regs); + + local_irq_save(flags); + + if (mstk48t02_regs != NULL) { + /* Report a low battery voltage condition. */ + if (has_low_battery()) + prom_printf("NVRAM: Low battery voltage!\n"); + + /* Kick start the clock if it is completely stopped. 
 */
+	if (mostek_read(mstk48t02_regs + MOSTEK_SEC) & MSTK_STOP)
+		kick_start_clock();
+	}
+
+	set_system_time();
+
+	local_irq_restore(flags);
+
+	return 0;
+}
+
+static struct of_device_id clock_match[] = {
+	{
+		.name = "eeprom",
+	},
+	{
+		.name = "rtc",
+	},
+	{},
+};
+
+static struct of_platform_driver clock_driver = {
+	.name		= "clock",
+	.match_table	= clock_match,
+	.probe		= clock_probe,
+};
+
+static int __init clock_init(void)
+{
+	if (this_is_starfire) {
+		xtime.tv_sec = starfire_get_time();
+		xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
+		set_normalized_timespec(&wall_to_monotonic,
+					-xtime.tv_sec, -xtime.tv_nsec);
+		return 0;
+	}
+	if (tlb_type == hypervisor) {
+		xtime.tv_sec = hypervisor_get_time();
+		xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
+		set_normalized_timespec(&wall_to_monotonic,
+					-xtime.tv_sec, -xtime.tv_nsec);
+		return 0;
+	}
+
+	return of_register_driver(&clock_driver, &of_bus_type);
+}
+
+/* Must be after subsys_initcall() so that buses are probed.  Must
+ * be before device_initcall() because things like the RTC driver
+ * need to see the clock registers.
+ */
+fs_initcall(clock_init);
+
+/* This gets the master TICK_INT timer going. */
+static unsigned long sparc64_init_timers(void)
+{
+	struct device_node *dp;
+	struct property *prop;
+	unsigned long clock;
+#ifdef CONFIG_SMP
+	extern void smp_tick_init(void);
+#endif
+
+	dp = of_find_node_by_path("/");
+	if (tlb_type == spitfire) {
+		unsigned long ver, manuf, impl;
+
+		__asm__ __volatile__ ("rdpr %%ver, %0"
+				      : "=&r" (ver));
+		manuf = ((ver >> 48) & 0xffff);
+		impl = ((ver >> 32) & 0xffff);
+		if (manuf == 0x17 && impl == 0x13) {
+			/* Hummingbird, aka Ultra-IIe */
+			tick_ops = &hbtick_operations;
+			prop = of_find_property(dp, "stick-frequency", NULL);
+		} else {
+			tick_ops = &tick_operations;
+			cpu_find_by_instance(0, &dp, NULL);
+			prop = of_find_property(dp, "clock-frequency", NULL);
+		}
+	} else {
+		tick_ops = &stick_operations;
+		prop = of_find_property(dp, "stick-frequency", NULL);
+	}
+	clock = *(unsigned int *) prop->value;
+	timer_tick_offset = clock / HZ;
+
+#ifdef CONFIG_SMP
+	smp_tick_init();
+#endif
+
+	return clock;
+}
+
+static void sparc64_start_timers(void)
+{
+	unsigned long pstate;
+
+	/* Guarantee that the following sequences execute
+	 * uninterrupted.
+	 */
+	__asm__ __volatile__("rdpr %%pstate, %0\n\t"
+			     "wrpr %0, %1, %%pstate"
+			     : "=r" (pstate)
+			     : "i" (PSTATE_IE));
+
+	tick_ops->init_tick(timer_tick_offset);
+
+	/* Restore PSTATE_IE.
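+	 * The rdpr/wrpr pair used here acts like local_irq_save()/
+	 * local_irq_restore(): wrpr writes the XOR of its two operands
+	 * into %pstate, so XOR-ing the saved value with PSTATE_IE
+	 * clears the interrupt-enable bit, and writing the saved
+	 * value back (XOR with 0) restores it.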
+	 */
+	__asm__ __volatile__("wrpr %0, 0x0, %%pstate"
+			     : /* no outputs */
+			     : "r" (pstate));
+}
+
+struct freq_table {
+	unsigned long clock_tick_ref;
+	unsigned int ref_freq;
+};
+static DEFINE_PER_CPU(struct freq_table, sparc64_freq_table) = { 0, 0 };
+
+unsigned long sparc64_get_clock_tick(unsigned int cpu)
+{
+	struct freq_table *ft = &per_cpu(sparc64_freq_table, cpu);
+
+	if (ft->clock_tick_ref)
+		return ft->clock_tick_ref;
+	return cpu_data(cpu).clock_tick;
+}
+
+#ifdef CONFIG_CPU_FREQ
+
+static int sparc64_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+				    void *data)
+{
+	struct cpufreq_freqs *freq = data;
+	unsigned int cpu = freq->cpu;
+	struct freq_table *ft = &per_cpu(sparc64_freq_table, cpu);
+
+	if (!ft->ref_freq) {
+		ft->ref_freq = freq->old;
+		ft->clock_tick_ref = cpu_data(cpu).clock_tick;
+	}
+	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
+	    (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
+	    (val == CPUFREQ_RESUMECHANGE)) {
+		cpu_data(cpu).clock_tick =
+			cpufreq_scale(ft->clock_tick_ref,
+				      ft->ref_freq,
+				      freq->new);
+	}
+
+	return 0;
+}
+
+static struct notifier_block sparc64_cpufreq_notifier_block = {
+	.notifier_call	= sparc64_cpufreq_notifier
+};
+
+#endif /* CONFIG_CPU_FREQ */
+
+static cycle_t read_itc(void)
+{
+	return (cycle_t) get_cycles();
+}
+
+static struct clocksource clocksource_sparc64_itc = {
+	.name		= "sparc64_itc",
+	.rating		= 300,
+	.read		= read_itc,
+	.mask		= 0xffffffffffffffffULL,
+	.mult		= 0, /* to be calculated */
+	.shift		= 16,
+	.is_continuous	= 1,
+};
+
+
+/* The quotient formula is taken from the IA64 port. */
+#define SPARC64_NSEC_PER_CYC_SHIFT	30UL
+void __init time_init(void)
+{
+	unsigned long clock = sparc64_init_timers();
+
+	clocksource_sparc64_itc.mult = clocksource_hz2mult(clock,
+					clocksource_sparc64_itc.shift);
+	clocksource_register(&clocksource_sparc64_itc);
+
+	/* Now that the clocksource is registered, it is
+	 * safe to start the timer ticking.
+	 */
+	sparc64_start_timers();
+
+	timer_ticks_per_nsec_quotient =
+		(((NSEC_PER_SEC << SPARC64_NSEC_PER_CYC_SHIFT) +
+		  (clock / 2)) / clock);
+
+#ifdef CONFIG_CPU_FREQ
+	cpufreq_register_notifier(&sparc64_cpufreq_notifier_block,
+				  CPUFREQ_TRANSITION_NOTIFIER);
+#endif
+}
+
+unsigned long long sched_clock(void)
+{
+	unsigned long ticks = tick_ops->get_tick();
+
+	return (ticks * timer_ticks_per_nsec_quotient)
+		>> SPARC64_NSEC_PER_CYC_SHIFT;
+}
+
+static int set_rtc_mmss(unsigned long nowtime)
+{
+	int real_seconds, real_minutes, chip_minutes;
+	void __iomem *mregs = mstk48t02_regs;
+#ifdef CONFIG_PCI
+	unsigned long dregs = ds1287_regs;
+#else
+	unsigned long dregs = 0UL;
+#endif
+	unsigned long flags;
+	u8 tmp;
+
+	/*
+	 * Not having a register set can lead to trouble.
+	 * Also starfire doesn't have a tod clock.
+	 */
+	if (!mregs && !dregs)
+		return -1;
+
+	if (mregs) {
+		spin_lock_irqsave(&mostek_lock, flags);
+
+		/* Read the current RTC minutes. */
+		tmp = mostek_read(mregs + MOSTEK_CREG);
+		tmp |= MSTK_CREG_READ;
+		mostek_write(mregs + MOSTEK_CREG, tmp);
+
+		chip_minutes = MSTK_REG_MIN(mregs);
+
+		tmp = mostek_read(mregs + MOSTEK_CREG);
+		tmp &= ~MSTK_CREG_READ;
+		mostek_write(mregs + MOSTEK_CREG, tmp);
+
+		/*
+		 * since we're only adjusting minutes and seconds,
+		 * don't interfere with hour overflow.
This avoids + * messing with unknown time zones but requires your + * RTC not to be off by more than 15 minutes + */ + real_seconds = nowtime % 60; + real_minutes = nowtime / 60; + if (((abs(real_minutes - chip_minutes) + 15)/30) & 1) + real_minutes += 30; /* correct for half hour time zone */ + real_minutes %= 60; + + if (abs(real_minutes - chip_minutes) < 30) { + tmp = mostek_read(mregs + MOSTEK_CREG); + tmp |= MSTK_CREG_WRITE; + mostek_write(mregs + MOSTEK_CREG, tmp); + + MSTK_SET_REG_SEC(mregs,real_seconds); + MSTK_SET_REG_MIN(mregs,real_minutes); + + tmp = mostek_read(mregs + MOSTEK_CREG); + tmp &= ~MSTK_CREG_WRITE; + mostek_write(mregs + MOSTEK_CREG, tmp); + + spin_unlock_irqrestore(&mostek_lock, flags); + + return 0; + } else { + spin_unlock_irqrestore(&mostek_lock, flags); + + return -1; + } + } else { + int retval = 0; + unsigned char save_control, save_freq_select; + + /* Stolen from arch/i386/kernel/time.c, see there for + * credits and descriptive comments. + */ + spin_lock_irqsave(&rtc_lock, flags); + save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */ + CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); + + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */ + CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + + chip_minutes = CMOS_READ(RTC_MINUTES); + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) + BCD_TO_BIN(chip_minutes); + real_seconds = nowtime % 60; + real_minutes = nowtime / 60; + if (((abs(real_minutes - chip_minutes) + 15)/30) & 1) + real_minutes += 30; + real_minutes %= 60; + + if (abs(real_minutes - chip_minutes) < 30) { + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { + BIN_TO_BCD(real_seconds); + BIN_TO_BCD(real_minutes); + } + CMOS_WRITE(real_seconds,RTC_SECONDS); + CMOS_WRITE(real_minutes,RTC_MINUTES); + } else { + printk(KERN_WARNING + "set_rtc_mmss: can't update from %d to %d\n", + chip_minutes, real_minutes); + retval = -1; + } + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + spin_unlock_irqrestore(&rtc_lock, flags); + + return retval; + } +} + +#define RTC_IS_OPEN 0x01 /* means /dev/rtc is in use */ +static unsigned char mini_rtc_status; /* bitmapped status byte. */ + +/* months start at 0 now */ +static unsigned char days_in_mo[] = +{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}; + +#define FEBRUARY 2 +#define STARTOFTIME 1970 +#define SECDAY 86400L +#define SECYR (SECDAY * 365) +#define leapyear(year) ((year) % 4 == 0 && \ + ((year) % 100 != 0 || (year) % 400 == 0)) +#define days_in_year(a) (leapyear(a) ? 366 : 365) +#define days_in_month(a) (month_days[(a) - 1]) + +static int month_days[12] = { + 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 +}; + +/* + * This only works for the Gregorian calendar - i.e. after 1752 (in the UK) + */ +static void GregorianDay(struct rtc_time * tm) +{ + int leapsToDate; + int lastYear; + int day; + int MonthOffset[] = { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 }; + + lastYear = tm->tm_year - 1; + + /* + * Number of leap corrections to apply up to end of last year + */ + leapsToDate = lastYear / 4 - lastYear / 100 + lastYear / 400; + + /* + * This year is a leap year if it is divisible by 4 except when it is + * divisible by 100 unless it is divisible by 400 + * + * e.g. 
1904 was a leap year, 1900 was not, 1996 is, and 2000 was + */ + day = tm->tm_mon > 2 && leapyear(tm->tm_year); + + day += lastYear*365 + leapsToDate + MonthOffset[tm->tm_mon-1] + + tm->tm_mday; + + tm->tm_wday = day % 7; +} + +static void to_tm(int tim, struct rtc_time *tm) +{ + register int i; + register long hms, day; + + day = tim / SECDAY; + hms = tim % SECDAY; + + /* Hours, minutes, seconds are easy */ + tm->tm_hour = hms / 3600; + tm->tm_min = (hms % 3600) / 60; + tm->tm_sec = (hms % 3600) % 60; + + /* Number of years in days */ + for (i = STARTOFTIME; day >= days_in_year(i); i++) + day -= days_in_year(i); + tm->tm_year = i; + + /* Number of months in days left */ + if (leapyear(tm->tm_year)) + days_in_month(FEBRUARY) = 29; + for (i = 1; day >= days_in_month(i); i++) + day -= days_in_month(i); + days_in_month(FEBRUARY) = 28; + tm->tm_mon = i; + + /* Days are what is left over (+1) from all that. */ + tm->tm_mday = day + 1; + + /* + * Determine the day of week + */ + GregorianDay(tm); +} + +/* Both Starfire and SUN4V give us seconds since Jan 1st, 1970, + * aka Unix time. So we have to convert to/from rtc_time. + */ +static inline void mini_get_rtc_time(struct rtc_time *time) +{ + unsigned long flags; + u32 seconds; + + spin_lock_irqsave(&rtc_lock, flags); + seconds = 0; + if (this_is_starfire) + seconds = starfire_get_time(); + else if (tlb_type == hypervisor) + seconds = hypervisor_get_time(); + spin_unlock_irqrestore(&rtc_lock, flags); + + to_tm(seconds, time); + time->tm_year -= 1900; + time->tm_mon -= 1; +} + +static inline int mini_set_rtc_time(struct rtc_time *time) +{ + u32 seconds = mktime(time->tm_year + 1900, time->tm_mon + 1, + time->tm_mday, time->tm_hour, + time->tm_min, time->tm_sec); + unsigned long flags; + int err; + + spin_lock_irqsave(&rtc_lock, flags); + err = -ENODEV; + if (this_is_starfire) + err = starfire_set_time(seconds); + else if (tlb_type == hypervisor) + err = hypervisor_set_time(seconds); + spin_unlock_irqrestore(&rtc_lock, flags); + + return err; +} + +static int mini_rtc_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct rtc_time wtime; + void __user *argp = (void __user *)arg; + + switch (cmd) { + + case RTC_PLL_GET: + return -EINVAL; + + case RTC_PLL_SET: + return -EINVAL; + + case RTC_UIE_OFF: /* disable ints from RTC updates. */ + return 0; + + case RTC_UIE_ON: /* enable ints for RTC updates. */ + return -EINVAL; + + case RTC_RD_TIME: /* Read the time/date from RTC */ + /* this doesn't get week-day, who cares */ + memset(&wtime, 0, sizeof(wtime)); + mini_get_rtc_time(&wtime); + + return copy_to_user(argp, &wtime, sizeof(wtime)) ? 
-EFAULT : 0; + + case RTC_SET_TIME: /* Set the RTC */ + { + int year; + unsigned char leap_yr; + + if (!capable(CAP_SYS_TIME)) + return -EACCES; + + if (copy_from_user(&wtime, argp, sizeof(wtime))) + return -EFAULT; + + year = wtime.tm_year + 1900; + leap_yr = ((!(year % 4) && (year % 100)) || + !(year % 400)); + + if ((wtime.tm_mon < 0 || wtime.tm_mon > 11) || (wtime.tm_mday < 1)) + return -EINVAL; + + if (wtime.tm_mday < 0 || wtime.tm_mday > + (days_in_mo[wtime.tm_mon] + ((wtime.tm_mon == 1) && leap_yr))) + return -EINVAL; + + if (wtime.tm_hour < 0 || wtime.tm_hour >= 24 || + wtime.tm_min < 0 || wtime.tm_min >= 60 || + wtime.tm_sec < 0 || wtime.tm_sec >= 60) + return -EINVAL; + + return mini_set_rtc_time(&wtime); + } + } + + return -EINVAL; +} + +static int mini_rtc_open(struct inode *inode, struct file *file) +{ + if (mini_rtc_status & RTC_IS_OPEN) + return -EBUSY; + + mini_rtc_status |= RTC_IS_OPEN; + + return 0; +} + +static int mini_rtc_release(struct inode *inode, struct file *file) +{ + mini_rtc_status &= ~RTC_IS_OPEN; + return 0; +} + + +static struct file_operations mini_rtc_fops = { + .owner = THIS_MODULE, + .ioctl = mini_rtc_ioctl, + .open = mini_rtc_open, + .release = mini_rtc_release, +}; + +static struct miscdevice rtc_mini_dev = +{ + .minor = RTC_MINOR, + .name = "rtc", + .fops = &mini_rtc_fops, +}; + +static int __init rtc_mini_init(void) +{ + int retval; + + if (tlb_type != hypervisor && !this_is_starfire) + return -ENODEV; + + printk(KERN_INFO "Mini RTC Driver\n"); + + retval = misc_register(&rtc_mini_dev); + if (retval < 0) + return retval; + + return 0; +} + +static void __exit rtc_mini_exit(void) +{ + misc_deregister(&rtc_mini_dev); +} + + +module_init(rtc_mini_init); +module_exit(rtc_mini_exit); diff -urN ./linux-2.6.18.1/arch/v850/Kconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/v850/Kconfig --- ./linux-2.6.18.1/arch/v850/Kconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/v850/Kconfig 2007-05-19 23:58:35.000000000 +0900 @@ -34,6 +34,10 @@ bool default y +config GENERIC_TIME + bool + default y + config TIME_LOW_RES bool default y diff -urN ./linux-2.6.18.1/arch/v850/kernel/time.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/v850/kernel/time.c --- ./linux-2.6.18.1/arch/v850/kernel/time.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/v850/kernel/time.c 2007-05-19 23:58:35.000000000 +0900 @@ -99,81 +99,6 @@ return IRQ_HANDLED; } -/* - * This version of gettimeofday has near microsecond resolution. - */ -void do_gettimeofday (struct timeval *tv) -{ -#if 0 /* DAVIDM later if possible */ - extern volatile unsigned long lost_ticks; - unsigned long lost; -#endif - unsigned long flags; - unsigned long usec, sec; - unsigned long seq; - - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - -#if 0 - usec = mach_gettimeoffset ? mach_gettimeoffset () : 0; -#else - usec = 0; -#endif -#if 0 /* DAVIDM later if possible */ - lost = lost_ticks; - if (lost) - usec += lost * (1000000/HZ); -#endif - sec = xtime.tv_sec; - usec += xtime.tv_nsec / 1000; - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq (&xtime_lock); - - /* This is revolting. We need to set the xtime.tv_nsec - * correctly. 
However, the value in this location is - * is value at the last tick. - * Discover what correction gettimeofday - * would have done, and then undo it! - */ -#if 0 - tv->tv_nsec -= mach_gettimeoffset() * 1000; -#endif - - while (tv->tv_nsec < 0) { - tv->tv_nsec += NSEC_PER_SEC; - tv->tv_sec--; - } - - xtime.tv_sec = tv->tv_sec; - xtime.tv_nsec = tv->tv_nsec; - - ntp_clear(); - - write_sequnlock_irq (&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - static int timer_dev_id; static struct irqaction timer_irqaction = { timer_interrupt, diff -urN ./linux-2.6.18.1/arch/x86_64/Kconfig linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/Kconfig --- ./linux-2.6.18.1/arch/x86_64/Kconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/Kconfig 2007-05-19 23:58:35.000000000 +0900 @@ -24,6 +24,14 @@ bool default y +config GENERIC_TIME + bool + default y + +config GENERIC_TIME_VSYSCALL + bool + default y + config LOCKDEP_SUPPORT bool default y @@ -46,13 +54,6 @@ config SBUS bool -config RWSEM_GENERIC_SPINLOCK - bool - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - config GENERIC_HWEIGHT bool default y @@ -289,6 +290,14 @@ If the system is EM64T, you should say N unless your system is EM64T NUMA. +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + depends on !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT + bool + config K8_NUMA bool "Old style AMD Opteron NUMA detection" depends on NUMA @@ -659,3 +668,6 @@ source "crypto/Kconfig" source "lib/Kconfig" + +source "kernel/time/Kconfig" + diff -urN ./linux-2.6.18.1/arch/x86_64/ia32/ia32entry.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/ia32/ia32entry.S --- ./linux-2.6.18.1/arch/x86_64/ia32/ia32entry.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/ia32/ia32entry.S 2007-05-19 23:58:35.000000000 +0900 @@ -119,7 +119,9 @@ cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -227,7 +229,9 @@ cmpl $IA32_NR_syscalls-1,%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -320,8 +324,10 @@ cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) jmp int_ret_from_sys_call @@ -390,7 +396,7 @@ .section .rodata,"a" .align 8 -ia32_sys_call_table: +ENTRY(ia32_sys_call_table) .quad sys_restart_syscall .quad sys_exit .quad stub32_fork @@ -713,4 +719,7 @@ .quad sys_tee .quad compat_sys_vmsplice .quad compat_sys_move_pages +#ifdef CONFIG_LATENCY_TRACE +.globl ia32_syscall_end +#endif ia32_syscall_end: diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/Makefile linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/Makefile --- ./linux-2.6.18.1/arch/x86_64/kernel/Makefile 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/Makefile 2007-05-19 23:58:35.000000000 +0900 @@ -8,7 +8,7 @@ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o + pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o 
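A minimal sketch of the conversion pattern behind the hunks above (the
identifiers prefixed mytimer_ are placeholders, not part of this patch):
with CONFIG_GENERIC_TIME an architecture drops its private
do_gettimeofday()/do_settimeofday(), as the v850 hunk does, and instead
registers a struct clocksource; the generic timekeeping core then
derives the time of day from the counter, the same pattern the sparc64
and x86-64 clocksources elsewhere in this patch follow:

	/* Hypothetical hardware accessors -- placeholders only: */
	extern unsigned long mytimer_get_counter(void);
	extern unsigned long mytimer_hz;

	static cycle_t mytimer_read(void)
	{
		/* read the free-running hardware counter */
		return (cycle_t) mytimer_get_counter();
	}

	static struct clocksource clocksource_mytimer = {
		.name		= "mytimer",
		.rating		= 200,
		.read		= mytimer_read,
		.mask		= 0xffffffffULL,	/* 32-bit counter */
		.mult		= 0,			/* set at init */
		.shift		= 20,
		.is_continuous	= 1,
	};

	static int __init mytimer_clocksource_init(void)
	{
		/* mult/shift convert cycles to ns: ns = (cyc * mult) >> shift */
		clocksource_mytimer.mult =
			clocksource_hz2mult(mytimer_hz, clocksource_mytimer.shift);
		return clocksource_register(&clocksource_mytimer);
	}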
diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/apic.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/apic.c --- ./linux-2.6.18.1/arch/x86_64/kernel/apic.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/apic.c 2007-05-19 23:58:35.000000000 +0900 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -36,9 +37,9 @@ #include #include #include +#include int apic_verbosity; -int apic_runs_main_timer; int apic_calibrate_pmtmr __initdata; int disable_apic_timer __initdata; @@ -52,6 +53,25 @@ /* Using APIC to generate smp_local_timer_interrupt? */ int using_apic_timer __read_mostly = 0; + +static unsigned int calibration_result; + +static void lapic_next_event(unsigned long delta, struct clock_event *evt); +static void lapic_timer_setup(int mode, struct clock_event *evt); + +static struct clock_event lapic_clockevent = { + .name = "lapic", + .capabilities = CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE +#ifdef CONFIG_SMP + | CLOCK_CAP_UPDATE +#endif + , + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, +}; +static DEFINE_PER_CPU(struct clock_event, lapic_events); + static void apic_pm_activate(void); void enable_NMI_through_LVT0 (void * dummy) @@ -527,8 +547,7 @@ apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); - local_save_flags(flags); - local_irq_disable(); + local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); return 0; @@ -696,13 +715,16 @@ #define APIC_DIVISOR 16 -static void __setup_APIC_LVTT(unsigned int clocks) +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot) { unsigned int lvtt_value, tmp_value, ver; int cpu = smp_processor_id(); ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) lvtt_value |= APIC_LVT_MASKED; @@ -717,48 +739,34 @@ & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | APIC_TDR_DIV_16); - apic_write(APIC_TMICT, clocks/APIC_DIVISOR); + if (!oneshot) + apic_write(APIC_TMICT, clocks/APIC_DIVISOR); } -static void setup_APIC_timer(unsigned int clocks) +static void lapic_next_event(unsigned long delta, struct clock_event *evt) +{ + apic_write(APIC_TMICT, delta); +} + +static void lapic_timer_setup(int mode, struct clock_event *evt) { unsigned long flags; local_irq_save(flags); - - /* wait for irq slice */ - if (vxtime.hpet_address && hpet_use_timer) { - int trigger = hpet_readl(HPET_T0_CMP); - while (hpet_readl(HPET_COUNTER) >= trigger) - /* do nothing */ ; - while (hpet_readl(HPET_COUNTER) < trigger) - /* do nothing */ ; - } else { - int c1, c2; - outb_p(0x00, 0x43); - c2 = inb_p(0x40); - c2 |= inb_p(0x40) << 8; - do { - c1 = c2; - outb_p(0x00, 0x43); - c2 = inb_p(0x40); - c2 |= inb_p(0x40) << 8; - } while (c2 - c1 < 300); - } - __setup_APIC_LVTT(clocks); - /* Turn off PIT interrupt if we use APIC timer as main timer. - Only works with the PM timer right now - TBD fix it for HPET too. 
*/ - if (vxtime.mode == VXTIME_PMTMR && - smp_processor_id() == boot_cpu_id && - apic_runs_main_timer == 1 && - !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) { - stop_timer_interrupt(); - apic_runs_main_timer++; - } + __setup_APIC_LVTT(calibration_result, mode != CLOCK_EVT_PERIODIC); local_irq_restore(flags); } + +static void __devinit setup_APIC_timer(void) +{ + struct clock_event *levt = &__get_cpu_var(lapic_events); + + memcpy(levt, &lapic_clockevent, sizeof(*levt)); + + register_local_clockevent(levt); +} + /* * In this function we calibrate APIC bus clocks to the external * timer. Unfortunately we cannot use jiffies and the timer irq @@ -778,12 +786,13 @@ { int apic, apic_start, tsc, tsc_start; int result; + u64 wallclock_nsecs; /* * Put whatever arbitrary (but long enough) timeout * value into the APIC clock, we just want to get the * counter running for calibration. */ - __setup_APIC_LVTT(1000000000); + __setup_APIC_LVTT(1000000000, 0); apic_start = apic_read(APIC_TMCCT); #ifdef CONFIG_X86_PM_TIMER @@ -791,6 +800,8 @@ pmtimer_wait(5000); /* 5ms wait */ apic = apic_read(APIC_TMCCT); result = (apic_start - apic) * 1000L / 5; + printk("using pmtimer for lapic calibration\n"); + wallclock_nsecs = 5000000; } else #endif { @@ -804,6 +815,8 @@ result = (apic_start - apic) * 1000L * cpu_khz / (tsc - tsc_start); + wallclock_nsecs = ((u64)tsc - (u64)tsc_start) * 1000000 / (u64)cpu_khz; + } printk("result %d\n", result); @@ -811,11 +824,22 @@ printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", result / 1000 / 1000, result / 1000 % 1000); + + + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(apic_start - apic, wallclock_nsecs, 32); + + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + printk("lapic max_delta_ns: %ld\n", lapic_clockevent.max_delta_ns); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + return result * APIC_DIVISOR / HZ; } -static unsigned int calibration_result; - void __init setup_boot_APIC_clock (void) { if (disable_apic_timer) { @@ -832,7 +856,7 @@ /* * Now set up the timer for real. */ - setup_APIC_timer(calibration_result); + setup_APIC_timer(); local_irq_enable(); } @@ -840,7 +864,7 @@ void __cpuinit setup_secondary_APIC_clock(void) { local_irq_disable(); /* FIXME: Do we need this? --RR */ - setup_APIC_timer(calibration_result); + setup_APIC_timer(); local_irq_enable(); } @@ -887,6 +911,13 @@ !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { disable_APIC_timer(); cpu_set(cpu, timer_interrupt_broadcast_ipi_mask); +#ifdef CONFIG_HIGH_RES_TIMERS + printk("Disabling NO_HZ and high resolution timers " + "due to timer broadcasting\n"); + for_each_possible_cpu(cpu) + per_cpu(lapic_events, cpu).capabilities &= + ~CLOCK_CAP_NEXTEVT; +#endif } } EXPORT_SYMBOL(switch_APIC_timer_to_ipi); @@ -945,8 +976,6 @@ #ifdef CONFIG_SMP update_process_times(user_mode(regs)); #endif - if (apic_runs_main_timer > 1 && smp_processor_id() == boot_cpu_id) - main_timer_handler(regs); /* * We take the 'long' return path, and there every subsystem * grabs the appropriate locks (kernel lock/ irq lock). @@ -969,6 +998,8 @@ */ void smp_apic_timer_interrupt(struct pt_regs *regs) { + int cpu = smp_processor_id(); + struct clock_event *evt = &per_cpu(lapic_events, cpu); /* * the NMI deadlock-detector uses this. 
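+	 *
+	 * The tick is now delivered through the per-cpu clock_event:
+	 * the hunk below replaces the direct call to
+	 * smp_local_timer_interrupt() with evt->event_handler(regs),
+	 * so periodic and one-shot (high resolution) modes share a
+	 * single entry point.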
*/ @@ -986,7 +1017,7 @@ */ exit_idle(); irq_enter(); - smp_local_timer_interrupt(regs); + evt->event_handler(regs); irq_exit(); } @@ -1161,26 +1192,11 @@ return 1; } -static __init int setup_apicmaintimer(char *str) -{ - apic_runs_main_timer = 1; - nohpet = 1; - return 1; -} -__setup("apicmaintimer", setup_apicmaintimer); - -static __init int setup_noapicmaintimer(char *str) -{ - apic_runs_main_timer = -1; - return 1; -} -__setup("noapicmaintimer", setup_noapicmaintimer); - static __init int setup_apicpmtimer(char *s) { apic_calibrate_pmtmr = 1; notsc_setup(NULL); - return setup_apicmaintimer(NULL); + return 1; } __setup("apicpmtimer", setup_apicpmtimer); diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/early_printk.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/early_printk.c --- ./linux-2.6.18.1/arch/x86_64/kernel/early_printk.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/early_printk.c 2007-05-19 23:58:35.000000000 +0900 @@ -203,7 +203,7 @@ void early_printk(const char *fmt, ...) { - char buf[512]; + static char buf[512]; int n; va_list ap; diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/entry.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/entry.S --- ./linux-2.6.18.1/arch/x86_64/kernel/entry.S 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/entry.S 2007-05-19 23:58:35.000000000 +0900 @@ -45,6 +45,47 @@ .code64 +#ifdef CONFIG_LATENCY_TRACE + +ENTRY(mcount) + cmpq $0, mcount_enabled + jz out + + push %rbp + mov %rsp,%rbp + + push %r11 + push %r10 + push %r9 + push %r8 + push %rdi + push %rsi + push %rdx + push %rcx + push %rax + + mov 0x0(%rbp),%rax + mov 0x8(%rbp),%rdi + mov 0x8(%rax),%rsi + + call __trace + + pop %rax + pop %rcx + pop %rdx + pop %rsi + pop %rdi + pop %r8 + pop %r9 + pop %r10 + pop %r11 + + pop %rbp +out: + ret + +#endif + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif @@ -221,7 +262,9 @@ cmpq $__NR_syscall_max,%rax ja badsys movq %r10,%rcx + TRACE_SYS_CALL call *sys_call_table(,%rax,8) # XXX: rip relative + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) /* * Syscall return path ending with SYSRET (fast path) @@ -255,8 +298,8 @@ /* edx: work, edi: workmask */ sysret_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz sysret_signal TRACE_IRQS_ON sti pushq %rdi @@ -279,7 +322,7 @@ leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ cli @@ -303,7 +346,9 @@ cmpq $__NR_syscall_max,%rax ja 1f movq %r10,%rcx /* fixup for C */ + TRACE_SYS_CALL call *sys_call_table(,%rax,8) + TRACE_SYS_RET 1: movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ jmp int_ret_from_sys_call @@ -349,8 +394,8 @@ /* First do a reschedule test. 
*/ /* edx: work, edi: workmask */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz int_very_careful TRACE_IRQS_ON sti pushq %rdi @@ -387,7 +432,7 @@ movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 call do_notify_resume -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi int_restore_rest: RESTORE_REST cli @@ -585,8 +630,8 @@ /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz retint_signal TRACE_IRQS_ON sti pushq %rdi @@ -612,7 +657,7 @@ RESTORE_REST cli TRACE_IRQS_OFF - movl $_TIF_NEED_RESCHED,%edi + movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi GET_THREAD_INFO(%rcx) jmp retint_check diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/head64.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/head64.c --- ./linux-2.6.18.1/arch/x86_64/kernel/head64.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/head64.c 2007-05-19 23:58:35.000000000 +0900 @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -74,7 +75,7 @@ boot_cpu_data.x86_mask = eax & 0xf; } -void __init x86_64_start_kernel(char * real_mode_data) +void __init notrace x86_64_start_kernel(char * real_mode_data) { char *s; int i; @@ -99,6 +100,7 @@ cpu_pda(i) = &boot_cpu_pda[i]; pda_init(0); + copy_bootdata(real_mode_data); #ifdef CONFIG_SMP cpu_set(0, cpu_online_map); @@ -120,5 +122,6 @@ panic("Kernel too big for kernel mapping\n"); setup_boot_cpu_data(); + start_kernel(); } diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/hpet.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/hpet.c --- ./linux-2.6.18.1/arch/x86_64/kernel/hpet.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/hpet.c 2007-05-19 23:58:35.000000000 +0900 @@ -0,0 +1,475 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int nohpet __initdata = 0; + +unsigned long hpet_address; +static unsigned long hpet_period; /* fsecs / HPET clock */ +unsigned long hpet_tick; /* HPET clocks / interrupt */ +int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */ + +#define FSEC_PER_TICK (FSEC_PER_SEC / HZ) + +/* + * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing + * it to the HPET timer of known frequency. + */ + +#define TICK_COUNT 100000000 + +unsigned int __init hpet_calibrate_tsc(void) +{ + int tsc_start, hpet_start; + int tsc_now, hpet_now; + unsigned long flags; + + local_irq_save(flags); + local_irq_disable(); + + hpet_start = hpet_readl(HPET_COUNTER); + rdtscl(tsc_start); + + do { + local_irq_disable(); + hpet_now = hpet_readl(HPET_COUNTER); + tsc_now = get_cycles_sync(); + local_irq_restore(flags); + } while ((tsc_now - tsc_start) < TICK_COUNT && + (hpet_now - hpet_start) < TICK_COUNT); + + return (tsc_now - tsc_start) * 1000000000L + / ((hpet_now - hpet_start) * hpet_period / 1000); +} + + + +#ifdef CONFIG_HPET +static __init int late_hpet_init(void) +{ + struct hpet_data hd; + unsigned int ntimer; + + if (!hpet_address) + return 0; + + memset(&hd, 0, sizeof (hd)); + + ntimer = hpet_readl(HPET_ID); + ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; + ntimer++; + + /* + * Register with driver. + * Timer0 and Timer1 is used by platform. 
+ */
+	hd.hd_phys_address = hpet_address;
+	hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
+	hd.hd_nirqs = ntimer;
+	hd.hd_flags = HPET_DATA_PLATFORM;
+	hpet_reserve_timer(&hd, 0);
+#ifdef CONFIG_HPET_EMULATE_RTC
+	hpet_reserve_timer(&hd, 1);
+#endif
+	hd.hd_irq[0] = HPET_LEGACY_8254;
+	hd.hd_irq[1] = HPET_LEGACY_RTC;
+	if (ntimer > 2) {
+		struct hpet *hpet;
+		struct hpet_timer *timer;
+		int i;
+
+		hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE);
+		timer = &hpet->hpet_timers[2];
+		for (i = 2; i < ntimer; timer++, i++)
+			hd.hd_irq[i] = (timer->hpet_config &
+					Tn_INT_ROUTE_CNF_MASK) >>
+				Tn_INT_ROUTE_CNF_SHIFT;
+
+	}
+
+	hpet_alloc(&hd);
+	return 0;
+}
+fs_initcall(late_hpet_init);
+#endif
+
+static int hpet_timer_stop_set_go(unsigned long tick)
+{
+	unsigned int cfg;
+
+/*
+ * Stop the timers and reset the main counter.
+ */
+
+	cfg = hpet_readl(HPET_CFG);
+	cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
+	hpet_writel(cfg, HPET_CFG);
+	hpet_writel(0, HPET_COUNTER);
+	hpet_writel(0, HPET_COUNTER + 4);
+
+/*
+ * Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
+ * and period also hpet_tick.
+ */
+	if (hpet_use_timer) {
+		hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
+			    HPET_TN_32BIT, HPET_T0_CFG);
+		hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */
+		hpet_writel(hpet_tick, HPET_T0_CMP); /* period */
+		cfg |= HPET_CFG_LEGACY;
+	}
+/*
+ * Go!
+ */
+
+	cfg |= HPET_CFG_ENABLE;
+	hpet_writel(cfg, HPET_CFG);
+
+	return 0;
+}
+
+int hpet_arch_init(void)
+{
+	unsigned int id;
+
+	if (!hpet_address)
+		return -1;
+	set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
+	__set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
+
+/*
+ * Read the period, compute tick and quotient.
+ */
+
+	id = hpet_readl(HPET_ID);
+
+	if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER))
+		return -1;
+
+	hpet_period = hpet_readl(HPET_PERIOD);
+	if (hpet_period < 100000 || hpet_period > 100000000)
+		return -1;
+
+	hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period;
+
+	hpet_use_timer = (id & HPET_ID_LEGSUP);
+
+	return hpet_timer_stop_set_go(hpet_tick);
+}
+
+int hpet_reenable(void)
+{
+	return hpet_timer_stop_set_go(hpet_tick);
+}
+
+int hpet_stop(void)
+{
+	return hpet_timer_stop_set_go(0);
+}
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+/* HPET in LegacyReplacement Mode eats up the RTC interrupt line.  When HPET
+ * is enabled, we support RTC interrupt functionality in software.
+ * RTC has 3 kinds of interrupts:
+ * 1) Update Interrupt - generate an interrupt, every second, when the
+ *    RTC clock is updated
+ * 2) Alarm Interrupt - generate an interrupt at a specific time of day
+ * 3) Periodic Interrupt - generate a periodic interrupt, with frequencies
+ *    2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
+ * (1) and (2) above are implemented using polling at a frequency of
+ * 64 Hz.  The exact frequency is a tradeoff between accuracy and interrupt
+ * overhead.  (DEFAULT_RTC_INT_FREQ)
+ * For (3), we use interrupts at 64 Hz or the user-specified periodic
+ * frequency, whichever is higher.
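+ *
+ * The comparator arithmetic used by the code below follows from
+ * hpet_tick being the number of HPET clocks per kernel tick (1/HZ s):
+ *
+ *	hpet_tick * HZ                      = HPET clocks per second
+ *	hpet_tick * HZ / hpet_rtc_int_freq  = HPET clocks per RTC interrupt
+ *
+ * which is exactly the value added to the timer 1 comparator on each
+ * reinit.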
+ */
+#include
+
+#define DEFAULT_RTC_INT_FREQ	64
+#define RTC_NUM_INTS		1
+
+static unsigned long UIE_on;
+static unsigned long prev_update_sec;
+
+static unsigned long AIE_on;
+static struct rtc_time alarm_time;
+
+static unsigned long PIE_on;
+static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
+static unsigned long PIE_count;
+
+static unsigned long hpet_rtc_int_freq;	/* RTC interrupt frequency */
+static unsigned int hpet_t1_cmp;	/* cached comparator register */
+
+int is_hpet_enabled(void)
+{
+	return hpet_address != 0;
+}
+
+/*
+ * Timer 1 for RTC, we do not use periodic interrupt feature,
+ * even if HPET supports periodic interrupts on Timer 1.
+ * The reason being, to set up a periodic interrupt in HPET, we need to
+ * stop the main counter.  And if we do that every time someone
+ * disables/enables RTC, we will have an adverse effect on the main
+ * kernel timer running on Timer 0.
+ * So, for the time being, simulate the periodic interrupt in software.
+ *
+ * hpet_rtc_timer_init() is called for the first time and during subsequent
+ * interrupts reinit happens through hpet_rtc_timer_reinit().
+ */
+int hpet_rtc_timer_init(void)
+{
+	unsigned int cfg, cnt;
+	unsigned long flags;
+
+	if (!is_hpet_enabled())
+		return 0;
+	/*
+	 * Set the counter 1 and enable the interrupts.
+	 */
+	if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
+		hpet_rtc_int_freq = PIE_freq;
+	else
+		hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
+
+	local_irq_save(flags);
+	cnt = hpet_readl(HPET_COUNTER);
+	cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
+	hpet_writel(cnt, HPET_T1_CMP);
+	hpet_t1_cmp = cnt;
+	local_irq_restore(flags);
+
+	cfg = hpet_readl(HPET_T1_CFG);
+	cfg &= ~HPET_TN_PERIODIC;
+	cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+	hpet_writel(cfg, HPET_T1_CFG);
+
+	return 1;
+}
+
+static void hpet_rtc_timer_reinit(void)
+{
+	unsigned int cfg, cnt;
+
+	if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
+		cfg = hpet_readl(HPET_T1_CFG);
+		cfg &= ~HPET_TN_ENABLE;
+		hpet_writel(cfg, HPET_T1_CFG);
+		return;
+	}
+
+	if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
+		hpet_rtc_int_freq = PIE_freq;
+	else
+		hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
+
+	/* It is more accurate to use the comparator value than current count. */
+	cnt = hpet_t1_cmp;
+	cnt += hpet_tick*HZ/hpet_rtc_int_freq;
+	hpet_writel(cnt, HPET_T1_CMP);
+	hpet_t1_cmp = cnt;
+}
+
+/*
+ * The functions below are called from the rtc driver.
+ * Return 0 if HPET is not being used.
+ * Otherwise do the necessary changes and return 1.
+ */
+int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
+{
+	if (!is_hpet_enabled())
+		return 0;
+
+	if (bit_mask & RTC_UIE)
+		UIE_on = 0;
+	if (bit_mask & RTC_PIE)
+		PIE_on = 0;
+	if (bit_mask & RTC_AIE)
+		AIE_on = 0;
+
+	return 1;
+}
+
+int hpet_set_rtc_irq_bit(unsigned long bit_mask)
+{
+	int timer_init_reqd = 0;
+
+	if (!is_hpet_enabled())
+		return 0;
+
+	if (!(PIE_on | AIE_on | UIE_on))
+		timer_init_reqd = 1;
+
+	if (bit_mask & RTC_UIE) {
+		UIE_on = 1;
+	}
+	if (bit_mask & RTC_PIE) {
+		PIE_on = 1;
+		PIE_count = 0;
+	}
+	if (bit_mask & RTC_AIE) {
+		AIE_on = 1;
+	}
+
+	if (timer_init_reqd)
+		hpet_rtc_timer_init();
+
+	return 1;
+}
+
+int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
+{
+	if (!is_hpet_enabled())
+		return 0;
+
+	alarm_time.tm_hour = hrs;
+	alarm_time.tm_min = min;
+	alarm_time.tm_sec = sec;
+
+	return 1;
+}
+
+int hpet_set_periodic_freq(unsigned long freq)
+{
+	if (!is_hpet_enabled())
+		return 0;
+
+	PIE_freq = freq;
+	PIE_count = 0;
+
+	return 1;
+}
+
+int hpet_rtc_dropped_irq(void)
+{
+	if (!is_hpet_enabled())
+		return 0;
+
+	return 1;
+}
+
+irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+	struct rtc_time curr_time;
+	unsigned long rtc_int_flag = 0;
+	int call_rtc_interrupt = 0;
+
+	hpet_rtc_timer_reinit();
+
+	if (UIE_on | AIE_on) {
+		rtc_get_rtc_time(&curr_time);
+	}
+	if (UIE_on) {
+		if (curr_time.tm_sec != prev_update_sec) {
+			/* Set update int info, call real rtc int routine */
+			call_rtc_interrupt = 1;
+			rtc_int_flag = RTC_UF;
+			prev_update_sec = curr_time.tm_sec;
+		}
+	}
+	if (PIE_on) {
+		PIE_count++;
+		if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
+			/* Set periodic int info, call real rtc int routine */
+			call_rtc_interrupt = 1;
+			rtc_int_flag |= RTC_PF;
+			PIE_count = 0;
+		}
+	}
+	if (AIE_on) {
+		if ((curr_time.tm_sec == alarm_time.tm_sec) &&
+		    (curr_time.tm_min == alarm_time.tm_min) &&
+		    (curr_time.tm_hour == alarm_time.tm_hour)) {
+			/* Set alarm int info, call real rtc int routine */
+			call_rtc_interrupt = 1;
+			rtc_int_flag |= RTC_AF;
+		}
+	}
+	if (call_rtc_interrupt) {
+		rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
+		rtc_interrupt(rtc_int_flag, dev_id, regs);
+	}
+	return IRQ_HANDLED;
+}
+#endif
+
+static int __init nohpet_setup(char *s)
+{
+	nohpet = 1;
+	return 1;
+}
+
+__setup("nohpet", nohpet_setup);
+
+#define HPET_MASK	0xFFFFFFFF
+#define HPET_SHIFT	22
+
+/* FSEC = 10^-15 NSEC = 10^-9 */
+#define FSEC_PER_NSEC	1000000
+
+static void *hpet_ptr;
+
+static cycle_t read_hpet(void)
+{
+	return (cycle_t)readl(hpet_ptr);
+}
+
+static cycle_t __vsyscall_fn vread_hpet(void)
+{
+	return (cycle_t)readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
+}
+
+struct clocksource clocksource_hpet = {
+	.name		= "hpet",
+	.rating		= 250,
+	.read		= read_hpet,
+	.mask		= (cycle_t)HPET_MASK,
+	.mult		= 0, /* set below */
+	.shift		= HPET_SHIFT,
+	.is_continuous	= 1,
+	.vread		= vread_hpet,
+};
+
+static int __init init_hpet_clocksource(void)
+{
+	unsigned long hpet_period;
+	void __iomem *hpet_base;
+	u64 tmp;
+
+	if (!hpet_address)
+		return -ENODEV;
+
+	/* calculate the hpet address: */
+	hpet_base =
+		(void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+	hpet_ptr = hpet_base + HPET_COUNTER;
+
+	/* calculate the frequency: */
+	hpet_period = readl(hpet_base + HPET_PERIOD);
+
+	/*
+	 * hpet period is in femtoseconds per cycle
+	 * so we need to convert this to ns/cyc units
+	 * approximated by mult/2^shift
+	 *
+	 * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
+	 * fsec/cyc * 1ns/1000000fsec * 2^shift
= mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult + * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + + return clocksource_register(&clocksource_hpet); +} + +module_init(init_hpet_clocksource); diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/i8259.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/i8259.c --- ./linux-2.6.18.1/arch/x86_64/kernel/i8259.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/i8259.c 2007-05-19 23:58:35.000000000 +0900 @@ -43,17 +43,10 @@ BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ BI(x,c) BI(x,d) BI(x,e) BI(x,f) -#define BUILD_15_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) - /* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: * (these are usually mapped to vectors 0x20-0x2f) */ -BUILD_16_IRQS(0x0) #ifdef CONFIG_X86_LOCAL_APIC /* @@ -66,19 +59,14 @@ * * (these are usually mapped into the 0x30-0xff vector range) */ - BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) + BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) -BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) - -#ifdef CONFIG_PCI_MSI - BUILD_15_IRQS(0xe) -#endif +BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) #endif #undef BUILD_16_IRQS -#undef BUILD_15_IRQS #undef BI @@ -91,26 +79,11 @@ IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) -#define IRQLIST_15(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e) - void (*interrupt[NR_IRQS])(void) = { - IRQLIST_16(0x0), - -#ifdef CONFIG_X86_IO_APIC - IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3), + IRQLIST_16(0x2), IRQLIST_16(0x3), IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd) - -#ifdef CONFIG_PCI_MSI - , IRQLIST_15(0xe) -#endif - -#endif + IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) }; #undef IRQ @@ -126,46 +99,21 @@ * moves to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); - -static void end_8259A_irq (unsigned int irq) -{ - if (irq > 256) { - char var; - printk("return %p stack %p ti %p\n", __builtin_return_address(0), &var, task_thread_info(current)); - - BUG(); - } - - if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) && - irq_desc[irq].action) - enable_8259A_irq(irq); -} - -#define shutdown_8259A_irq disable_8259A_irq - static void mask_and_ack_8259A(unsigned int); -static unsigned int startup_8259A_irq(unsigned int irq) -{ - enable_8259A_irq(irq); - return 0; /* never anything pending */ -} - -static struct hw_interrupt_type i8259A_irq_type = { - .typename = "XT-PIC", - .startup = startup_8259A_irq, - .shutdown = shutdown_8259A_irq, - .enable = enable_8259A_irq, - .disable = disable_8259A_irq, - .ack = mask_and_ack_8259A, - .end = end_8259A_irq, +static struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .mask = disable_8259A_irq, + .unmask = enable_8259A_irq, + .mask_ack = mask_and_ack_8259A, }; /* * 8259A PIC functions to handle ISA devices: */ 
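+/*
+ * The 8259A is driven through the generic irq_chip model here: the
+ * chip supplies only mask/unmask/mask_ack, and interrupt flow (what
+ * used to be the per-chip ->startup/->enable/->end callbacks of
+ * hw_interrupt_type) is handled by generic flow handlers such as
+ * handle_level_irq/handle_edge_irq used elsewhere in this patch.
+ */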
+DEFINE_RAW_SPINLOCK(i8259A_lock); + /* * This contains the irq mask for both 8259A irq controllers, */ @@ -234,7 +182,7 @@ { disable_irq_nosync(irq); io_apic_irqs &= ~(1<= NR_IRQS) - break; if (vector != IA32_SYSCALL_VECTOR) set_intr_gate(vector, interrupt[i]); } @@ -557,7 +525,7 @@ * IRQ0 must be given a fixed assignment and initialized, * because it's used before the IO-APIC is set up. */ - set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); + __get_cpu_var(vector_irq)[FIRST_DEVICE_VECTOR] = 0; /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/io_apic.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/io_apic.c --- ./linux-2.6.18.1/arch/x86_64/kernel/io_apic.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/io_apic.c 2007-05-19 23:58:35.000000000 +0900 @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,9 @@ #include #include #include +#include + +static int assign_irq_vector(int irq, cpumask_t mask); #define __apicdebuginit __init @@ -55,8 +59,8 @@ /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -81,14 +85,6 @@ short apic, pin, next; } irq_2_pin[PIN_MAP_SIZE]; -int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; -#ifdef CONFIG_PCI_MSI -#define vector_to_irq(vector) \ - (platform_legacy_irq(vector) ? vector : vector_irq[vector]) -#else -#define vector_to_irq(vector) (vector) -#endif - #define __DO_ACTION(R, ACTION, FINAL) \ \ { \ @@ -104,6 +100,9 @@ reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ reg ACTION; \ io_apic_modify(entry->apic, reg); \ + /* Force POST flush by reading: */ \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + \ if (!entry->next) \ break; \ entry = irq_2_pin + entry->next; \ @@ -112,11 +111,35 @@ } #ifdef CONFIG_SMP +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + int apic, pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + BUG_ON(irq >= NR_IRQS); + for (;;) { + unsigned int reg; + apic = entry->apic; + pin = entry->pin; + if (pin == -1) + break; + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~0x000000ff; + reg |= vector; + io_apic_modify(apic, reg); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { unsigned long flags; unsigned int dest; cpumask_t tmp; + int vector; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -124,7 +147,13 @@ cpus_and(mask, tmp, CPU_MASK_ALL); - dest = cpu_mask_to_apicid(mask); + vector = assign_irq_vector(irq, mask); + if (vector < 0) + return; + + cpus_clear(tmp); + cpu_set(vector >> 8, tmp); + dest = cpu_mask_to_apicid(tmp); /* * Only the high 8 bits are valid. @@ -132,14 +161,12 @@ dest = SET_APIC_LOGICAL_ID(dest); spin_lock_irqsave(&ioapic_lock, flags); - __DO_ACTION(1, = dest, ) - set_irq_info(irq, mask); + __target_IO_APIC_irq(irq, dest, vector & 0xff); + set_native_irq_info(irq, mask); spin_unlock_irqrestore(&ioapic_lock, flags); } #endif -static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF }; - /* * The common case is 1:1 IRQ<->pin mappings. 
Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super @@ -170,10 +197,8 @@ static void name##_IO_APIC_irq (unsigned int irq) \ __DO_ACTION(R, ACTION, FINAL) -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ +DO_ACTION( __mask, 0, |= 0x00010000, ) /* mask = 1 */ +DO_ACTION( __unmask, 0, &= 0xfffeffff, ) /* mask = 0 */ static void mask_IO_APIC_irq (unsigned int irq) { @@ -695,64 +720,6 @@ return MPBIOS_trigger(idx); } -static int next_irq = 16; - -/* - * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ - * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number - * from ACPI, which can reach 800 in large boxen. - * - * Compact the sparse GSI space into a sequential IRQ series and reuse - * vectors if possible. - */ -int gsi_irq_sharing(int gsi) -{ - int i, tries, vector; - - BUG_ON(gsi >= NR_IRQ_VECTORS); - - if (platform_legacy_irq(gsi)) - return gsi; - - if (gsi_2_irq[gsi] != 0xFF) - return (int)gsi_2_irq[gsi]; - - tries = NR_IRQS; - try_again: - vector = assign_irq_vector(gsi); - - /* - * Sharing vectors means sharing IRQs, so scan irq_vectors for previous - * use of vector and if found, return that IRQ. However, we never want - * to share legacy IRQs, which usually have a different trigger mode - * than PCI. - */ - for (i = 0; i < NR_IRQS; i++) - if (IO_APIC_VECTOR(i) == vector) - break; - if (platform_legacy_irq(i)) { - if (--tries >= 0) { - IO_APIC_VECTOR(i) = 0; - goto try_again; - } - panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi); - } - if (i < NR_IRQS) { - gsi_2_irq[gsi] = i; - printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n", - gsi, vector, i); - return i; - } - - i = next_irq++; - BUG_ON(i >= NR_IRQS); - gsi_2_irq[gsi] = i; - IO_APIC_VECTOR(i) = vector; - printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n", - gsi, vector, i); - return i; -} - static int pin_2_irq(int idx, int apic, int pin) { int irq, i; @@ -782,7 +749,6 @@ while (i < apic) irq += nr_ioapic_registers[i++]; irq += pin; - irq = gsi_irq_sharing(irq); break; } default: @@ -830,46 +796,83 @@ } /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; +unsigned int irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_EXTERNAL_VECTOR, 0 }; -int assign_irq_vector(int irq) +static int __assign_irq_vector(int irq, cpumask_t mask) { - static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; - unsigned long flags; - int vector; - - BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); - - spin_lock_irqsave(&vector_lock, flags); - - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { - spin_unlock_irqrestore(&vector_lock, flags); - return IO_APIC_VECTOR(irq); - } + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + static struct { + int vector; + int offset; + } pos[NR_CPUS] = { [ 0 ... 
NR_CPUS - 1] = {FIRST_DEVICE_VECTOR, 0} }; + int old_vector = -1; + int cpu; + + BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); + + if (IO_APIC_VECTOR(irq) > 0) + old_vector = IO_APIC_VECTOR(irq); + if ((old_vector > 0) && cpu_isset(old_vector >> 8, mask)) { + return old_vector; + } + + for_each_cpu_mask(cpu, mask) { + int vector, offset; + vector = pos[cpu].vector; + offset = pos[cpu].offset; next: - current_vector += 8; - if (current_vector == IA32_SYSCALL_VECTOR) - goto next; - - if (current_vector >= FIRST_SYSTEM_VECTOR) { - /* If we run out of vectors on large boxen, must share them. */ - offset = (offset + 1) % 8; - current_vector = FIRST_DEVICE_VECTOR + offset; + vector += 8; + if (vector >= FIRST_SYSTEM_VECTOR) { + /* If we run out of vectors on large boxen, must share them. */ + offset = (offset + 1) % 8; + vector = FIRST_DEVICE_VECTOR + offset; + } + if (unlikely(pos[cpu].vector == vector)) + continue; + if (vector == IA32_SYSCALL_VECTOR) + goto next; + if (per_cpu(vector_irq, cpu)[vector] != -1) + goto next; + /* Found one! */ + pos[cpu].vector = vector; + pos[cpu].offset = offset; + if (old_vector >= 0) { + int old_cpu = old_vector >> 8; + old_vector &= 0xff; + per_cpu(vector_irq, old_cpu)[old_vector] = -1; + } + per_cpu(vector_irq, cpu)[vector] = irq; + vector |= cpu << 8; + IO_APIC_VECTOR(irq) = vector; + return vector; } + return -ENOSPC; +} - vector = current_vector; - vector_irq[vector] = irq; - if (irq != AUTO_ASSIGN) - IO_APIC_VECTOR(irq) = vector; +static int assign_irq_vector(int irq, cpumask_t mask) +{ + int vector; + unsigned long flags; + spin_lock_irqsave(&vector_lock, flags); + vector = __assign_irq_vector(irq, mask); spin_unlock_irqrestore(&vector_lock, flags); - return vector; } extern void (*interrupt[NR_IRQS])(void); -static struct hw_interrupt_type ioapic_level_type; -static struct hw_interrupt_type ioapic_edge_type; + +static struct irq_chip ioapic_chip; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -877,16 +880,16 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger) { - unsigned idx; - - idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq; - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[idx].chip = &ioapic_level_type; - else - irq_desc[idx].chip = &ioapic_edge_type; - set_intr_gate(vector, interrupt[idx]); + trigger == IOAPIC_LEVEL) { +#ifdef CONFIG_PREEMPT_HARDIRQS + set_irq_chip_and_handler(irq, &ioapic_chip, handle_level_irq); +#else + set_irq_chip_and_handler(irq, &ioapic_chip, handle_fasteoi_irq); +#endif + } else { + set_irq_chip_and_handler(irq, &ioapic_chip, handle_edge_irq); + } } static void __init setup_IO_APIC_irqs(void) @@ -936,8 +939,15 @@ continue; if (IO_APIC_IRQ(irq)) { - vector = assign_irq_vector(irq); - entry.vector = vector; + cpumask_t mask; + vector = assign_irq_vector(irq, TARGET_CPUS); + if (vector < 0) + continue; + + cpus_clear(mask); + cpu_set(vector >> 8, mask); + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); + entry.vector = vector & 0xff; ioapic_register_intr(irq, vector, IOAPIC_AUTO); if (!apic && (irq < 16)) @@ -987,7 +997,7 @@ * The timer IRQ doesn't have to know that behind the * scene we have a 8259A-master in AEOI mode ... 
*/ - irq_desc[0].chip = &ioapic_edge_type; + set_irq_chip_and_handler(0, &ioapic_chip, handle_edge_irq); /* * Add it to the IO-APIC irq-routing table: @@ -1106,17 +1116,12 @@ ); } } - if (use_pci_vector()) - printk(KERN_INFO "Using vector-based indexing\n"); printk(KERN_DEBUG "IRQ to pin mappings:\n"); for (i = 0; i < NR_IRQS; i++) { struct irq_pin_list *entry = irq_2_pin + i; if (entry->pin < 0) continue; - if (use_pci_vector() && !platform_legacy_irq(i)) - printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); - else - printk(KERN_DEBUG "IRQ%d ", i); + printk(KERN_DEBUG "IRQ%d ", i); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) @@ -1502,7 +1507,7 @@ * an edge even if it isn't on the 8259A... */ -static unsigned int startup_edge_ioapic_irq(unsigned int irq) +static unsigned int startup_ioapic_irq(unsigned int irq) { int was_pending = 0; unsigned long flags; @@ -1519,107 +1524,16 @@ return was_pending; } -/* - * Once we have recorded IRQ_PENDING already, we can mask the - * interrupt for real. This prevents IRQ storms from unhandled - * devices. - */ -static void ack_edge_ioapic_irq(unsigned int irq) -{ - move_irq(irq); - if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) - == (IRQ_PENDING | IRQ_DISABLED)) - mask_IO_APIC_irq(irq); - ack_APIC_irq(); -} - -/* - * Level triggered interrupts can just be masked, - * and shutting down and starting up the interrupt - * is the same as enabling and disabling them -- except - * with a startup need to return a "was pending" value. - * - * Level triggered interrupts are special because we - * do not touch any IO-APIC register while handling - * them. We ack the APIC in the end-IRQ handler, not - * in the start-IRQ-handler. Protection against reentrance - * from the same interrupt is still provided, both by the - * generic IRQ layer and by the fact that an unacked local - * APIC does not accept IRQs. 
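Throughout these hunks the patch encodes a vector allocation as (cpu << 8) | vector and recovers the pieces with >> 8 and & 0xff, as in cpu_set(vector >> 8, mask) and entry.vector = vector & 0xff above. A small sketch of that packing, with hypothetical helper names:

#include <stdio.h>

/* Hypothetical helpers mirroring the (cpu << 8 | vector) convention the
 * patch threads through assign_irq_vector() and its callers. */
static int pack(int cpu, int vector) { return (cpu << 8) | vector; }
static int vec_of(int packed) { return packed & 0xff; }
static int cpu_of(int packed) { return packed >> 8; }

int main(void)
{
	int v = pack(3, 0x31);	/* vector 0x31 allocated on CPU 3 */

	printf("packed=0x%x -> cpu=%d vector=0x%x\n",
	       v, cpu_of(v), vec_of(v));
	return 0;
}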
- */ -static unsigned int startup_level_ioapic_irq (unsigned int irq) -{ - unmask_IO_APIC_irq(irq); - - return 0; /* don't check for pending */ -} - -static void end_level_ioapic_irq (unsigned int irq) -{ - move_irq(irq); - ack_APIC_irq(); -} - -#ifdef CONFIG_PCI_MSI -static unsigned int startup_edge_ioapic_vector(unsigned int vector) +static int ioapic_retrigger_irq(unsigned int irq) { - int irq = vector_to_irq(vector); + cpumask_t mask; + unsigned vector; - return startup_edge_ioapic_irq(irq); -} + vector = irq_vector[irq]; + cpus_clear(mask); + cpu_set(vector >> 8, mask); -static void ack_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - move_native_irq(vector); - ack_edge_ioapic_irq(irq); -} - -static unsigned int startup_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_level_ioapic_irq (irq); -} - -static void end_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - move_native_irq(vector); - end_level_ioapic_irq(irq); -} - -static void mask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - mask_IO_APIC_irq(irq); -} - -static void unmask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - unmask_IO_APIC_irq(irq); -} - -#ifdef CONFIG_SMP -static void set_ioapic_affinity_vector (unsigned int vector, - cpumask_t cpu_mask) -{ - int irq = vector_to_irq(vector); - - set_native_irq_info(vector, cpu_mask); - set_ioapic_affinity_irq(irq, cpu_mask); -} -#endif // CONFIG_SMP -#endif // CONFIG_PCI_MSI - -static int ioapic_retrigger(unsigned int irq) -{ - send_IPI_self(IO_APIC_VECTOR(irq)); + send_IPI_mask(mask, vector & 0xff); return 1; } @@ -1633,32 +1547,47 @@ * races. */ -static struct hw_interrupt_type ioapic_edge_type __read_mostly = { - .typename = "IO-APIC-edge", - .startup = startup_edge_ioapic, - .shutdown = shutdown_edge_ioapic, - .enable = enable_edge_ioapic, - .disable = disable_edge_ioapic, - .ack = ack_edge_ioapic, - .end = end_edge_ioapic, -#ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity, +static void ack_apic_edge(unsigned int irq) +{ + move_native_irq(irq); + ack_APIC_irq(); +} + +static void ack_apic_level(unsigned int irq) +{ + int do_unmask_irq = 0; + +#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) + /* If we are moving the irq we need to mask it */ + if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { + do_unmask_irq = 1; + mask_IO_APIC_irq(irq); + } #endif - .retrigger = ioapic_retrigger, -}; -static struct hw_interrupt_type ioapic_level_type __read_mostly = { - .typename = "IO-APIC-level", - .startup = startup_level_ioapic, - .shutdown = shutdown_level_ioapic, - .enable = enable_level_ioapic, - .disable = disable_level_ioapic, - .ack = mask_and_ack_level_ioapic, - .end = end_level_ioapic, + /* + * We must acknowledge the irq before we move it or the acknowledge will + * not propagate properly.
+ */ + ack_APIC_irq(); + + /* Now we can move and re-enable the irq */ + move_masked_irq(irq); + if (unlikely(do_unmask_irq)) + unmask_IO_APIC_irq(irq); +} + +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity, + .set_affinity = set_ioapic_affinity_irq, #endif - .retrigger = ioapic_retrigger, + .retrigger = ioapic_retrigger_irq, }; static inline void init_IO_APIC_traps(void) @@ -1678,11 -1607,6 @@ */ for (irq = 0; irq < NR_IRQS ; irq++) { int tmp = irq; - if (use_pci_vector()) { - if (!platform_legacy_irq(tmp)) - if ((tmp = vector_to_irq(tmp)) == -1) - continue; - } if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { /* * Hmm.. We don't have an entry for this, @@ -1693,7 +1617,7 @@ make_8259A_irq(irq); else /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_type; + irq_desc[irq].chip = &no_irq_chip; } } } @@ -1812,8 +1736,6 @@ spin_unlock_irqrestore(&ioapic_lock, flags); } -int timer_uses_ioapic_pin_0; - /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -1831,8 +1753,7 @@ * get/set the timer IRQ vector: */ disable_8259A_irq(0); - vector = assign_irq_vector(0); - set_intr_gate(vector, interrupt[0]); + vector = assign_irq_vector(0, TARGET_CPUS); /* * Subtle, code in do_timer_interrupt() expects an AEOI @@ -1851,9 +1772,6 @@ pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - if (pin1 == 0) - timer_uses_ioapic_pin_0 = 1; - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", vector, apic1, pin1, apic2, pin2); @@ -2069,6 +1987,124 @@ device_initcall(ioapic_init_sysfs); +/* + * Dynamic irq allocation and deallocation + */ +int create_irq(void) +{ + /* Allocate an unused irq */ + int irq; + int new; + int vector = 0; + unsigned long flags; + + irq = -ENOSPC; + spin_lock_irqsave(&vector_lock, flags); + for (new = (NR_IRQS - 1); new >= 0; new--) { + if (platform_legacy_irq(new)) + continue; + if (irq_vector[new] != 0) + continue; + vector = __assign_irq_vector(new, TARGET_CPUS); + if (likely(vector > 0)) + irq = new; + break; + } + spin_unlock_irqrestore(&vector_lock, flags); + + if (irq >= 0) { + dynamic_irq_init(irq); + } + return irq; +} + +void destroy_irq(unsigned int irq) +{ + unsigned long flags; + + dynamic_irq_cleanup(irq); + + spin_lock_irqsave(&vector_lock, flags); + irq_vector[irq] = 0; + spin_unlock_irqrestore(&vector_lock, flags); +} + +/* + * MSI message composition + */ +#ifdef CONFIG_PCI_MSI +static int msi_msg_setup(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +{ + /* For now this code always uses physical delivery + * mode. + */ + int vector; + unsigned dest; + + vector = assign_irq_vector(irq, TARGET_CPUS); + if (vector >= 0) { + cpumask_t tmp; + + cpus_clear(tmp); + cpu_set(vector >> 8, tmp); + dest = cpu_mask_to_apicid(tmp); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = + MSI_ADDR_BASE_LO | + ((INT_DEST_MODE == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(vector); + } + return vector; +} + +static void msi_msg_teardown(unsigned int irq) +{ + return; +} + +static void msi_msg_set_affinity(unsigned int irq, cpumask_t mask, struct msi_msg *msg) +{ + int vector; + unsigned dest; + + vector = assign_irq_vector(irq, mask); + if (vector > 0) { + cpumask_t tmp; + + cpus_clear(tmp); + cpu_set(vector >> 8, tmp); + dest = cpu_mask_to_apicid(tmp); + + msg->data &= ~MSI_DATA_VECTOR_MASK; + msg->data |= MSI_DATA_VECTOR(vector); + msg->address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg->address_lo |= MSI_ADDR_DEST_ID(dest); + } +} + +struct msi_ops arch_msi_ops = { + .needs_64bit_address = 0, + .setup = msi_msg_setup, + .teardown = msi_msg_teardown, + .target = msi_msg_set_affinity, +}; + +#endif + /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ @@ -2107,6 +2143,8 @@ { struct IO_APIC_route_entry entry; unsigned long flags; + int vector; + cpumask_t mask; if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", @@ -2115,6 +2153,20 @@ } /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= 16) + add_pin_to_irq(irq, ioapic, pin); + + + vector = assign_irq_vector(irq, TARGET_CPUS); + if (vector < 0) + return vector; + + cpus_clear(mask); + cpu_set(vector >> 8, mask); + + /* * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. * Note that we mask (disable) IRQs now -- these get enabled when the * corresponding device driver registers for this IRQ. @@ -2124,19 +2176,11 @@ entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); entry.trigger = triggering; entry.polarity = polarity; entry.mask = 1; /* Disabled (masked) */ - - irq = gsi_irq_sharing(irq); - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); - - entry.vector = assign_irq_vector(irq); + entry.vector = vector & 0xff; apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " "IRQ %d Mode:%i Active:%i)\n", ioapic, @@ -2151,7 +2195,7 @@ spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); - set_native_irq_info(use_pci_vector() ? 
entry.vector : irq, TARGET_CPUS); + set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); return 0; diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/irq.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/irq.c --- ./linux-2.6.18.1/arch/x86_64/kernel/irq.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/irq.c 2007-05-19 23:58:35.000000000 +0900 @@ -79,7 +79,8 @@ for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif - seq_printf(p, " %14s", irq_desc[i].chip->typename); + seq_printf(p, " %8s", irq_desc[i].chip->name); + seq_printf(p, "-%s", handle_irq_name(irq_desc[i].handle_irq)); seq_printf(p, " %s", action->name); for (action=action->next; action; action = action->next) @@ -116,7 +117,18 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) { /* high bit used in ret_from_ code */ - unsigned irq = ~regs->orig_rax; + unsigned vector = ~regs->orig_rax; + unsigned irq; + + exit_idle(); + irq_enter(); + irq = __get_cpu_var(vector_irq)[vector]; + +#ifdef CONFIG_LATENCY_TRACE + if (irq == trace_user_trigger_irq) + user_trace_start(); +#endif + trace_special(regs->rip, irq, 0); if (unlikely(irq >= NR_IRQS)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", @@ -124,12 +136,24 @@ BUG(); } - exit_idle(); - irq_enter(); #ifdef CONFIG_DEBUG_STACKOVERFLOW stack_overflow_check(regs); #endif - __do_IRQ(irq, regs); +#ifdef CONFIG_NO_HZ + if (idle_cpu(smp_processor_id())) { + update_jiffies(); + /* + * Force polling-idle loops to break out into + * the sched-timer setting code, to make sure + * that timer interval changes due to __mod_timer() + * in IRQ context get properly propagated: + */ + if (tsk_is_polling(current)) + set_need_resched(); + } +#endif + + generic_handle_irq(irq, regs); irq_exit(); return 1; diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/mpparse.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/mpparse.c --- ./linux-2.6.18.1/arch/x86_64/kernel/mpparse.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/mpparse.c 2007-05-19 23:58:35.000000000 +0900 @@ -909,20 +909,11 @@ return; } -#define MAX_GSI_NUM 4096 - int mp_register_gsi(u32 gsi, int triggering, int polarity) { int ioapic = -1; int ioapic_pin = 0; int idx, bit = 0; - static int pci_irq = 16; - /* - * Mapping between Global System Interrupts, which - * represent all possible interrupts, to the IRQs - * assigned to actual devices. - */ - static int gsi_to_irq[MAX_GSI_NUM]; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -955,42 +946,11 @@ if ((1< 15), but - * avoid a problem where the 8254 timer (IRQ0) is setup - * via an override (so it's not on pin 0 of the ioapic), - * and at the same time, the pin 0 interrupt is a PCI - * type. The gsi > 15 test could cause these two pins - * to be shared as IRQ0, and they are not shareable. - * So test for this condition, and if necessary, avoid - * the pin collision. - */ - if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0)) - gsi = pci_irq++; - /* - * Don't assign IRQ used by ACPI SCI - */ - if (gsi == acpi_fadt.sci_int) - gsi = pci_irq++; - gsi_to_irq[irq] = gsi; - } else { - printk(KERN_ERR "GSI %u is too high\n", gsi); - return gsi; - } - } - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 
0 : 1); diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/nmi.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/nmi.c --- ./linux-2.6.18.1/arch/x86_64/kernel/nmi.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/nmi.c 2007-05-19 23:58:35.000000000 +0900 @@ -37,7 +37,7 @@ * This is maintained separately from nmi_active because the NMI * watchdog may also be driven from the I/O APIC timer. */ -static DEFINE_SPINLOCK(lapic_nmi_owner_lock); +static DEFINE_RAW_SPINLOCK(lapic_nmi_owner_lock); static unsigned int lapic_nmi_owner; #define LAPIC_NMI_WATCHDOG (1<<0) #define LAPIC_NMI_RESERVED (1<<1) @@ -127,7 +127,9 @@ static __init void nmi_cpu_busy(void *data) { volatile int *endflag = data; +#ifndef CONFIG_PREEMPT_RT local_irq_enable_in_hardirq(); +#endif /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the @@ -526,12 +528,42 @@ touch_softlockup_watchdog(); } +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) +{ + int i; + + if (nmi_watchdog == NMI_NONE) + return; + if (system_state != SYSTEM_RUNNING) { + printk("nmi_show_all_regs(): system state %d, not doing.\n", + system_state); + return; + } + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); +} + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; + int cpu = safe_smp_processor_id(); sum = read_pda(apic_timer_irqs); + if (nmi_show_regs[cpu]) { + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + show_regs(regs); + spin_unlock(&nmi_print_lock); + } if (__get_cpu_var(nmi_touch)) { __get_cpu_var(nmi_touch) = 0; touched = 1; @@ -549,6 +581,11 @@ */ local_inc(&__get_cpu_var(alert_counter)); if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { + int i; + + for (i = 0; i < NR_CPUS; i++) + nmi_show_regs[i] = 1; + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { local_set(&__get_cpu_var(alert_counter), 0); diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/pmtimer.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/pmtimer.c --- ./linux-2.6.18.1/arch/x86_64/kernel/pmtimer.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/pmtimer.c 2007-05-19 23:58:35.000000000 +0900 @@ -24,15 +24,6 @@ #include #include -/* The I/O port the PMTMR resides at. 
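The nmi_show_all_regs() code in the nmi.c hunk above is a flag handshake: the initiator raises one flag per CPU, each CPU's NMI tick clears its own flag after dumping registers, and the initiator spins until every flag reads zero again. A user-space sketch of the same shape, using pthreads and a volatile spin in place of the kernel's barrier() loop (all names illustrative):

#include <pthread.h>
#include <stdio.h>

#define NCPUS 4

static volatile int nmi_show_regs[NCPUS];

/* Stand-in for the per-CPU NMI path: wait for the request flag, "dump
 * registers", then clear the flag to acknowledge. */
static void *nmi_handler(void *arg)
{
	int cpu = (int)(long)arg;

	while (nmi_show_regs[cpu] != 1)
		;	/* wait for a request */
	printf("cpu%d: dumping registers\n", cpu);
	nmi_show_regs[cpu] = 0;
	return NULL;
}

int main(void)
{
	pthread_t t[NCPUS];
	int i;

	for (i = 0; i < NCPUS; i++)
		pthread_create(&t[i], NULL, nmi_handler, (void *)(long)i);
	for (i = 0; i < NCPUS; i++)
		nmi_show_regs[i] = 1;	/* request a dump everywhere */
	for (i = 0; i < NCPUS; i++)
		while (nmi_show_regs[i] == 1)
			;	/* spin until that CPU acknowledged */
	for (i = 0; i < NCPUS; i++)
		pthread_join(t[i], NULL);
	return 0;
}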
- * The location is detected during setup_arch(), - * in arch/i386/kernel/acpi/boot.c */ -u32 pmtmr_ioport __read_mostly; - -/* value of the Power timer at last timer interrupt */ -static u32 offset_delay; -static u32 last_pmtmr_tick; - #define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ static inline u32 cyc2us(u32 cycles) @@ -48,38 +39,6 @@ return (cycles >> 10); } -int pmtimer_mark_offset(void) -{ - static int first_run = 1; - unsigned long tsc; - u32 lost; - - u32 tick = inl(pmtmr_ioport); - u32 delta; - - delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK); - - last_pmtmr_tick = tick; - monotonic_base += delta * NSEC_PER_USEC; - - delta += offset_delay; - - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - rdtscll(tsc); - vxtime.last_tsc = tsc - offset_delay * (u64)cpu_khz / 1000; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } - - return lost - 1; -} - static unsigned pmtimer_wait_tick(void) { u32 a, b; @@ -101,23 +60,6 @@ } while (cyc2us(b - a) < us); } -void pmtimer_resume(void) -{ - last_pmtmr_tick = inl(pmtmr_ioport); -} - -unsigned int do_gettimeoffset_pm(void) -{ - u32 now, offset, delta = 0; - - offset = last_pmtmr_tick; - now = inl(pmtmr_ioport); - delta = (now - offset) & ACPI_PM_MASK; - - return offset_delay + cyc2us(delta); -} - - static int __init nopmtimer_setup(char *s) { pmtmr_ioport = 0; diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/process.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/process.c --- ./linux-2.6.18.1/arch/x86_64/kernel/process.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/process.c 2007-05-19 23:58:35.000000000 +0900 @@ -113,11 +113,15 @@ current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { local_irq_disable(); - if (!need_resched()) - safe_halt(); - else + if (!need_resched() && !need_resched_delayed()) { + if (!hrtimer_stop_sched_tick()) + safe_halt(); + else + local_irq_enable(); + hrtimer_restart_sched_tick(); + } else local_irq_enable(); } current_thread_info()->status |= TS_POLLING; @@ -131,6 +135,14 @@ static void poll_idle (void) { local_irq_enable(); + while (!need_resched() && !need_resched_delayed()) { + hrtimer_stop_sched_tick(); + local_irq_enable(); + while (!need_resched() && !need_resched_delayed() && !rcu_pending(smp_processor_id()) && !local_softirq_pending()) + rep_nop(); + hrtimer_restart_sched_tick(); + local_irq_enable(); + } asm volatile( "2:" @@ -206,7 +218,9 @@ current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + BUG_ON(irqs_disabled()); + + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -218,14 +232,16 @@ idle = default_idle; if (cpu_is_offline(smp_processor_id())) play_dead(); + stop_critical_timing(); enter_idle(); idle(); __exit_idle(); } - - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } @@ -240,13 +256,16 @@ { local_irq_enable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { + if (hrtimer_stop_sched_tick()) + break; __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (need_resched()) + if (need_resched() && 
!need_resched_delayed()) break; __mwait(0, 0); } + hrtimer_restart_sched_tick(); } void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) @@ -346,13 +365,14 @@ struct thread_struct *t = &me->thread; if (me->thread.io_bitmap_ptr) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + struct tss_struct *tss; kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; /* * Careful, clear this in the TSS too: */ + tss = &per_cpu(init_tss, get_cpu()); memset(tss->io_bitmap, 0xff, t->io_bitmap_max); t->io_bitmap_max = 0; put_cpu(); diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/setup64.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/setup64.c --- ./linux-2.6.18.1/arch/x86_64/kernel/setup64.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/setup64.c 2007-05-19 23:58:35.000000000 +0900 @@ -116,7 +116,7 @@ } } -void pda_init(int cpu) +void notrace pda_init(int cpu) { struct x8664_pda *pda = cpu_pda(cpu); @@ -185,7 +185,7 @@ * 'CPU state barrier', nothing should get across. * A lot of state is already set up in PDA init. */ -void __cpuinit cpu_init (void) +void __cpuinit notrace cpu_init (void) { int cpu = stack_smp_processor_id(); struct tss_struct *t = &per_cpu(init_tss, cpu); diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/signal.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/signal.c --- ./linux-2.6.18.1/arch/x86_64/kernel/signal.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/signal.c 2007-05-19 23:58:35.000000000 +0900 @@ -431,6 +431,13 @@ siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/smp.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/smp.c --- ./linux-2.6.18.1/arch/x86_64/kernel/smp.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/smp.c 2007-05-19 23:58:35.000000000 +0900 @@ -57,7 +57,7 @@ struct mm_struct *flush_mm; unsigned long flush_va; #define FLUSH_ALL -1ULL - spinlock_t tlbstate_lock; + raw_spinlock_t tlbstate_lock; }; char pad[SMP_CACHE_BYTES]; } ____cacheline_aligned; @@ -296,10 +296,20 @@ } /* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/smpboot.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/smpboot.c --- ./linux-2.6.18.1/arch/x86_64/kernel/smpboot.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/smpboot.c 2007-05-19 23:58:35.000000000 +0900 @@ -204,7 +204,7 @@ latency and low latency is the primary objective here. 
-AK */ #define no_cpu_relax() barrier() -static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); +static __cpuinitdata __DEFINE_RAW_SPINLOCK(tsc_sync_lock); static volatile __cpuinitdata unsigned long go[SLAVE + 1]; static int notscsync __cpuinitdata; @@ -530,7 +530,7 @@ /* * Setup code on secondary processor (after comming out of the trampoline) */ -void __cpuinit start_secondary(void) +void __cpuinit notrace start_secondary(void) { /* * Dont put anything before smp_callin(), SMP diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/time.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/time.c --- ./linux-2.6.18.1/arch/x86_64/kernel/time.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/time.c 2007-05-19 23:58:35.000000000 +0900 @@ -39,149 +39,29 @@ #include #include #include +#include +#include #ifdef CONFIG_X86_LOCAL_APIC #include #endif +#include -#ifdef CONFIG_CPU_FREQ -static void cpufreq_delayed_get(void); -#endif extern void i8254_timer_resume(void); extern int using_apic_timer; +extern struct clock_event pit_clockevent; -static char *time_init_gtod(void); DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); -DEFINE_SPINLOCK(i8253_lock); - -int nohpet __initdata = 0; -static int notsc __initdata = 0; +DEFINE_RAW_SPINLOCK(i8253_lock); #define USEC_PER_TICK (USEC_PER_SEC / HZ) #define NSEC_PER_TICK (NSEC_PER_SEC / HZ) -#define FSEC_PER_TICK (FSEC_PER_SEC / HZ) -#define NS_SCALE 10 /* 2^10, carefully chosen */ -#define US_SCALE 32 /* 2^32, arbitralrily chosen */ -unsigned int cpu_khz; /* TSC clocks / usec, not used here */ -EXPORT_SYMBOL(cpu_khz); -static unsigned long hpet_period; /* fsecs / HPET clock */ -unsigned long hpet_tick; /* HPET clocks / interrupt */ -int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */ -unsigned long vxtime_hz = PIT_TICK_RATE; int report_lost_ticks; /* command line option */ -unsigned long long monotonic_base; - -struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ - -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES; -struct timespec __xtime __section_xtime; -struct timezone __sys_tz __section_sys_tz; - -/* - * do_gettimeoffset() returns microseconds since last timer interrupt was - * triggered by hardware. A memory read of HPET is slower than a register read - * of TSC, but much more reliable. It's also synchronized to the timer - * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a - * timer interrupt has happened already, but vxtime.trigger wasn't updated yet. - * This is not a problem, because jiffies hasn't updated either. They are bound - * together by xtime_lock. - */ - -static inline unsigned int do_gettimeoffset_tsc(void) -{ - unsigned long t; - unsigned long x; - t = get_cycles_sync(); - if (t < vxtime.last_tsc) - t = vxtime.last_tsc; /* hack */ - x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> US_SCALE; - return x; -} - -static inline unsigned int do_gettimeoffset_hpet(void) -{ - /* cap counter read to one tick to avoid inconsistencies */ - unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; - return (min(counter,hpet_tick) * vxtime.quot) >> US_SCALE; -} - -unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; - -/* - * This version of gettimeofday() has microsecond resolution and better than - * microsecond precision, as we're using at least a 10 MHz (usually 14.31818 - * MHz) HPET timer. 
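The do_gettimeoffset_tsc() path being deleted above used fixed-point scaling: vxtime.tsc_quot holds microseconds-per-cycle scaled by 2^US_SCALE, so one multiply and one right shift turn a TSC delta into microseconds. A sketch with an assumed 2 GHz processor (cpu_khz = 2000000):

#include <stdio.h>
#include <stdint.h>

#define US_SCALE      32
#define USEC_PER_MSEC 1000ULL

int main(void)
{
	unsigned long cpu_khz = 2000000;	/* assume a 2 GHz TSC */
	/* microseconds per cycle, scaled by 2^32 (vxtime.tsc_quot) */
	uint64_t tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;

	/* 2,000,000 cycles at 2 GHz is 1 ms, i.e. ~1000 us: */
	uint64_t delta_cyc = 2000000;
	uint64_t usec = (delta_cyc * tsc_quot) >> US_SCALE;

	printf("tsc_quot=%llu -> %llu us\n",
	       (unsigned long long)tsc_quot, (unsigned long long)usec);
	return 0;
}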
- */ - -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq, t; - unsigned int sec, usec; - - do { - seq = read_seqbegin(&xtime_lock); - - sec = xtime.tv_sec; - usec = xtime.tv_nsec / NSEC_PER_USEC; - - /* i386 does some correction here to keep the clock - monotonous even when ntpd is fixing drift. - But they didn't work for me, there is a non monotonic - clock anyways with ntp. - I dropped all corrections now until a real solution can - be found. Note when you fix it here you need to do the same - in arch/x86_64/kernel/vsyscall.c and export all needed - variables in vmlinux.lds. -AK */ - - t = (jiffies - wall_jiffies) * USEC_PER_TICK + - do_gettimeoffset(); - usec += t; - - } while (read_seqretry(&xtime_lock, seq)); - - tv->tv_sec = sec + usec / USEC_PER_SEC; - tv->tv_usec = usec % USEC_PER_SEC; -} - -EXPORT_SYMBOL(do_gettimeofday); - -/* - * settimeofday() first undoes the correction that gettimeofday would do - * on the time, and then saves it. This is ugly, but has been like this for - * ages already. - */ - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - - nsec -= do_gettimeoffset() * NSEC_PER_USEC + - (jiffies - wall_jiffies) * NSEC_PER_TICK; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} -EXPORT_SYMBOL(do_settimeofday); +volatile unsigned long jiffies = INITIAL_JIFFIES; unsigned long profile_pc(struct pt_regs *regs) { @@ -277,84 +157,9 @@ } -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. - */ -unsigned long long monotonic_clock(void) -{ - unsigned long seq; - u32 last_offset, this_offset, offset; - unsigned long long base; - - if (vxtime.mode == VXTIME_HPET) { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last; - base = monotonic_base; - this_offset = hpet_readl(HPET_COUNTER); - } while (read_seqretry(&xtime_lock, seq)); - offset = (this_offset - last_offset); - offset *= NSEC_PER_TICK / hpet_tick; - } else { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last_tsc; - base = monotonic_base; - } while (read_seqretry(&xtime_lock, seq)); - this_offset = get_cycles_sync(); - /* FIXME: 1000 or 1000000? */ - offset = (this_offset - last_offset)*1000 / cpu_khz; - } - return base + offset; -} -EXPORT_SYMBOL(monotonic_clock); - -static noinline void handle_lost_ticks(int lost, struct pt_regs *regs) -{ - static long lost_count; - static int warned; - if (report_lost_ticks) { - printk(KERN_WARNING "time.c: Lost %d timer tick(s)! 
", lost); - print_symbol("rip %s)\n", regs->rip); - } - - if (lost_count == 1000 && !warned) { - printk(KERN_WARNING "warning: many lost ticks.\n" - KERN_WARNING "Your time source seems to be instable or " - "some driver is hogging interupts\n"); - print_symbol("rip %s\n", regs->rip); - if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) { - printk(KERN_WARNING "Falling back to HPET\n"); - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; - } - /* else should fall back to PIT, but code missing. */ - warned = 1; - } else - lost_count++; - -#ifdef CONFIG_CPU_FREQ - /* In some cases the CPU can change frequency without us noticing - Give cpufreq a change to catch up. */ - if ((lost_count+1) % 25 == 0) - cpufreq_delayed_get(); -#endif -} - void main_timer_handler(struct pt_regs *regs) { static unsigned long rtc_update = 0; - unsigned long tsc; - int delay = 0, offset = 0, lost = 0; - /* * Here we are in the timer irq handler. We have irqs locally disabled (so we * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running @@ -362,92 +167,11 @@ * variables, because both do_timer() and us change them -arca+vojtech */ - write_seqlock(&xtime_lock); - - if (vxtime.hpet_address) - offset = hpet_readl(HPET_COUNTER); - - if (hpet_use_timer) { - /* if we're using the hpet timer functionality, - * we can more accurately know the counter value - * when the timer interrupt occured. - */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - delay = hpet_readl(HPET_COUNTER) - offset; - } else if (!pmtmr_ioport) { - spin_lock(&i8253_lock); - outb_p(0x00, 0x43); - delay = inb_p(0x40); - delay |= inb(0x40) << 8; - spin_unlock(&i8253_lock); - delay = LATCH - 1 - delay; - } - - tsc = get_cycles_sync(); - - if (vxtime.mode == VXTIME_HPET) { - if (offset - vxtime.last > hpet_tick) { - lost = (offset - vxtime.last) / hpet_tick - 1; - } - - monotonic_base += - (offset - vxtime.last) * NSEC_PER_TICK / hpet_tick; - - vxtime.last = offset; -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - lost = pmtimer_mark_offset(); -#endif - } else { - offset = (((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> US_SCALE) - USEC_PER_TICK; - - if (offset < 0) - offset = 0; - - if (offset > USEC_PER_TICK) { - lost = offset / USEC_PER_TICK; - offset %= USEC_PER_TICK; - } - - /* FIXME: 1000 or 1000000? */ - monotonic_base += (tsc - vxtime.last_tsc) * 1000000 / cpu_khz; - - vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; - - if ((((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> US_SCALE) < offset) - vxtime.last_tsc = tsc - - (((long) offset << US_SCALE) / vxtime.tsc_quot) - 1; - } - - if (lost > 0) { - handle_lost_ticks(lost, regs); - jiffies += lost; - } - /* * Do the timer stuff. */ - do_timer(regs); -#ifndef CONFIG_SMP - update_process_times(user_mode(regs)); -#endif - -/* - * In the SMP case we use the local APIC timer interrupt to do the profiling, - * except when we simulate SMP mode on a uniprocessor system, in that case we - * have to call the local interrupt handler. - */ - -#ifndef CONFIG_X86_LOCAL_APIC - profile_tick(CPU_PROFILING, regs); -#else - if (!using_apic_timer) - smp_local_timer_interrupt(regs); -#endif - + pit_clockevent.event_handler(regs); /* * If we have an externally synchronized Linux clock, then update CMOS clock * accordingly every ~11 minutes. 
set_rtc_mmss() will be called in the jiffy @@ -462,13 +186,10 @@ rtc_update = xtime.tv_sec + 660; } - write_sequnlock(&xtime_lock); } static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { - if (apic_runs_main_timer > 1) - return IRQ_HANDLED; main_timer_handler(regs); #ifdef CONFIG_X86_LOCAL_APIC if (using_apic_timer) @@ -477,39 +198,6 @@ return IRQ_HANDLED; } -static unsigned int cyc2ns_scale __read_mostly; - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> NS_SCALE; -} - -unsigned long long sched_clock(void) -{ - unsigned long a = 0; - -#if 0 - /* Don't do a HPET read here. Using TSC always is much faster - and HPET may not be mapped yet when the scheduler first runs. - Disadvantage is a small drift between CPUs in some configurations, - but that should be tolerable. */ - if (__vxtime.mode == VXTIME_HPET) - return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> US_SCALE; -#endif - - /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, - which means it is not completely exact and may not be monotonous between - CPUs. But the errors should be too small to matter for scheduling - purposes. */ - - rdtscll(a); - return cycles_2_ns(a); -} static unsigned long get_cmos_time(void) { @@ -562,142 +250,6 @@ return mktime(year, mon, day, hour, min, sec); } -#ifdef CONFIG_CPU_FREQ - -/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency - changes. - - RED-PEN: On SMP we assume all CPUs run with the same frequency. It's - not that important because current Opteron setups do not support - scaling on SMP anyroads. - - Should fix up last_tsc too. Currently gettimeofday in the - first tick after the change will be slightly wrong. */ - -#include - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(void *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static void cpufreq_delayed_get(void) -{ - static int warned; - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - if (!warned) { - warned = 1; - printk(KERN_DEBUG - "Losing some ticks... 
checking if CPU frequency changed.\n"); - } - schedule_work(&cpufreq_delayed_get_work); - } -} - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -static unsigned long cpu_khz_ref = 0; - -static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - unsigned long *lpj, dummy; - - if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) - return 0; - - lpj = &dummy; - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) -#ifdef CONFIG_SMP - lpj = &cpu_data[freq->cpu].loops_per_jiffy; -#else - lpj = &boot_cpu_data.loops_per_jiffy; -#endif - - if (!ref_freq) { - ref_freq = freq->old; - loops_per_jiffy_ref = *lpj; - cpu_khz_ref = cpu_khz; - } - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - *lpj = - cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); - - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; - } - - set_cyc2ns_scale(cpu_khz_ref); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - -static int __init cpufreq_tsc(void) -{ - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); - if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER)) - cpufreq_init = 1; - return 0; -} - -core_initcall(cpufreq_tsc); - -#endif - -/* - * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing - * it to the HPET timer of known frequency. - */ - -#define TICK_COUNT 100000000 - -static unsigned int __init hpet_calibrate_tsc(void) -{ - int tsc_start, hpet_start; - int tsc_now, hpet_now; - unsigned long flags; - - local_irq_save(flags); - local_irq_disable(); - - hpet_start = hpet_readl(HPET_COUNTER); - rdtscl(tsc_start); - - do { - local_irq_disable(); - hpet_now = hpet_readl(HPET_COUNTER); - tsc_now = get_cycles_sync(); - local_irq_restore(flags); - } while ((tsc_now - tsc_start) < TICK_COUNT && - (hpet_now - hpet_start) < TICK_COUNT); - - return (tsc_now - tsc_start) * 1000000000L - / ((hpet_now - hpet_start) * hpet_period / 1000); -} - /* * pit_calibrate_tsc() uses the speaker output (channel 2) of @@ -728,137 +280,84 @@ return (end - start) / 50; } -#ifdef CONFIG_HPET -static __init int late_hpet_init(void) -{ - struct hpet_data hd; - unsigned int ntimer; - - if (!vxtime.hpet_address) - return 0; - - memset(&hd, 0, sizeof (hd)); - - ntimer = hpet_readl(HPET_ID); - ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; - ntimer++; - - /* - * Register with driver. - * Timer0 and Timer1 is used by platform. 
- */ - hd.hd_phys_address = vxtime.hpet_address; - hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); - hd.hd_nirqs = ntimer; - hd.hd_flags = HPET_DATA_PLATFORM; - hpet_reserve_timer(&hd, 0); -#ifdef CONFIG_HPET_EMULATE_RTC - hpet_reserve_timer(&hd, 1); -#endif - hd.hd_irq[0] = HPET_LEGACY_8254; - hd.hd_irq[1] = HPET_LEGACY_RTC; - if (ntimer > 2) { - struct hpet *hpet; - struct hpet_timer *timer; - int i; - - hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); - timer = &hpet->hpet_timers[2]; - for (i = 2; i < ntimer; timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & - Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; +#define PIT_MODE 0x43 +#define PIT_CH0 0x40 - } +static void __init __pit_init(int val, u8 mode) +{ + unsigned long flags; - hpet_alloc(&hd); - return 0; + spin_lock_irqsave(&i8253_lock, flags); + outb_p(mode, PIT_MODE); + outb_p(val & 0xff, PIT_CH0); /* LSB */ + outb_p(val >> 8, PIT_CH0); /* MSB */ + spin_unlock_irqrestore(&i8253_lock, flags); } -fs_initcall(late_hpet_init); -#endif -static int hpet_timer_stop_set_go(unsigned long tick) +static void init_pit_timer(int mode, struct clock_event *evt) { - unsigned int cfg; - -/* - * Stop the timers and reset the main counter. - */ + unsigned long flags; - cfg = hpet_readl(HPET_CFG); - cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_CFG); - hpet_writel(0, HPET_COUNTER); - hpet_writel(0, HPET_COUNTER + 4); + spin_lock_irqsave(&i8253_lock, flags); -/* - * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, - * and period also hpet_tick. - */ - if (hpet_use_timer) { - hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | - HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ - hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ - cfg |= HPET_CFG_LEGACY; + switch(mode) { + case CLOCK_EVT_PERIODIC: + /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(0x34, PIT_MODE); + udelay(10); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + break; + + case CLOCK_EVT_ONESHOT: + /* One shot setup */ + outb_p(0x38, PIT_MODE); + udelay(10); + break; + case CLOCK_EVT_SHUTDOWN: + outb_p(0x30, PIT_MODE); + outb_p(0, PIT_CH0); /* LSB */ + outb_p(0, PIT_CH0); /* MSB */ + disable_irq(0); + break; } -/* - * Go! - */ - - cfg |= HPET_CFG_ENABLE; - hpet_writel(cfg, HPET_CFG); - - return 0; + spin_unlock_irqrestore(&i8253_lock, flags); } -static int hpet_init(void) +static void pit_next_event(unsigned long delta, struct clock_event *evt) { - unsigned int id; - - if (!vxtime.hpet_address) - return -1; - set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address); - __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); - -/* - * Read the period, compute tick and quotient. 
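The magic constants written to PIT_MODE in init_pit_timer() above are command bytes: channel in bits 7-6, access width in bits 5-4 (3 selects LSB then MSB), counting mode in bits 3-1. A tiny sketch that rebuilds 0x34 (mode 2, periodic), 0x38 (mode 4, one-shot strobe) and 0x30 (mode 0, shutdown/reload):

#include <stdio.h>
#include <stdint.h>

/* Rebuild a PIT command byte from its fields. */
static uint8_t pit_cmd(int channel, int access, int mode)
{
	return (uint8_t)((channel << 6) | (access << 4) | (mode << 1));
}

int main(void)
{
	printf("periodic=0x%02x oneshot=0x%02x shutdown=0x%02x\n",
	       pit_cmd(0, 3, 2),	/* 0x34: mode 2, rate generator */
	       pit_cmd(0, 3, 4),	/* 0x38: mode 4, software strobe */
	       pit_cmd(0, 3, 0));	/* 0x30: mode 0, terminal count */
	return 0;
}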
- */ - - id = hpet_readl(HPET_ID); - - if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) - return -1; - - hpet_period = hpet_readl(HPET_PERIOD); - if (hpet_period < 100000 || hpet_period > 100000000) - return -1; - - hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; - - hpet_use_timer = (id & HPET_ID_LEGSUP); + unsigned long flags; - return hpet_timer_stop_set_go(hpet_tick); + spin_lock_irqsave(&i8253_lock, flags); + outb_p(delta & 0xff , PIT_CH0); /* LSB */ + outb(delta >> 8 , PIT_CH0); /* MSB */ + spin_unlock_irqrestore(&i8253_lock, flags); } -static int hpet_reenable(void) +struct clock_event pit_clockevent = { + .name = "pit", + .capabilities = CLOCK_CAP_TICK | CLOCK_CAP_PROFILE | CLOCK_CAP_UPDATE +#ifndef CONFIG_SMP + | CLOCK_CAP_NEXTEVT +#endif + , + .set_mode = init_pit_timer, + .set_next_event = pit_next_event, + .shift = 32, +}; + +void setup_pit_timer(void) { - return hpet_timer_stop_set_go(hpet_tick); + pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32); + pit_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFF, &pit_clockevent); + pit_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &pit_clockevent); + register_global_clockevent(&pit_clockevent); } -#define PIT_MODE 0x43 -#define PIT_CH0 0x40 -static void __init __pit_init(int val, u8 mode) -{ - unsigned long flags; - spin_lock_irqsave(&i8253_lock, flags); - outb_p(mode, PIT_MODE); - outb_p(val & 0xff, PIT_CH0); /* LSB */ - outb_p(val >> 8, PIT_CH0); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); -} void __init pit_init(void) { @@ -873,9 +372,9 @@ void __init stop_timer_interrupt(void) { char *name; - if (vxtime.hpet_address) { + if (hpet_address) { name = "HPET"; - hpet_timer_stop_set_go(0); + hpet_stop(); } else { name = "PIT"; pit_stop_interrupt(); @@ -890,119 +389,47 @@ } static struct irqaction irq0 = { - timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL + timer_interrupt, IRQF_DISABLED | IRQF_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL }; void __init time_init(void) { char *timename; - char *gtod; if (nohpet) - vxtime.hpet_address = 0; - + hpet_address = 0; xtime.tv_sec = get_cmos_time(); xtime.tv_nsec = 0; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - if (!hpet_init()) - vxtime_hz = (FSEC_PER_SEC + hpet_period / 2) / hpet_period; - else - vxtime.hpet_address = 0; + if (hpet_arch_init()) + hpet_address = 0; + + setup_pit_timer(); if (hpet_use_timer) { /* set tick_nsec to use the proper rate for HPET */ tick_nsec = TICK_NSEC_HPET; cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; -#ifdef CONFIG_X86_PM_TIMER - } else if (pmtmr_ioport && !vxtime.hpet_address) { - vxtime_hz = PM_TIMER_FREQUENCY; - timename = "PM"; - pit_init(); - cpu_khz = pit_calibrate_tsc(); -#endif } else { pit_init(); cpu_khz = pit_calibrate_tsc(); timename = "PIT"; } - vxtime.mode = VXTIME_TSC; - gtod = time_init_gtod(); + if (unsynchronized_tsc()) + mark_tsc_unstable(); - printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", - vxtime_hz / 1000000, vxtime_hz % 1000000, timename, gtod); printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); - vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; - vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; - vxtime.last_tsc = get_cycles_sync(); setup_irq(0, &irq0); set_cyc2ns_scale(cpu_khz); } -/* - * Make an educated guess if the TSC is trustworthy and synchronized - * over all CPUs. 
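setup_pit_timer() above fills in the clockevent scaling fields: mult is cycles-per-nanosecond scaled by 2^32 (what div_sc() computes), and clockevent_delta2ns() inverts it to bound the programmable delay. A worked sketch assuming the PC PIT input clock of 1193182 Hz (the value of CLOCK_TICK_RATE on this platform):

#include <stdio.h>
#include <stdint.h>

#define PIT_TICK_RATE 1193182ULL	/* PC PIT input clock, Hz (assumed) */
#define NSEC_PER_SEC  1000000000ULL

int main(void)
{
	/* mult: cycles per ns, scaled by 2^32 (what div_sc() computes) */
	uint64_t mult = (PIT_TICK_RATE << 32) / NSEC_PER_SEC;

	/* clockevent_delta2ns() inverted: the 0x7FFF max count in ns */
	uint64_t max_ns = ((uint64_t)0x7FFF << 32) / mult;

	printf("mult=%llu, max delta ~ %llu ns (~%llu ms)\n",
	       (unsigned long long)mult, (unsigned long long)max_ns,
	       (unsigned long long)(max_ns / 1000000));
	return 0;
}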
- */ -__cpuinit int unsynchronized_tsc(void) -{ -#ifdef CONFIG_SMP - if (apic_is_clustered_box()) - return 1; -#endif - /* Most intel systems have synchronized TSCs except for - multi node systems */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { -#ifdef CONFIG_ACPI - /* But TSC doesn't tick in C3 so don't use it there */ - if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 100) - return 1; -#endif - return 0; - } - - /* Assume multi socket systems are not synchronized */ - return num_present_cpus() > 1; -} - -/* - * Decide what mode gettimeofday should use. - */ -__init static char *time_init_gtod(void) -{ - char *timetype; - - if (unsynchronized_tsc()) - notsc = 1; - if (vxtime.hpet_address && notsc) { - timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; -#ifdef CONFIG_X86_PM_TIMER - /* Using PM for gettimeofday is quite slow, but we have no other - choice because the TSC is too unreliable on some systems. */ - } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) { - timetype = "PM"; - do_gettimeoffset = do_gettimeoffset_pm; - vxtime.mode = VXTIME_PMTMR; - sysctl_vsyscall = 0; - printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); -#endif - } else { - timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; - vxtime.mode = VXTIME_TSC; - } - return timetype; -} __setup("report_lost_ticks", time_setup); @@ -1033,7 +460,7 @@ unsigned long ctime = get_cmos_time(); unsigned long sleep_length = (ctime - sleep_start) * HZ; - if (vxtime.hpet_address) + if (hpet_address) hpet_reenable(); else i8254_timer_resume(); @@ -1042,21 +469,9 @@ write_seqlock_irqsave(&xtime_lock,flags); xtime.tv_sec = sec; xtime.tv_nsec = 0; - if (vxtime.mode == VXTIME_HPET) { - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - pmtimer_resume(); -#endif - } else - vxtime.last_tsc = get_cycles_sync(); - write_sequnlock_irqrestore(&xtime_lock,flags); jiffies += sleep_length; wall_jiffies += sleep_length; - monotonic_base += sleep_length * (NSEC_PER_SEC/HZ); + write_sequnlock_irqrestore(&xtime_lock,flags); touch_softlockup_watchdog(); return 0; } @@ -1083,243 +498,3 @@ device_initcall(time_init_device); -#ifdef CONFIG_HPET_EMULATE_RTC -/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET - * is enabled, we support RTC interrupt functionality in software. - * RTC has 3 kinds of interrupts: - * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock - * is updated - * 2) Alarm Interrupt - generate an interrupt at a specific time of day - * 3) Periodic Interrupt - generate periodic interrupt, with frequencies - * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) - * (1) and (2) above are implemented using polling at a frequency of - * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt - * overhead. (DEFAULT_RTC_INT_FREQ) - * For (3), we use interrupts at 64Hz or user specified periodic - * frequency, whichever is higher. 
- */ -#include - -#define DEFAULT_RTC_INT_FREQ 64 -#define RTC_NUM_INTS 1 - -static unsigned long UIE_on; -static unsigned long prev_update_sec; - -static unsigned long AIE_on; -static struct rtc_time alarm_time; - -static unsigned long PIE_on; -static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; -static unsigned long PIE_count; - -static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ -static unsigned int hpet_t1_cmp; /* cached comparator register */ - -int is_hpet_enabled(void) -{ - return vxtime.hpet_address != 0; -} - -/* - * Timer 1 for RTC, we do not use periodic interrupt feature, - * even if HPET supports periodic interrupts on Timer 1. - * The reason being, to set up a periodic interrupt in HPET, we need to - * stop the main counter. And if we do that everytime someone diables/enables - * RTC, we will have adverse effect on main kernel timer running on Timer 0. - * So, for the time being, simulate the periodic interrupt in software. - * - * hpet_rtc_timer_init() is called for the first time and during subsequent - * interuppts reinit happens through hpet_rtc_timer_reinit(). - */ -int hpet_rtc_timer_init(void) -{ - unsigned int cfg, cnt; - unsigned long flags; - - if (!is_hpet_enabled()) - return 0; - /* - * Set the counter 1 and enable the interrupts. - */ - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - local_irq_save(flags); - cnt = hpet_readl(HPET_COUNTER); - cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); - hpet_writel(cnt, HPET_T1_CMP); - hpet_t1_cmp = cnt; - local_irq_restore(flags); - - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_PERIODIC; - cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T1_CFG); - - return 1; -} - -static void hpet_rtc_timer_reinit(void) -{ - unsigned int cfg, cnt; - - if (unlikely(!(PIE_on | AIE_on | UIE_on))) { - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_T1_CFG); - return; - } - - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - /* It is more accurate to use the comparator value than current count.*/ - cnt = hpet_t1_cmp; - cnt += hpet_tick*HZ/hpet_rtc_int_freq; - hpet_writel(cnt, HPET_T1_CMP); - hpet_t1_cmp = cnt; -} - -/* - * The functions below are called from rtc driver. - * Return 0 if HPET is not being used. - * Otherwise do the necessary changes and return 1. 
- */ -int hpet_mask_rtc_irq_bit(unsigned long bit_mask) -{ - if (!is_hpet_enabled()) - return 0; - - if (bit_mask & RTC_UIE) - UIE_on = 0; - if (bit_mask & RTC_PIE) - PIE_on = 0; - if (bit_mask & RTC_AIE) - AIE_on = 0; - - return 1; -} - -int hpet_set_rtc_irq_bit(unsigned long bit_mask) -{ - int timer_init_reqd = 0; - - if (!is_hpet_enabled()) - return 0; - - if (!(PIE_on | AIE_on | UIE_on)) - timer_init_reqd = 1; - - if (bit_mask & RTC_UIE) { - UIE_on = 1; - } - if (bit_mask & RTC_PIE) { - PIE_on = 1; - PIE_count = 0; - } - if (bit_mask & RTC_AIE) { - AIE_on = 1; - } - - if (timer_init_reqd) - hpet_rtc_timer_init(); - - return 1; -} - -int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) -{ - if (!is_hpet_enabled()) - return 0; - - alarm_time.tm_hour = hrs; - alarm_time.tm_min = min; - alarm_time.tm_sec = sec; - - return 1; -} - -int hpet_set_periodic_freq(unsigned long freq) -{ - if (!is_hpet_enabled()) - return 0; - - PIE_freq = freq; - PIE_count = 0; - - return 1; -} - -int hpet_rtc_dropped_irq(void) -{ - if (!is_hpet_enabled()) - return 0; - - return 1; -} - -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - struct rtc_time curr_time; - unsigned long rtc_int_flag = 0; - int call_rtc_interrupt = 0; - - hpet_rtc_timer_reinit(); - - if (UIE_on | AIE_on) { - rtc_get_rtc_time(&curr_time); - } - if (UIE_on) { - if (curr_time.tm_sec != prev_update_sec) { - /* Set update int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag = RTC_UF; - prev_update_sec = curr_time.tm_sec; - } - } - if (PIE_on) { - PIE_count++; - if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { - /* Set periodic int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_PF; - PIE_count = 0; - } - } - if (AIE_on) { - if ((curr_time.tm_sec == alarm_time.tm_sec) && - (curr_time.tm_min == alarm_time.tm_min) && - (curr_time.tm_hour == alarm_time.tm_hour)) { - /* Set alarm int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_AF; - } - } - if (call_rtc_interrupt) { - rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); - rtc_interrupt(rtc_int_flag, dev_id, regs); - } - return IRQ_HANDLED; -} -#endif - -static int __init nohpet_setup(char *s) -{ - nohpet = 1; - return 1; -} - -__setup("nohpet", nohpet_setup); - -int __init notsc_setup(char *s) -{ - notsc = 1; - return 1; -} - -__setup("notsc", notsc_setup); diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/traps.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/traps.c --- ./linux-2.6.18.1/arch/x86_64/kernel/traps.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/traps.c 2007-05-19 23:58:35.000000000 +0900 @@ -368,6 +368,7 @@ #undef HANDLE_STACK printk("\n"); + print_traces(tsk); } static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp) @@ -497,7 +498,7 @@ EXPORT_SYMBOL(out_of_line_bug); #endif -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); static int die_owner = -1; static unsigned int die_nest_count; diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/tsc.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/tsc.c --- ./linux-2.6.18.1/arch/x86_64/kernel/tsc.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/tsc.c 2007-05-19 23:58:35.000000000 +0900 @@ -0,0 +1,229 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NS_SCALE 10 /* 2^10, 
carefully chosen */
+#define US_SCALE	32 /* 2^32, arbitrarily chosen */
+
+static int notsc __initdata = 0;
+
+unsigned int cpu_khz;	/* TSC clocks / usec, not used here */
+EXPORT_SYMBOL(cpu_khz);
+
+static unsigned int cyc2ns_scale __read_mostly;
+
+void set_cyc2ns_scale(unsigned long khz)
+{
+	cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz;
+}
+
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+	return (cyc * cyc2ns_scale) >> NS_SCALE;
+}
+
+unsigned long long sched_clock(void)
+{
+	unsigned long a = 0;
+
+	/* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
+	   which means it is not completely exact and may not be monotonic between
+	   CPUs. But the errors should be too small to matter for scheduling
+	   purposes. */
+
+	rdtscll(a);
+	return cycles_2_ns(a);
+}
+
+static int tsc_unstable;
+
+static inline int check_tsc_unstable(void)
+{
+	return tsc_unstable;
+}
+
+void mark_tsc_unstable(void)
+{
+	tsc_unstable = 1;
+}
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
+#ifdef CONFIG_CPU_FREQ
+
+/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
+   changes.
+
+   RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
+   not that important because current Opteron setups do not support
+   scaling on SMP anyway.
+
+   Should fix up last_tsc too. Currently gettimeofday in the
+   first tick after the change will be slightly wrong. */
+
+#include
+
+static unsigned int cpufreq_delayed_issched = 0;
+static unsigned int cpufreq_init = 0;
+static struct work_struct cpufreq_delayed_get_work;
+
+static void handle_cpufreq_delayed_get(void *v)
+{
+	unsigned int cpu;
+	for_each_online_cpu(cpu) {
+		cpufreq_get(cpu);
+	}
+	cpufreq_delayed_issched = 0;
+}
+
+static unsigned int ref_freq = 0;
+static unsigned long loops_per_jiffy_ref = 0;
+
+static unsigned long cpu_khz_ref = 0;
+
+static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+				 void *data)
+{
+	struct cpufreq_freqs *freq = data;
+	unsigned long *lpj, dummy;
+
+	if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
+		return 0;
+
+	lpj = &dummy;
+	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+#ifdef CONFIG_SMP
+		lpj = &cpu_data[freq->cpu].loops_per_jiffy;
+#else
+		lpj = &boot_cpu_data.loops_per_jiffy;
+#endif
+
+	if (!ref_freq) {
+		ref_freq = freq->old;
+		loops_per_jiffy_ref = *lpj;
+		cpu_khz_ref = cpu_khz;
+	}
+	if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
+	    (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
+	    (val == CPUFREQ_RESUMECHANGE)) {
+		*lpj =
+		    cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+
+		cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
+		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+			mark_tsc_unstable();
+	}
+
+	set_cyc2ns_scale(cpu_khz_ref);
+
+	return 0;
+}
+
+static struct notifier_block time_cpufreq_notifier_block = {
+	.notifier_call = time_cpufreq_notifier
+};
+
+static int __init cpufreq_tsc(void)
+{
+	INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL);
+	if (!cpufreq_register_notifier(&time_cpufreq_notifier_block,
+				       CPUFREQ_TRANSITION_NOTIFIER))
+		cpufreq_init = 1;
+	return 0;
+}
+
+core_initcall(cpufreq_tsc);
+
+#endif
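The cyc2ns conversion above is plain fixed-point arithmetic: set_cyc2ns_scale() precomputes (NSEC_PER_MSEC << 10) / khz once, after which sched_clock() turns a raw TSC count into nanoseconds with one multiply and one shift. A minimal user-space sketch of the same math (the 2.2 GHz frequency is only an illustrative value; the truncation in the scale factor costs about 0.1% accuracy here):

	#include <stdio.h>

	#define NS_SCALE	10		/* 2^10, matches the kernel define */
	#define NSEC_PER_MSEC	1000000UL

	int main(void)
	{
		unsigned long khz = 2200000;	/* assume a 2.2 GHz TSC */
		unsigned long scale = (NSEC_PER_MSEC << NS_SCALE) / khz;
		unsigned long long cyc = 2200000000ULL;	/* one second of cycles */

		/* (cyc * scale) >> NS_SCALE ~= cyc * 10^6 / khz = elapsed ns */
		printf("scale=%lu ns=%llu\n", scale,
		       (cyc * scale) >> NS_SCALE);	/* ~10^9 */
		return 0;
	}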
+/*
+ * Make an educated guess if the TSC is trustworthy and synchronized
+ * over all CPUs.
+ */
+__cpuinit int unsynchronized_tsc(void)
+{
+#ifdef CONFIG_SMP
+	if (apic_is_clustered_box())
+		return 1;
+#endif
+	/* Most Intel systems have synchronized TSCs except for
+	   multi node systems */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+#ifdef CONFIG_ACPI
+		/* But TSC doesn't tick in C3 so don't use it there */
+		if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 100)
+			return 1;
+#endif
+		return 0;
+	}
+
+	/* Assume multi socket systems are not synchronized */
+	return num_present_cpus() > 1;
+}
+
+int __init notsc_setup(char *s)
+{
+	notsc = 1;
+	return 1;
+}
+
+__setup("notsc", notsc_setup);
+
+
+/* clock source code: */
+
+static int tsc_update_callback(void);
+
+static cycle_t read_tsc(void)
+{
+	cycle_t ret = (cycle_t)get_cycles_sync();
+	return ret;
+}
+
+static cycle_t __vsyscall_fn vread_tsc(void)
+{
+	cycle_t ret = (cycle_t)get_cycles_sync();
+	return ret;
+}
+
+static struct clocksource clocksource_tsc = {
+	.name			= "tsc",
+	.rating			= 300,
+	.read			= read_tsc,
+	.mask			= (cycle_t)-1,
+	.mult			= 0, /* to be set */
+	.shift			= 22,
+	.update_callback	= tsc_update_callback,
+	.is_continuous		= 1,
+	.vread			= vread_tsc,
+};
+
+static int tsc_update_callback(void)
+{
+	int change = 0;
+
+	/* check to see if we should switch to the safe clocksource: */
+	if (clocksource_tsc.rating != 50 && check_tsc_unstable()) {
+		clocksource_tsc.rating = 50;
+		clocksource_reselect();
+		change = 1;
+	}
+	return change;
+}
+
+static int __init init_tsc_clocksource(void)
+{
+	if (!notsc) {
+		clocksource_tsc.mult = clocksource_khz2mult(cpu_khz,
+						clocksource_tsc.shift);
+		return clocksource_register(&clocksource_tsc);
+	}
+	return 0;
+}
+
+module_init(init_tsc_clocksource);
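init_tsc_clocksource() fills in .mult so that the generic timekeeping code can compute nanoseconds as (cycles * mult) >> shift; clocksource_khz2mult() is therefore roughly (10^6 << shift) / khz. A quick sanity check of that arithmetic (the 2 GHz figure is an assumed example):

	#include <stdio.h>

	int main(void)
	{
		unsigned int shift = 22;		/* matches clocksource_tsc.shift */
		unsigned long long khz = 2000000;	/* assume a 2 GHz TSC */
		/* approximation of clocksource_khz2mult() */
		unsigned long long mult = (1000000ULL << shift) / khz;	/* 2^21 */
		unsigned long long cycles = 1000;

		/* 1000 cycles at 2 GHz are 500 ns */
		printf("mult=%llu ns=%llu\n", mult, (cycles * mult) >> shift);
		return 0;
	}

The rating drop to 50 in tsc_update_callback() is what lets clocksource_reselect() fall back to a stabler source once mark_tsc_unstable() has fired.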
diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/vmlinux.lds.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/vmlinux.lds.S
--- ./linux-2.6.18.1/arch/x86_64/kernel/vmlinux.lds.S	2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/vmlinux.lds.S	2007-05-19 23:58:35.000000000 +0900
@@ -93,27 +93,11 @@
 	__vsyscall_0 = VSYSCALL_VIRT_ADDR;
 
 	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-	.xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) }
-	xtime_lock = VVIRT(.xtime_lock);
-
-	.vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
-	vxtime = VVIRT(.vxtime);
-
-	.wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) }
-	wall_jiffies = VVIRT(.wall_jiffies);
-
-	.sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) }
-	sys_tz = VVIRT(.sys_tz);
-
-	.sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) }
-	sysctl_vsyscall = VVIRT(.sysctl_vsyscall);
-
-	.xtime : AT(VLOAD(.xtime)) { *(.xtime) }
-	xtime = VVIRT(.xtime);
-
+	.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
 	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-	.jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
-	jiffies = VVIRT(.jiffies);
+	.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { *(.vsyscall_gtod_data) }
+	vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
+
 	.vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) }
 	.vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) }
diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/vsyscall.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/vsyscall.c
--- ./linux-2.6.18.1/arch/x86_64/kernel/vsyscall.c	2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/vsyscall.c	2007-05-19 23:58:35.000000000 +0900
@@ -26,65 +26,50 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 
-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
-
-int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
-seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
+#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) notrace
 
-#include
-
-static __always_inline void timeval_normalize(struct timeval * tv)
-{
-	time_t __sec;
+struct vsyscall_gtod_data_t {
+	raw_seqlock_t lock;
+	int sysctl_enabled;
+	struct timeval wall_time_tv;
+	struct timezone sys_tz;
+	cycle_t offset_base;
+	struct clocksource clock;
+};
 
-	__sec = tv->tv_usec / 1000000;
-	if (__sec) {
-		tv->tv_usec %= 1000000;
-		tv->tv_sec += __sec;
-	}
-}
+struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data = {
+	.lock = __RAW_SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
+	.sysctl_enabled = 1,
+};
 
-static __always_inline void do_vgettimeofday(struct timeval * tv)
+void update_vsyscall(struct timespec* wall_time, struct clocksource* clock)
 {
-	long sequence, t;
-	unsigned long sec, usec;
+	unsigned long flags;
 
-	do {
-		sequence = read_seqbegin(&__xtime_lock);
-
-		sec = __xtime.tv_sec;
-		usec = (__xtime.tv_nsec / 1000) +
-			(__jiffies - __wall_jiffies) * (1000000 / HZ);
-
-		if (__vxtime.mode != VXTIME_HPET) {
-			t = get_cycles_sync();
-			if (t < __vxtime.last_tsc)
-				t = __vxtime.last_tsc;
-			usec += ((t - __vxtime.last_tsc) *
-				 __vxtime.tsc_quot) >> 32;
-			/* See comment in x86_64 do_gettimeofday. */
-		} else {
-			usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
-				  __vxtime.last) * __vxtime.quot) >> 32;
-		}
-	} while (read_seqretry(&__xtime_lock, sequence));
+	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+	/* copy vsyscall data */
+	vsyscall_gtod_data.clock = *clock;
+	vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec;
+	vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000;
+	vsyscall_gtod_data.sys_tz = sys_tz;
 
-	tv->tv_sec = sec + usec / 1000000;
-	tv->tv_usec = usec % 1000000;
+	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
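update_vsyscall() above is the writer side of a sequence lock; the do_vgettimeofday() reader further below retries its snapshot until the sequence count is even and unchanged. A minimal user-space sketch of that protocol, with a hand-rolled counter standing in for the kernel's raw_seqlock_t (memory barriers are omitted for brevity; the real primitives issue smp_wmb()/smp_rmb()):

	#include <stdio.h>

	static volatile unsigned seq;			/* even = consistent */
	static volatile unsigned long long wall_sec, cycle_last;

	static void writer_update(unsigned long long sec, unsigned long long cyc)
	{
		seq++;				/* odd: update in progress */
		wall_sec = sec;
		cycle_last = cyc;
		seq++;				/* even: consistent again */
	}

	static void reader_snapshot(unsigned long long *sec, unsigned long long *cyc)
	{
		unsigned s;

		do {
			while ((s = seq) & 1)
				;		/* writer active: wait */
			*sec = wall_sec;
			*cyc = cycle_last;
		} while (seq != s);		/* retry if a writer intervened */
	}

	int main(void)
	{
		unsigned long long sec, cyc;

		writer_update(1180000000ULL, 42ULL);
		reader_snapshot(&sec, &cyc);
		printf("sec=%llu cycle_last=%llu\n", sec, cyc);
		return 0;
	}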
 
 /* RED-PEN may want to readd seq locking, but then the variable should be
  * write-once. */
 static __always_inline void do_get_tz(struct timezone * tz)
 {
-	*tz = __sys_tz;
+	*tz = __vsyscall_gtod_data.sys_tz;
 }
 
 static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
@@ -105,10 +90,44 @@
 	return secs;
 }
 
+static __always_inline void do_vgettimeofday(struct timeval * tv)
+{
+	cycle_t now, base, mask, cycle_delta;
+	unsigned long seq, mult, shift, nsec_delta;
+	cycle_t (*vread)(void);
+	do {
+		seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+
+		vread = __vsyscall_gtod_data.clock.vread;
+		if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
+			gettimeofday(tv,0);
+			return;
+		}
+		now = vread();
+		base = __vsyscall_gtod_data.clock.cycle_last;
+		mask = __vsyscall_gtod_data.clock.mask;
+		mult = __vsyscall_gtod_data.clock.mult;
+		shift = __vsyscall_gtod_data.clock.shift;
+
+		*tv = __vsyscall_gtod_data.wall_time_tv;
+
+	} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+
+	/* calculate interval: */
+	cycle_delta = (now - base) & mask;
+	/* convert to nsecs: */
+	nsec_delta = (cycle_delta * mult) >> shift;
+
+	/* convert to usecs and add to the timeval: */
+	tv->tv_usec += nsec_delta / NSEC_PER_USEC;
+	while (tv->tv_usec >= USEC_PER_SEC) {
+		tv->tv_sec += 1;
+		tv->tv_usec -= USEC_PER_SEC;
+	}
+}
+
 int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
 {
-	if (!__sysctl_vsyscall)
-		return gettimeofday(tv,tz);
 	if (tv)
 		do_vgettimeofday(tv);
 	if (tz)
@@ -120,11 +139,11 @@
  * unlikely */
 time_t __vsyscall(1) vtime(time_t *t)
 {
-	if (!__sysctl_vsyscall)
+	if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
 		return time_syscall(t);
 	else if (t)
-		*t = __xtime.tv_sec;
-	return __xtime.tv_sec;
+		*t = __vsyscall_gtod_data.wall_time_tv.tv_sec;
+	return __vsyscall_gtod_data.wall_time_tv.tv_sec;
 }
 
 long __vsyscall(2) venosys_0(void)
@@ -163,7 +182,7 @@
 		ret = -ENOMEM;
 		goto out;
 	}
-	if (!sysctl_vsyscall) {
+	if (!vsyscall_gtod_data.sysctl_enabled) {
 		*map1 = SYSCALL;
 		*map2 = SYSCALL;
 	} else {
@@ -186,7 +205,7 @@
 static ctl_table kernel_table2[] = {
 	{ .ctl_name = 99, .procname = "vsyscall64",
-	  .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
+	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), .mode = 0644,
 	  .strategy = vsyscall_sysctl_nostrat,
 	  .proc_handler = vsyscall_sysctl_change },
 	{ 0, }
diff -urN ./linux-2.6.18.1/arch/x86_64/kernel/x8664_ksyms.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/x8664_ksyms.c
--- ./linux-2.6.18.1/arch/x86_64/kernel/x8664_ksyms.c	2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/kernel/x8664_ksyms.c	2007-05-19 23:58:35.000000000 +0900
@@ -12,10 +12,12 @@
 
 EXPORT_SYMBOL(kernel_thread);
 
-EXPORT_SYMBOL(__down_failed);
-EXPORT_SYMBOL(__down_failed_interruptible);
-EXPORT_SYMBOL(__down_failed_trylock);
-EXPORT_SYMBOL(__up_wakeup);
+#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
+EXPORT_SYMBOL(__compat_down_failed);
+EXPORT_SYMBOL(__compat_down_failed_interruptible);
+EXPORT_SYMBOL(__compat_down_failed_trylock);
+EXPORT_SYMBOL(__compat_up_wakeup);
+#endif
 
 EXPORT_SYMBOL(__get_user_1);
 EXPORT_SYMBOL(__get_user_2);
diff -urN ./linux-2.6.18.1/arch/x86_64/lib/thunk.S linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/lib/thunk.S
--- ./linux-2.6.18.1/arch/x86_64/lib/thunk.S	2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/lib/thunk.S	2007-05-19 23:58:35.000000000 +0900
@@ -42,11 +42,13 @@
 	thunk rwsem_wake_thunk,rwsem_wake
 	thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
 #endif
-
-	thunk __down_failed,__down
-	thunk_retrax
__down_failed_interruptible,__down_interruptible - thunk_retrax __down_failed_trylock,__down_trylock - thunk __up_wakeup,__up + +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK + thunk __compat_down_failed,__compat_down + thunk_retrax __compat_down_failed_interruptible,__compat_down_interruptible + thunk_retrax __compat_down_failed_trylock,__compat_down_trylock + thunk __compat_up_wakeup,__compat_up +#endif #ifdef CONFIG_TRACE_IRQFLAGS thunk trace_hardirqs_on_thunk,trace_hardirqs_on diff -urN ./linux-2.6.18.1/arch/x86_64/mm/fault.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/mm/fault.c --- ./linux-2.6.18.1/arch/x86_64/mm/fault.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/mm/fault.c 2007-05-19 23:58:35.000000000 +0900 @@ -79,6 +79,7 @@ { int loglevel_save = console_loglevel; if (yes) { + stop_trace(); oops_in_progress = 1; } else { #ifdef CONFIG_VT diff -urN ./linux-2.6.18.1/arch/x86_64/mm/init.c linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/mm/init.c --- ./linux-2.6.18.1/arch/x86_64/mm/init.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/arch/x86_64/mm/init.c 2007-05-19 23:58:35.000000000 +0900 @@ -51,7 +51,7 @@ static unsigned long dma_reserve __initdata; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the diff -urN ./linux-2.6.18.1/block/cfq-iosched.c linux-2.6.18.1-cabi-20070529-RT_HRT/block/cfq-iosched.c --- ./linux-2.6.18.1/block/cfq-iosched.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/block/cfq-iosched.c 2007-05-19 23:58:35.000000000 +0900 @@ -1283,7 +1283,7 @@ q = cfqd->queue; - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); spin_lock(q->queue_lock); diff -urN ./linux-2.6.18.1/block/ll_rw_blk.c linux-2.6.18.1-cabi-20070529-RT_HRT/block/ll_rw_blk.c --- ./linux-2.6.18.1/block/ll_rw_blk.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/block/ll_rw_blk.c 2007-05-19 23:58:35.000000000 +0900 @@ -1547,7 +1547,7 @@ */ void blk_plug_device(request_queue_t *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); /* * don't plug a stopped queue, it must be paired with blk_start_queue() @@ -1570,7 +1570,7 @@ */ int blk_remove_plug(request_queue_t *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) return 0; @@ -3584,13 +3584,15 @@ struct io_context *ioc; struct cfq_io_context *cic; - local_irq_save(flags); + // FIXME: unsafe upstream too? 
+ + local_irq_save_nort(flags); task_lock(current); ioc = current->io_context; current->io_context = NULL; ioc->task = NULL; task_unlock(current); - local_irq_restore(flags); + local_irq_restore_nort(flags); if (ioc->aic && ioc->aic->exit) ioc->aic->exit(ioc->aic); diff -urN ./linux-2.6.18.1/drivers/Makefile linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/Makefile --- ./linux-2.6.18.1/drivers/Makefile 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/Makefile 2007-05-20 14:14:28.000000000 +0900 @@ -76,3 +76,4 @@ obj-$(CONFIG_SUPERH) += sh/ obj-$(CONFIG_GENERIC_TIME) += clocksource/ obj-$(CONFIG_DMA_ENGINE) += dma/ +obj-$(CONFIG_CABI) += cabi/ diff -urN ./linux-2.6.18.1/drivers/acpi/executer/exmutex.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/acpi/executer/exmutex.c --- ./linux-2.6.18.1/drivers/acpi/executer/exmutex.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/acpi/executer/exmutex.c 2007-05-19 23:58:35.000000000 +0900 @@ -267,9 +267,9 @@ && (obj_desc->mutex.os_mutex != ACPI_GLOBAL_LOCK)) { ACPI_ERROR((AE_INFO, "Thread %X cannot release Mutex [%4.4s] acquired by thread %X", - (u32) walk_state->thread->thread_id, + (u32)(long) walk_state->thread->thread_id, acpi_ut_get_node_name(obj_desc->mutex.node), - (u32) obj_desc->mutex.owner_thread->thread_id)); + (u32)(long) obj_desc->mutex.owner_thread->thread_id)); return_ACPI_STATUS(AE_AML_NOT_OWNER); } diff -urN ./linux-2.6.18.1/drivers/acpi/osl.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/acpi/osl.c --- ./linux-2.6.18.1/drivers/acpi/osl.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/acpi/osl.c 2007-05-19 23:58:35.000000000 +0900 @@ -676,13 +676,13 @@ acpi_status acpi_os_create_semaphore(u32 max_units, u32 initial_units, acpi_handle * handle) { - struct semaphore *sem = NULL; + struct compat_semaphore *sem = NULL; - sem = acpi_os_allocate(sizeof(struct semaphore)); + sem = acpi_os_allocate(sizeof(struct compat_semaphore)); if (!sem) return AE_NO_MEMORY; - memset(sem, 0, sizeof(struct semaphore)); + memset(sem, 0, sizeof(struct compat_semaphore)); sema_init(sem, initial_units); @@ -705,7 +705,7 @@ acpi_status acpi_os_delete_semaphore(acpi_handle handle) { - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; if (!sem) @@ -733,7 +733,7 @@ acpi_status acpi_os_wait_semaphore(acpi_handle handle, u32 units, u16 timeout) { acpi_status status = AE_OK; - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; int ret = 0; @@ -820,7 +820,7 @@ */ acpi_status acpi_os_signal_semaphore(acpi_handle handle, u32 units) { - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; if (!sem || (units < 1)) diff -urN ./linux-2.6.18.1/drivers/acpi/processor_idle.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/acpi/processor_idle.c --- ./linux-2.6.18.1/drivers/acpi/processor_idle.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/acpi/processor_idle.c 2007-05-19 23:58:35.000000000 +0900 @@ -38,9 +38,11 @@ #include #include #include /* need_resched() */ +#include #include #include +#include #include #include @@ -368,10 +370,12 @@ /* Get end time (ticks) */ t2 = inl(acpi_fadt.xpm_tmr_blk.address); +#ifndef CONFIG_IA64 #ifdef CONFIG_GENERIC_TIME /* TSC halts in C2, so notify users */ mark_tsc_unstable(); #endif +#endif /* Re-enable 
interrupts */ local_irq_enable(); current_thread_info()->status |= TS_POLLING; @@ -412,10 +416,12 @@ ACPI_MTX_DO_NOT_LOCK); } +#ifndef CONFIG_IA64 #ifdef CONFIG_GENERIC_TIME /* TSC halts in C3, so notify users */ mark_tsc_unstable(); #endif +#endif /* Re-enable interrupts */ local_irq_enable(); current_thread_info()->status |= TS_POLLING; @@ -453,7 +459,8 @@ */ if (cx->promotion.state && ((cx->promotion.state - pr->power.states) <= max_cstate)) { - if (sleep_ticks > cx->promotion.threshold.ticks) { + if (sleep_ticks > cx->promotion.threshold.ticks && + cx->promotion.state->latency <= system_latency_constraint()) { cx->promotion.count++; cx->demotion.count = 0; if (cx->promotion.count >= @@ -494,8 +501,10 @@ end: /* * Demote if current state exceeds max_cstate + * or if the latency of the current state is unacceptable */ - if ((pr->power.state - pr->power.states) > max_cstate) { + if ((pr->power.state - pr->power.states) > max_cstate || + pr->power.state->latency > system_latency_constraint()) { if (cx->demotion.state) next_state = cx->demotion.state; } @@ -1009,9 +1018,10 @@ seq_printf(seq, "active state: C%zd\n" "max_cstate: C%d\n" - "bus master activity: %08x\n", + "bus master activity: %08x\n" + "maximum allowed latency: %d usec\n", pr->power.state ? pr->power.state - pr->power.states : 0, - max_cstate, (unsigned)pr->power.bm_activity); + max_cstate, (unsigned)pr->power.bm_activity, system_latency_constraint()); seq_puts(seq, "states:\n"); @@ -1077,6 +1087,29 @@ .release = single_release, }; + +static void smp_callback(void *v) +{ + /* we already woke the CPU up, nothing more to do */ +} + +/* + * This function gets called when a part of the kernel has a new latency requirement. + * This means we need to get all processors out of their C-state, and then recalculate + * a new suitable C-state. Just do a cross-cpu IPI; that wakes them all right up. + */ +static int acpi_processor_latency_notify(struct notifier_block *b, + unsigned long l, void *v) +{ + smp_call_function(smp_callback, NULL, 0, 1); + return NOTIFY_OK; +} + +static struct notifier_block acpi_processor_latency_notifier = { + .notifier_call = acpi_processor_latency_notify, +}; + + int acpi_processor_power_init(struct acpi_processor *pr, struct acpi_device *device) { @@ -1093,6 +1126,7 @@ "ACPI: processor limited to max C-state %d\n", max_cstate); first_run++; + register_latency_notifier(&acpi_processor_latency_notifier); } if (!pr) @@ -1164,6 +1198,7 @@ * copies of pm_idle before proceeding. 
*/ cpu_idle_wait(); + unregister_latency_notifier(&acpi_processor_latency_notifier); } return 0; diff -urN ./linux-2.6.18.1/drivers/acpi/tables/tbget.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/acpi/tables/tbget.c --- ./linux-2.6.18.1/drivers/acpi/tables/tbget.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/acpi/tables/tbget.c 2007-05-19 23:58:35.000000000 +0900 @@ -325,7 +325,7 @@ if (header->length < sizeof(struct acpi_table_header)) { ACPI_ERROR((AE_INFO, "Table length (%X) is smaller than minimum (%X)", - header->length, sizeof(struct acpi_table_header))); + header->length, (int)sizeof(struct acpi_table_header))); return_ACPI_STATUS(AE_INVALID_TABLE_LENGTH); } diff -urN ./linux-2.6.18.1/drivers/acpi/tables/tbrsdt.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/acpi/tables/tbrsdt.c --- ./linux-2.6.18.1/drivers/acpi/tables/tbrsdt.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/acpi/tables/tbrsdt.c 2007-05-19 23:58:35.000000000 +0900 @@ -189,7 +189,7 @@ ACPI_ERROR((AE_INFO, "RSDT/XSDT length (%X) is smaller than minimum (%X)", table_ptr->length, - sizeof(struct acpi_table_header))); + (int)sizeof(struct acpi_table_header))); return (AE_INVALID_TABLE_LENGTH); } diff -urN ./linux-2.6.18.1/drivers/acpi/utilities/utmutex.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/acpi/utilities/utmutex.c --- ./linux-2.6.18.1/drivers/acpi/utilities/utmutex.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/acpi/utilities/utmutex.c 2007-05-19 23:58:35.000000000 +0900 @@ -259,7 +259,7 @@ } else { ACPI_EXCEPTION((AE_INFO, status, "Thread %X could not acquire Mutex [%X]", - (u32) this_thread_id, mutex_id)); + (u32)(long) this_thread_id, mutex_id)); } return (status); diff -urN ./linux-2.6.18.1/drivers/block/paride/pseudo.h linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/block/paride/pseudo.h --- ./linux-2.6.18.1/drivers/block/paride/pseudo.h 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/block/paride/pseudo.h 2007-05-19 23:58:35.000000000 +0900 @@ -43,7 +43,7 @@ static int ps_tq_active = 0; static int ps_nice = 0; -static DEFINE_SPINLOCK(ps_spinlock __attribute__((unused))); +static __attribute__((unused)) DEFINE_SPINLOCK(ps_spinlock); static DECLARE_WORK(ps_tq, ps_tq_int, NULL); diff -urN ./linux-2.6.18.1/drivers/cabi/Kconfig linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/Kconfig --- ./linux-2.6.18.1/drivers/cabi/Kconfig 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/Kconfig 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,10 @@ +# +# CABI related configuration option +# + +config CABI + bool 'CPU Accounting and Binding Interface support' + default n + ---help--- + CPU Accounting and Binding Interface support. + diff -urN ./linux-2.6.18.1/drivers/cabi/Makefile linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/Makefile --- ./linux-2.6.18.1/drivers/cabi/Makefile 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/Makefile 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,15 @@ +# +# Makefile for the linux kernel. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definitions are now in the main makefile... 
+
+
+obj-$(CONFIG_CABI) = cabi_init.o cabi_account.o cabi_timer.o cabi_sched.o cabi_isr.o cabi_signal.o udivdi3.o cabi_overload.o cabi_syscalls.o cabi_debug.o cabi_cyclic.o cabi_defsrv.o
+#cabi_dsv_replenish.o cabi_cyc_isr.o cabi_dsv_isr.o cabi_ovl_isr.o cabi_ovl_replenish.o
+obj-$(CONFIG_PROC_FS) += cabi_procfs.o
+
+
diff -urN ./linux-2.6.18.1/drivers/cabi/cabi_account.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_account.c
--- ./linux-2.6.18.1/drivers/cabi/cabi_account.c	1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_account.c	2007-06-17 00:22:36.000000000 +0900
@@ -0,0 +1,1175 @@
+/*
+ * linux/drivers/cabi/cabi_account.c
+ *
+ * CABI -- CPU Accounting and Blocking Interfaces.
+ *
+ * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory.
+ * MontaVista Software, Inc.
+ *
+ * This software was developed by Waseda University and MontaVista Software,
+ * Inc. Funding for this project was provided by IPA (Information-technology
+ * Promotion Agency, Japan). This software may be used and distributed
+ * according to the terms of the GNU Public License, incorporated herein by
+ * reference.
+ *
+ * 2006-9-12. Modified by Midori SUGAYA to fix bugs in tick calculation
+ *            and make adjustments for 2.6.
+ * 2006-2.    Porting to kernel-2.6 Takeharu KATO (ARM, SH)
+ * 2005-2.    New release based on LinuxRK. by Midori SUGAYA, Hirotaka
+ *            ISHIKAWA.
+ *
+ * Further details about this project can be obtained at
+ * http://dcl.info.waseda.ac.jp/osrg/
+ *
+ * This is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * This file is derived from software distributed under the following terms:
+ */
+ /*
+ * Real-time and Multimedia Systems Laboratory
+ * Copyright (c) 1999 Carnegie Mellon University
+ * All Rights Reserved.
+ *
+ * Permission to use, copy, modify and distribute this software and its
+ * documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Real-Time and Multimedia Systems Laboratory
+ * Attn: Prof. Raj Rajkumar
+ * Electrical and Computer Engineering, and Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * or via email to raj@ece.cmu.edu
+ *
+ * any improvements or extensions that they make and grant Carnegie Mellon
+ * the rights to redistribute these changes.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+/*
+ * Static variables for accounting
+ */
+
+/*
+ * These values keep the addresses of the accounting object and the
+ * overload accounting object that are currently running.
+ */
+
+cabi_account_t cabi_current_account;
+cabi_account_t cabi_current_overload_account;
+
+/*
+ * Total capacity currently held by the accounting objects.
+ */
+cpu_capacity_t cabi_account_current_capacity;
+
+/*
+ * Other global variables.
+ */
+struct list_head cabi_account_head;
+int overload_cabi;
+int last_cabi_id = 0;
+
+extern long sys_setpgid(pid_t pid, pid_t pgid);
+extern rwlock_t tasklist_lock;
+
+/*
+ * Name: cabi_account_init
+ *
+ * Initialize the variables used by the accounting code.
+ * This function is called from cabi_init, which runs during
+ * kernel boot.
+ */
+void
+cabi_account_init(void)
+{
+	ENTER;
+	INIT_LIST_HEAD(&cabi_account_head);
+	cabi_account_current_capacity = 0;
+	cabi_current_account = NULL_ACCOUNT;
+	cabi_current_overload_account = NULL;
+	EXIT;
+}
+
+/*
+ * Name: search_cabi (cabi_id)
+ *
+ * Search for the accounting object (cabi) specified by the cabi_id
+ * argument and return its address.
+ */
+
+cabi_account_t
+search_cabi(unsigned long cabi_id)
+{
+
+	struct list_head *cabi_list;
+	cabi_account_t cabi = NULL_ACCOUNT;
+
+	cabi_list = &cabi_account_head;
+
+	while (!list_empty(cabi_list)) {
+		cabi = list_entry(cabi_list, struct cabi_account, cpu_link);
+
+		if (cabi->cabi_id == cabi_id)
+			break;
+
+		cabi_list = cabi_list->next;
+		if (cabi_list == &cabi_account_head) {
+			cabi = NULL_ACCOUNT;
+			break;
+		}
+	}
+	return cabi;
+}
+
+/*
+ * Name: get_cabi_id (void)
+ *
+ * Return a unique accounting object id to the caller. In this
+ * function, duplicate ids and reserved ids are carefully kept
+ * out of the id pool. CABI_ID_MAX, defined in the cabi.h header,
+ * is the upper limit of the id.
+ */
+
+int
+get_cabi_id(void)
+{
+	struct list_head *cabi_list;
+	cabi_account_t cabi = NULL_ACCOUNT;
+	int begin_cabi_id, cabi_id_error;
+	cabi_id_error = -1;
+
+	begin_cabi_id = -1;
+	last_cabi_id++;
+
+	cabi_list = &cabi_account_head;
+	cabi_list = cabi_list->next;
+	if (cabi_list == &cabi_account_head) {	/* First AO create. */
+		last_cabi_id = FIRST_CABI_ID;
+		return last_cabi_id;
+	}
+	while (!list_empty(cabi_list)) {	/* Get cabi_id */
+		cabi = list_entry(cabi_list, struct cabi_account, cpu_link);
+		/* cabi_id is full */
+		if (last_cabi_id == begin_cabi_id)
+			return cabi_id_error;
+		/* selected cabi_id is for overload */
+		if (OVERLOAD_CABI_ID == last_cabi_id) {
+			if (begin_cabi_id == -1)
+				begin_cabi_id = last_cabi_id;
+			++last_cabi_id;
+			cabi_list = &cabi_account_head;
+			cabi_list = cabi_list->next;
+			continue;
+		}
+		/* selected cabi_id is already used */
+		if (cabi->cabi_id == last_cabi_id) {
+			if (begin_cabi_id == -1)
+				begin_cabi_id = last_cabi_id;
+			++last_cabi_id;
+			cabi_list = &cabi_account_head;
+			cabi_list = cabi_list->next;
+			continue;
+		}
+		/* selected cabi_id is MAX value */
+
+		if (last_cabi_id >= CABI_ID_MAX) {
+			if (begin_cabi_id == -1)
+				begin_cabi_id = last_cabi_id;
+			last_cabi_id = FIRST_CABI_ID;
+			cabi_list = &cabi_account_head;
+			cabi_list = cabi_list->next;
+			continue;
+		}
+		/* next cabi list */
+		cabi_list = cabi_list->next;
+
+		/* all lists were searched. */
+		if (cabi_list == &cabi_account_head)
+			break;
+	}
+	return last_cabi_id;
+}
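capacity_of() below reduces a reservation to an integer utilization in units of 0.01% (qc/qt scaled by 10000), pre-shifting both operands by 8 bits when the period exceeds one second so the divisor still fits the 64-by-32-bit division helper. A worked example of the arithmetic (the 5 ms / 20 ms reservation is an illustrative choice):

	#include <stdio.h>

	int main(void)
	{
		/* assume a reservation of C = 5 ms every T = 20 ms */
		unsigned long long qc = 5000000ULL;	/* C in ns */
		unsigned long long qt = 20000000ULL;	/* T in ns */
		unsigned long long qc_tmp = qc * 10000;	/* 0.01% units */

		if (qt > 1000000000ULL) {	/* shrink first if the divisor */
			qc_tmp >>= 8;		/* would overflow 32 bits     */
			qt >>= 8;
		}
		/* prints 2500, i.e. 25.00% of the CPU */
		printf("capacity = %llu\n", qc_tmp / qt);
		return 0;
	}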
+
+/*
+ * Capacity calculation of accounting object (cabi)
+ */
+
+cpu_capacity_t
+capacity_of(cpu_capacity_quad_t qc, cpu_capacity_quad_t qt)
+{
+
+	unsigned long long qc_tmp, qt_tmp;	/* unsigned long long */
+	unsigned long result, rem;		/* unsigned long */
+
+	ENTER;
+
+	/* To keep a fine-grained value, we scale by 10000 instead of
+	   100 before dividing. */
+	qc_tmp = (unsigned long long) qc * 10000;
+	qt_tmp = (unsigned long long) qt;
+
+	if (qt_tmp > 1000000000) {
+		qc_tmp = qc_tmp >> 8;
+		qt_tmp = qt_tmp >> 8;
+	}
+
+	result = (unsigned long)
+		div_long_long_rem (qc_tmp, qt_tmp, &rem);
+
+	cabi_debug_capacity(result);
+
+	EXIT;
+	return result;
+}
+
+void
+set_terminate_operation (struct cabi_account *cabi)
+{
+
+	switch (cabi->pm.operation){
+	case OP_NONE:
+		break;
+	case OP_BLOCK:
+		cabi->opt_state = CABI_UNBLOCK;
+		break;
+	case OP_SIGNAL:
+		cabi->opt_state = CABI_SIGNAL;
+		break;
+	default:
+		break;
+	}
+}
+
+int
+capacity_admission_control (cabi_param_data_t *p,
+			    cpu_capacity_t capacity)
+{
+
+	cabi_account_current_capacity =
+		capacity + cabi_account_current_capacity;
+
+	switch (p->policy) {
+	case PO_DEFSRV:
+		break;
+	case PO_EDF:
+		//if (cabi_account_current_capacity > 99)
+		//	return 0;
+		break;
+	case PO_RM:
+		//if (cabi_account_current_capacity > 69)
+		//	return 0;
+		break;
+	default:
+		break;
+	}
+
+	return 1;
+}
+
+
+/*
+ * Name: cabi_account_create
+ *
+ * Create an accounting object. This function is called from
+ * sys_cabi_account_create, the system-call interface.
+ */
+
+cabi_account_t
+cabi_account_create(struct timespec *c, struct timespec *t,
+		    cabi_param_data_t * p)
+{
+
+	cabi_account_t cabi;
+	cpu_capacity_t capacity;
+	cpu_capacity_quad_t qc, qt, overload_qc;
+	long get_id = 0;
+
+
+	ENTER;
+	cabi_debug_timespecs(c,t);
+
+	/*
+	 * if this is an overload cabi, calculate parameters.
+	 */
+	if (p->bind_proc_type == BIND_IDLE_PROC){
+		capacity_overload(c,t,&overload_qc);
+		qc = overload_qc;
+	} else {
+		qc = ((unsigned long long)c->tv_sec * NANOSEC) +
+			(unsigned long long)c->tv_nsec;
+	}
+	qt = ((unsigned long long)t->tv_sec * NANOSEC) +
+		(unsigned long long)t->tv_nsec;
+
+
+	/* capacity is the rate of qc/qt * 100 */
+	cabi_debug_ct (qc, qt);
+	capacity = capacity_of (qc, qt);
+	cabi_debug_capacity (capacity);
+
+	/*
+	 * Add the new capacity to the global capacity.
+	 * This is the point where admission control should be applied
+	 * according to the policy.
+	 */
+	capacity_admission_control(p, capacity);
+
+	cabi_debug_capacity(cabi_account_current_capacity);
+
+	/* create an accounting object */
+	if((cabi = malloc(sizeof (struct cabi_account))) == NULL)
+		return NULL;
+	bzero(cabi, sizeof (struct cabi_account));
+
+	/* memory copy */
+	memcpy(&cabi->pm, p, sizeof(struct cpu_param));
+
+	/*
+	 * Set up the unique id for the cabi. First, check whether the
+	 * cabi is for overload handling or not. If it is, a special
+	 * id should be set. If not, set a sequential number for it.
+	 */
+	if (cabi->pm.bind_proc_type & IDLE_PROCESS) {
+		/* if it is for overload, set up the overload id */
+		setup_overload_id (cabi);
+
+	} else {
+		if((get_id = get_cabi_id()) < 0) {
+			free(cabi);
+			return NULL;
+		} else {
+			cabi->cabi_id = (unsigned int)get_id;
+		}
+	}
+	CABI_DBG;
+	cabi_debug("[create] cabi_id %lu\n", cabi->cabi_id);
+
+	/*
+	 * Initialize the list head for the proc list.
+	 */
+	INIT_LIST_HEAD(&cabi->cpu_proc_list);
+	memcpy (&cabi->cpu_time, c, sizeof(struct timespec));
+	memcpy (&cabi->cpu_period, t, sizeof(struct timespec));
+	cabi->cpu_capacity = capacity;
+
+	/*
+	 * if this is the first call requesting a reservation,
+	 * we should set up the cabi hooks.
+	 */
+	if (list_empty(&cabi_account_head)) {
+		/* enable scheduling hook */
+		cabi_enable();
+	}
+	list_add(&cabi->cpu_link, &cabi_account_head);
+
+	/* calculate cpu ticks per capacity */
+	/* to pass the nanosec value to the usec2tick interface,
+	 * we divide the nanosec value by 1000 into usec.
+	 */
+	nanosec2tick(&qc, &cabi->cpu_time_ticks);
+	nanosec2tick(&qt, &cabi->cpu_period_ticks);
+
+	/* cabi debug ticks */
+
+	/*
+	 * Let the account be replenished from the next jiffy.
+	 * See also cabi_timer.c:cabi_replenish_timer_create().
+	 * cpu_period_used_ticks = 0;
+	 * cpu_period_available_ticks = 0;
+	 */
+	/* init waitqueue */
+	init_waitqueue_head(&cabi->depleted_wait);
+
+	/* create entry as /proc/cabi/ */
+	INIT_LIST_HEAD(&cabi->cpu_proc_list);
+	cabi->cpu_state = CABI_IS_DEPLETED;
+	cabi->signal_block = SIGNAL_OFF;
+
+#ifdef CONFIG_PROC_FS
+	cabi_proc_account_create(cabi);
+#endif
+	set_terminate_operation(cabi);
+	set_account_policy(cabi);
+
+	return cabi;	/* success */
+}
+
+/*
+ * Name: cabi_account_destroy
+ *
+ * Delete an accounting object from kernel memory. This function is
+ * called from sys_cabi_account_destroy, the system-call interface.
+ */
+
+int
+cabi_account_destroy(unsigned long cabi_id)
+{
+	cabi_account_t cabi = NULL_ACCOUNT;
+	/* find the cabi address */
+	if (!(cabi = search_cabi(cabi_id)))
+		return CABI_ENOEXIST;
+
+	/* if the account still has attached processes */
+	if (!list_empty(&cabi->cpu_proc_list))
+		/* a process is still attached. */
+		return CABI_EATTACHED;
+
+	/* destroy timer */
+	cabi_replenish_timer_cancel(cabi);
+
+	/* return capacity */
+	if (cabi->cpu_capacity >= cabi_account_current_capacity) {
+		cabi_account_current_capacity = 0;
+	} else {
+		cabi_account_current_capacity -= cabi->cpu_capacity;
+	}
+
+	/* delete cpu_link */
+	list_del(&cabi->cpu_link);
+	INIT_LIST_HEAD(&cabi->cpu_link);	/* for sure */
+	cabi->pm.operation = OP_NONE;
+	CABI_DBG;
+
+#ifdef CONFIG_PROC_FS
+	/*
+	 * remove entry under /proc/cabi/
+	 * must be before cabi_resource_set_detach_account()
+	 * since resource_set is referred to in cabi_proc_account_destroy()
+	 */
+	cabi_proc_account_destroy(cabi);
+#endif
+
+	/* finally free a generic account object. */
+	free(cabi);
+
+	if (cabi_id == OVERLOAD_CABI_ID)
+		cabi_current_overload_account = NULL;
+
+	/* if this is the last cabi, disable_isr */
+	if (!cabi_account_current_capacity)
+		cabi_disable();
+
+	return CABI_SUCCESS;	/* success */
+}
+
+
+/*
+ * Name: cabi_account_set
+ *
+ * Set the parameters of an accounting object. This function is called
+ * from sys_cabi_account_set, the system-call interface.
+ */
+
+int
+cabi_account_set(unsigned long cabi_id, struct timespec *c,
+		 struct timespec *t, cabi_param_data_t * p)
+{
+	cpu_capacity_t capacity;
+	cpu_capacity_quad_t qc, qt, overload_qc;
+	cabi_account_t cabi = NULL_ACCOUNT;
+
+	ENTER;
+
+	/* find the cabi address */
+	if (!(cabi = search_cabi(cabi_id))) {
+		/* cabi does not exist. */
+		return CABI_ENOEXIST;
+	}
+
+	/* check timespec */
+	cabi_debug_timespecs(&cabi->cpu_time, &cabi->cpu_period);
+	cabi_debug_timespecs(c,t);
+
+	/* cabi is depleted */
+	cabi->cpu_state = CABI_IS_DEPLETED;
+
+	/* set terminate action */
+	memcpy(&cabi->pm, p, sizeof(cabi_param_data_t));
+
+	if(cabi->pm.operation == OP_BLOCK) {
+		cabi->opt_state = CABI_UNBLOCK;
+	} else {
+		cabi->opt_state = CABI_SIGNAL;
+		if (!CABI_SIGNUM(cabi)) {
+			cabi->pm.operation = OP_NONE;
+		}
+	}
+
+	/* if this is an overload cabi, re-calculate parameters. */
+	if (cabi->pm.bind_proc_type == BIND_IDLE_PROC){
+
+		/* debug timespec */
+		cabi_debug_timespecs (c,t);
+
+		/* the new parameter has been set in overload_qc */
+		capacity_overload(c,t,&overload_qc);
+		qc = overload_qc;
+	} else {
+		qc = (c->tv_sec * NANOSEC) + c->tv_nsec;
+	}
+	qt = (t->tv_sec * NANOSEC) + t->tv_nsec;
+
+	/* calculate a new capacity */
+	capacity = capacity_of(qc, qt);
+
+	/* set current capacity */
+	cabi_account_current_capacity =
+		capacity + cabi_account_current_capacity - cabi->cpu_capacity;
+
+	/* check the result */
+	cabi_debug_capacity (capacity);
+	cabi_debug_capacity (cabi_account_current_capacity);
+
+	/* load the new parameters */
+	memcpy(&cabi->cpu_time, c, sizeof(struct timespec));
+	memcpy(&cabi->cpu_period, t, sizeof(struct timespec));
+	cabi->cpu_capacity = capacity;
+
+	/* initialize available ticks */
+	cabi->cpu_period_available_ticks = 0;
+
+	/* calculate cabi ticks per capacity */
+	nanosec2tick(&qc, &cabi->cpu_time_ticks);
+	nanosec2tick(&qt, &cabi->cpu_period_ticks);
+
+	/* cancel replenish timer */
+	cabi_replenish_timer_cancel(cabi);
+
+	/* reset a timer for it */
+	cabi_replenish_timer_init(cabi, &cabi->cpu_period_ticks);
+
+	return CABI_SUCCESS;	/* success */
+
+}
+
+/*
+ * Name: cabi_account_attach
+ *
+ * Attach a process to the specified accounting object.
+ */
+
+int
+cabi_account_attach(unsigned long cabi_id, struct task_struct *tsk)
+{
+	struct rs_proc_list *rs_proc;
+	cabi_account_t cabi = NULL_ACCOUNT;
+	unsigned long flags;
+
+	ENTER;
+
+	/* find the cabi address */
+	if (!(cabi = search_cabi(cabi_id))) {
+		CABI_DBG;
+		/* it does not exist. */
+		return CABI_ENOEXIST;
+	}
+	CABI_DBG;
+	/*
+	 * !!! Need to make sure cabi is really an account.
+	 */
+	LVAL_TASK_ACCOUNT(tsk) = cabi;
+
+	/* show info */
+	cabi_debug_info(cabi);
+	CABI_DBG;
+
+	if((rs_proc = malloc(sizeof (struct rs_proc_list))) == NULL)
+		return CABI_ENOMEM;
+	CABI_DBG;
+	bzero(rs_proc, sizeof (struct rs_proc_list));
+
+	/* If there is no process bound to this cabi,
+	 * we should set the replenish timer for it.
+	 */
+	CABI_DBG;
+
+	if (list_empty(&cabi->cpu_proc_list)) {
+		cabi_replenish_timer_init(cabi,&cabi->cpu_period_ticks);
+	}
+	CABI_DBG;
+
+	/* set up the proc filesystem entry for the newly attached process */
+	rs_proc->rs_proc_task = tsk;
+	rs_proc->rs_proc_pid = tsk->pid;
+
+	CABI_DBG;
+
+	local_save_flags(flags);
+	{
+		local_irq_disable();
+		list_add(&rs_proc->rs_proc_list, &cabi->cpu_proc_list);
+	}
+	local_irq_restore(flags);
+
+	CABI_DBG;
+
+	/* attach! */
+	/* NOT YET fixed:
+	 * if we take the rdtsc time from here, there is a lag
+	 * between the current cpu_period_start_time and the time
+	 * at which initial_rep_timer will be set, one jiffy later.
+	 */
+	CABI_DBG;
+	if (cabi->cpu_period_start_ticks == 0 &&
+	    rs_proc->rs_proc_task == current) {
+
+		/*
+		 * XXX consider offset time for RT-processes!
+ */ + cabi_rdticks(&cabi->cpu_period_start_ticks); + } + CABI_DBG; + + /* if this is the current process, + * make it the current account and start accouting now */ + if (current == tsk) { + + /* XXX atomic ? */ + if (cabi_current_account != cabi) { + + /* XXX cabi が RUNNING の間は NULL であることを + 保証できているか?できていれば assertion で + チェックできる */ + cabi_current_account = cabi; + + /* check account info */ + cabi_debug_info (cabi_current_account); + } + CABI_DBG; + + cabi_start_account(cabi); + + if (cabi->cpu_state & CABI_IS_DEPLETED) { + + /* start accounting */ + cabi_account_enforce(cabi); + } + CABI_DBG; + } + if (cabi->cabi_id == OVERLOAD_CABI_ID) + overload_cabi++; + + CABI_DBG; + + return CABI_SUCCESS; +} + +/* + * Name: cabi_account_detach + * + * Detach a process from a specified accounting object. + */ + +int +cabi_account_detach(struct task_struct *tsk) +{ + cabi_account_t cabi = TASK_ACCOUNT(tsk); + struct rs_proc_list *rs_proc = NULL; + struct list_head *proc_list; + + + /* show cabi info */ + cabi_debug_info (cabi); + + /* find rs_proc */ + proc_list = cabi->cpu_proc_list.next; + while (proc_list != &cabi->cpu_proc_list) { + rs_proc = + list_entry(proc_list, struct rs_proc_list, rs_proc_list); + if (rs_proc->rs_proc_pid == tsk->pid) { + break; + } + /* next element */ + proc_list = proc_list->next; + } + + if (rs_proc && rs_proc->rs_proc_pid == tsk->pid) { + + /* show rs_proc info */ + cabi_debug_detach (rs_proc); + + /* remove rs_proc from the list */ + list_del(&rs_proc->rs_proc_list); + + /* free rs_proc */ + INIT_LIST_HEAD(&rs_proc->rs_proc_list); + free(rs_proc); + } + /* detach resource set from task */ + LVAL_TASK_ACCOUNT(tsk) = NULL_ACCOUNT; + + if (tsk->state != TASK_RUNNING) { + wake_up(&cabi->depleted_wait); + } + + return CABI_SUCCESS; +} + +void +set_account_policy (cabi_account_t cabi) +{ + + ENTER; + switch (cabi->pm.policy) { + case PO_NONE: + break; + case PO_CYCLIC: + CABI_DBG; + cabi->ops = &cyclic_ops; + break; + case PO_OVLD: + cabi->ops = &ovl_ops; + break; + case PO_RM: + break; + case PO_DEFSRV: + CABI_DBG; + cabi->ops = &dsv_ops; + break; + case PO_VPE: + break; + case PO_EDF: + break; + default: + CABI_DBG; + cabi->ops = &cyclic_ops; + break; + } + EXIT; +} + +/* + * Name: cabi_account_replenish + * + * This function is called when the periodic timer is expired. + * At this point, the accounting system update the status and + * values of the accounting object and initialize them for + * preparing the next period. + * + * T T + * |<---------->|<---------->| + * --|------------|------------|--->t + * t1 t2 t3.. + * + */ + +void +cabi_account_replenish(cabi_account_t cabi) +{ + cabi->ops->replenish (cabi); +} + + +/* + * Name: cabi_account_enforce + * + * This function will change the status of the accounting object + * to CABI_IS_DEPLETED which cpu_state has not been depleted yet. + */ + + +void +cabi_account_enforce(cabi_account_t cabi) +{ + + ENTER; + + if (cabi->cabi_id != OVERLOAD_CABI_ID && + !(cabi->cpu_state & CABI_IS_DEPLETED)) { + + /* check state */ + cabi_debug_state(cabi); + + /* + * Here, the capacity of accounting system has + * been used up already. So, we have to change + * the status to IS_DEPLETED. + */ + + cabi->cpu_state |= CABI_IS_DEPLETED; + cabi_debug_state (cabi); + + /* set current account */ + cabi_current_account = cabi; + } + EXIT; +} + + +/* + * Name: cabi_account_check_enforce + * + * Check the cpu_state of the accounting object. If the + * available ticks of TSC is less than the used ticks, + * the status should be changed. 
+
+
+/*
+ * Name: cabi_account_check_enforce
+ *
+ * Check the cpu_state of the accounting object. If the number of
+ * available TSC ticks is less than the number of used ticks, the
+ * status should be changed; call cabi_account_enforce.
+ */
+inline void
+cabi_account_check_enforce(cabi_account_t cabi)
+{
+	ENTER;
+
+	if ((cabi->cabi_id != OVERLOAD_CABI_ID) &&
+	    cabi->cpu_period_used_ticks >= cabi->cpu_period_available_ticks) {
+
+		/* debug ticks */
+		cabi_debug_ticks(cabi);
+
+		/* enforcing an account */
+		cabi_account_enforce(cabi);
+	}
+	EXIT;
+}
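The start/stop pair documented next samples the TSC when a task is switched in and again when it is switched out, and charges the difference to the account. A user-space sketch of the same pattern (x86-64 with GCC-style inline assembly assumed; cabi_rdticks() is taken to be a thin wrapper around rdtsc):

	#include <stdio.h>

	/* read the 64-bit time stamp counter */
	static inline unsigned long long rdticks(void)
	{
		unsigned int lo, hi;

		__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
		return ((unsigned long long)hi << 32) | lo;
	}

	int main(void)
	{
		unsigned long long start, used = 0;
		volatile unsigned long spin;

		start = rdticks();		/* cabi_start_account() */
		for (spin = 0; spin < 1000000; spin++)
			;			/* ...the task runs... */
		used += rdticks() - start;	/* cabi_stop_account() */
		printf("used %llu ticks\n", used);
		return 0;
	}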
+ */ + cabi_rdticks(&cabi->cpu_period_start_ticks); + + switch (cabi->cpu_state) { + + case CABI_IS_NULL: + cabi_debug("[start:NULL] %x\n", (int)cabi->cpu_state); + cabi->cpu_state |= CABI_IS_RUNNING; + cabi_enforce_timer_start(cabi, &next); + cabi_debug("[start:NULL] %x\n", (int)cabi->cpu_state); + break; + case CABI_IS_DEPLETED: + cabi_debug("[start:DEPLETED] %x\n", (int)cabi->cpu_state); + cabi->cpu_state |= CABI_IS_RUNNING; + cabi_debug("[start:DEPLETED] %x\n", (int)cabi->cpu_state); + break; + case CABI_IS_RUNNING: /* must not be so */ + cabi_debug("[start:RUNNING] %x\n", (int)cabi->cpu_state); + cabi_debug_state (cabi); + cabi_enforce_timer_start(cabi, &next); + cabi_debug("[start:RUNNING] %x\n", (int)cabi->cpu_state); + break; + case CABI_IS_RUNNING | CABI_IS_DEPLETED: + cabi_debug("[start:RUN|DEP] %x\n", (int)cabi->cpu_state); + cabi_debug_state (cabi); + break; + default: + /* unkown */ + cabi_debug("[start:default] %x\n", (int)cabi->cpu_state); + cabi_debug_state (cabi); + cabi_debug_tick (next); + cabi->cpu_state = CABI_IS_RUNNING; + + if (next > 0) { + cabi_debug_tick(next); + cabi_enforce_timer_start(cabi, &next); + } else { + cabi->cpu_state |= CABI_IS_DEPLETED; + } + cabi_debug("[start:default] %x\n", (int)cabi->cpu_state); +start_account_out: + break; + + } +} + + +void +cabi_stop_account(cabi_account_t cabi, cpu_tick_t now) +{ + + long long left; + + cabi_enforce_timer_cancel(); + + cabi_account_summation_of_used_ticks(cabi, now); + + /* check state */ + cabi_debug_state (cabi); + + switch (cabi->cpu_state) { + + case CABI_IS_RUNNING: + cabi_debug ("[stop:b:RUN] %x\n", (int)cabi->cpu_state); + cabi->cpu_state &= ~CABI_IS_RUNNING; + cabi_account_check_enforce(cabi); + cabi_debug ("[stop:a:RUN] %x\n", (int)cabi->cpu_state); + break; + case CABI_IS_RUNNING | CABI_IS_DEPLETED: + cabi_debug ("[stop:b:RUN|DEP] %x\n", (int)cabi->cpu_state); + cabi->cpu_state &= ~CABI_IS_RUNNING; + cabi_debug ("[stop:a:RUN|DEP] %x\n", (int)cabi->cpu_state); + break; + case CABI_IS_NULL: /* must not be so */ + cabi_debug ("[stop:b:NULL] %x\n", (int)cabi->cpu_state); + cabi_debug_state(cabi); + cabi_account_check_enforce(cabi); + cabi_debug ("[stop:a:NULL] %x\n", (int)cabi->cpu_state); + break; + case CABI_IS_DEPLETED: /* must not be so */ + cabi_debug ("[stop:b:DEP] %x\n", (int)cabi->cpu_state); + cabi_debug_state(cabi); + cabi_debug ("[stop:a:DEP] %x\n", (int)cabi->cpu_state); + break; + default: /* unkown */ + left = + cabi->cpu_period_available_ticks - + cabi->cpu_period_used_ticks; + + /* check */ + cabi_debug_state(cabi); + cabi_debug_ticks(cabi); + + /* set new state */ + cabi->cpu_state = CABI_IS_NULL; + + + if (left <= 0) { + cabi->cpu_state |= CABI_IS_DEPLETED; + + /* check state */ + cabi_debug_state (cabi); + } + break; + } +} + + +#ifdef CONFIG_PROC_FS +/* this is test global variables. 
+
+
+#ifdef CONFIG_PROC_FS
+/* these are test global variables. */
+int count = 0;
+int gc = 0;
+
+int
+cabi_account_read_proc(cabi_account_t cabi, char *buf)
+{
+	char *p = buf;
+	cpu_tick_data_t used_cpu_time;
+	cpu_capacity_t ave, cur, prv;	/* unsigned long */
+	unsigned long rem;
+	struct rs_proc_list *rs_proc = NULL;
+	struct list_head *proc_list;
+
+	ENTER;
+
+	/* check before load */
+	cabi_debug_ticks (cabi);
+
+	/* set tick */
+	used_cpu_time = TICK2USEC(&cabi->cpu_period_used_ticks);
+
+	/* set capacity */
+	cur = capacity_of (cabi->cpu_period_used_ticks,
+			   cabi->cpu_period_ticks);
+
+	/* check prev */
+	cabi_debug_tick (cabi->cpu_period_prev_used_ticks);
+
+	/* set capacity */
+	prv = capacity_of (cabi->cpu_period_prev_used_ticks,
+			   cabi->cpu_period_ticks);
+
+	/* set total utils */
+	if (cabi->cpu_average.total_count) {
+		/*
+		 * ave         : %ul
+		 * total_utils : %ull
+		 * total_count : %ul
+		 */
+		ave = (unsigned long) div_long_long_rem (
+			(unsigned long long) cabi->cpu_average.total_utils,
+			(unsigned long) cabi->cpu_average.total_count,&rem);
+
+		if(cabi->cpu_average.total_utils > 1000000000) {
+			/* this is not a good solution, FIXME */
+			cabi->cpu_average.history[gc++] = ave;
+			cabi->cpu_average.total_utils = 0;
+			cabi->cpu_average.total_count = 0;
+		}
+
+	} else {
+		ave = cur;
+	}
+
+	/* 1st line: utilization statistics */
+	/* reserved, previous, average, max, min */
+	p += sprintf(p, "%lu.%04lu ",
+		     CAPACITY_INT(cabi->cpu_capacity),
+		     CAPACITY_FRAC(cabi->cpu_capacity));
+
+	p += sprintf(p, "%lu.%04lu %lu.%04lu ",
+		     CAPACITY_INT(prv), CAPACITY_FRAC(prv),
+		     CAPACITY_INT(ave), CAPACITY_FRAC(ave));
+
+	p += sprintf(p, "%lu.%04lu ",
+		     CAPACITY_INT(cabi->cpu_max_utilization),
+		     CAPACITY_FRAC(cabi->cpu_max_utilization));
+
+	p += sprintf(p, "%lu.%04lu \n",
+		     CAPACITY_INT(cabi->cpu_min_utilization),
+		     CAPACITY_FRAC(cabi->cpu_min_utilization));
+
+	/* 2nd line: current CPU usage
+	 * used available requested */
+
+	p += sprintf(p, "%llu %lu %lu\n",
+		     used_cpu_time,
+		     TICK2USEC(&cabi->cpu_period_available_ticks),
+		     TICK2USEC(&cabi->cpu_time_ticks));
+	/* debug */
+	cabi_debug_ticks (cabi);
+
+	/* 3rd line: count of replenishment */
+	p += sprintf(p, "%lu\n", cabi->cpu_average.total_count);
+
+	/* 4th line: account object id */
+	p += sprintf(p, "%lu\n", cabi->cabi_id);
+
+	/* 5th line: process ids */
+	/* find rs_proc */
+	proc_list = cabi->cpu_proc_list.next;
+	while (proc_list != &cabi->cpu_proc_list) {
+		/* get a rs_proc */
+		rs_proc = list_entry(proc_list, struct rs_proc_list,
+				     rs_proc_list);
+		p += sprintf(p, "%d ", rs_proc->rs_proc_pid);
+		/* next element */
+		proc_list = proc_list->next;
+	}
+	p += sprintf(p, "\n");
+
+	return (p - buf);
+}
+
+int
+cabi_account_read_bindpid_proc(cabi_account_t cabi, char *buf)
+{
+	char *p = buf;
+	struct rs_proc_list *rs_proc = NULL;
+	struct list_head *proc_list;
+
+	/* find rs_proc */
+	proc_list = cabi->cpu_proc_list.next;
+	while (proc_list != &cabi->cpu_proc_list) {
+		/* get a rs_proc */
+		rs_proc = list_entry(proc_list, struct rs_proc_list,
+				     rs_proc_list);
+		p += sprintf(p, "%d\n", rs_proc->rs_proc_pid);
+		/* next element */
+		proc_list = proc_list->next;
+	}
+	return (p - buf);
+}
+
+int
+cabi_account_status_proc(char *buf)
+{
+	char *p = buf;
+
+	/* cabi ticks per second, CPU capacity taken for accounts */
+	p += sprintf(p, "%lu\n",
+		     (unsigned long) cabi_cpu_ticks_per_second);
+
+	return (p - buf);
+}
+
+int
+cabi_account_base_status_proc(char *buf)
+{
+	return 0;
+}
+#endif /* CONFIG_PROC_FS */
diff -urN ./linux-2.6.18.1/drivers/cabi/cabi_cyclic.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_cyclic.c --- ./linux-2.6.18.1/drivers/cabi/cabi_cyclic.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_cyclic.c 2007-06-17 00:23:21.000000000 +0900 @@ -0,0 +1,345 @@ +/* + * CABI -- CPU Accounting and Blocking Interfaces. + * + * Copyright (C) OS Research Group, Waseda University + * Nakajima Laboratory. MontaVista Software, Inc. + * + * Midori Sugaya + * + */ + +#include +#include +#include + +#include + +/* + * -------------------------------------------------------------- + * Policy : Default cyclic executive + * --------------------------------------------------------------- + * Status : RUNNING running when the replenish timer expires. + * : DEPLETED depleted when the replenish timer expires, + * : NULL + * --------------------------------------------------------------- + * Operation : BLOCK change status of the operation + * : SIGNAL change status of the operation + * --------------------------------------------------------------- + * Priority : Not controlled, use the user-defined priority + * --------------------------------------------------------------- + */ + +int is_running = 0; +void cyc_replenish (cabi_account_t); +void cyc_replenish_capacity (cabi_account_t); +void cyc_account_status_in (cabi_account_t); +void cyc_account_status_out (cabi_account_t); +void cyc_statistic_calculation (cabi_account_t); +void cyc_replenish_operation (cabi_account_t); +void cyc_isr (cabi_account_t); +void cyc_isr_operation (cabi_account_t); + +struct cabi_account_operations cyclic_ops = { + .replenish = cyc_replenish, + .isr = cyc_isr, + .isr_operation = cyc_isr_operation, +}; + +void +cyc_replenish (cabi_account_t cabi) +{ + + ENTER; + cyc_account_status_in(cabi); + cyc_statistic_calculation (cabi); + cyc_replenish_capacity(cabi); + cyc_account_status_out (cabi); + EXIT; + +} + +void +cyc_replenish_capacity (cabi_account_t cabi) +{ + /* debug */ + cabi_debug_ticks(cabi); + cabi_debug_info(cabi); + + /* reset the used ticks */ + cabi->cpu_period_used_ticks = 0; +} + +void +cyc_stop_account (cabi_account_t cabi) +{ + cpu_tick_data_t now; + + ENTER; + cabi_rdticks(&now); + cabi_stop_account(cabi, &now); + is_running++; + EXIT; +} +void +cyc_account_status_in (cabi_account_t cabi) +{ + + ENTER; + switch (cabi->cpu_state) { + case CABI_IS_NULL: + /* 00 : no bound process */ + CABI_DBG; + cabi_debug_state(cabi); + break; + case CABI_IS_RUNNING: + /* 01 : passed the period */ + CABI_DBG; + cabi_debug_state(cabi); + cyc_stop_account(cabi); + break; + case CABI_IS_DEPLETED: + /* 10 : correctly accounted */ + CABI_DBG; + cabi_debug_state(cabi); + break; + case CABI_IS_RUNNING|CABI_IS_DEPLETED: + /* 11 : first time to account */ + CABI_DBG; + cabi_debug_state(cabi); + cyc_stop_account(cabi); + break; + default: + /* it should not happen */ + CABI_DBG; + cabi_debug_state(cabi); + break; + } + + /* it should be changed to the initial status, CABI_IS_NULL.
*/ + cabi->cpu_state &= ~CABI_IS_DEPLETED; + + EXIT; +} + +void +cyc_replenish_operation (cabi_account_t cabi) +{ + + switch (cabi->pm.operation) { + case OP_BLOCK: + cabi->opt_state = CABI_UNBLOCK; + wake_up(&cabi->depleted_wait); + break; + case OP_SIGNAL: + break; + default: + break; + } +} + +void +cpu_time_replenish_init (cabi_account_t cabi) +{ + + cabi->cpu_period_available_ticks = cabi->cpu_time_ticks; +} + +int bonus; + +void +cyc_account_status_out (cabi_account_t cabi) +{ + + unsigned long long avail = cabi->cpu_period_available_ticks; + unsigned long long used = cabi->cpu_period_prev_used_ticks; + unsigned long long cpu_time = cabi->cpu_time_ticks; + + ENTER; + + if (avail > 0) { + + switch (cabi->pm.rep_policy) { + case REP_SOFT: + CABI_DBG; + if (avail != cabi->cpu_time_ticks) { + cabi_debug_ticks (cabi); + cpu_time_replenish_init(cabi); + } + break; + case REP_HARD: + CABI_DBG; + if (bonus) { + bonus = 0; /* reset the toggle */ + cpu_time_replenish_init(cabi); + break; + } else { + bonus++; + if (cpu_time > used) { + CABI_DBG; + /* used less than the capacity */ + cpu_time_replenish_init(cabi); + cabi->cpu_period_available_ticks + += (cpu_time - used); + cabi_debug_avail_ticks (cabi); + } else if (cpu_time < used) { + CABI_DBG; + /* used more than the capacity */ + cpu_time_replenish_init(cabi); + cabi->cpu_period_available_ticks + -= (used - cpu_time); + cabi_debug_avail_ticks (cabi); + } else { + CABI_DBG; + /* avail and used are equal */ + cabi_debug_avail_ticks (cabi); + } + } + break; + default: + /* if nothing is set, this path is taken. */ + break; + } + cyc_replenish_operation(cabi); + + } else { + CABI_DBG; + cabi_debug_avail_ticks(cabi); + cpu_time_replenish_init(cabi); + + if (is_running) + cabi_start_account(cabi); + } + + +} + +void +cyc_statistic_calculation(cabi_account_t cabi) +{ + + cpu_capacity_t c; + + ENTER; + + /* update statistics */ + cabi->cpu_period_prev_used_ticks = cabi->cpu_period_used_ticks; + cabi->cpu_total_used_ticks += cabi->cpu_period_used_ticks; + + /* check capacity */ + c = capacity_of(cabi->cpu_period_used_ticks, + cabi->cpu_period_ticks); + + /* debug */ + cabi_debug_capacity(c); + + if (cabi->cpu_max_utilization < c) { + cabi->cpu_max_utilization = c; + + /* capacity exceeds the max utilization */ + cabi_debug_capacity(c); + + } else if (cabi->cpu_min_utilization > c + || cabi->cpu_min_utilization == 0) { + + cabi->cpu_min_utilization = c; + + /* capacity: less than the min util */ + cabi_debug_capacity(c); + + } + + /* summation for average */ + cabi->cpu_average.total_utils += c; + cabi->cpu_average.total_count++; + + EXIT; +} + +extern cabi_account_t cabi_current_account; +void cyc_isr (cabi_account_t); +void cyc_isr_operation (cabi_account_t); + +#include +#include +#include + +void +cyc_isr (cabi_account_t cabi) +{ + + ENTER; + + if (unlikely(irqs_disabled())) { + printk(KERN_ERR "BUG3: CABI %s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + } + + if (cabi_account_depleted(cabi)) + cyc_isr_operation(cabi); + + EXIT; +} +
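/*
 * Editorial sketch (not part of the patch): under REP_HARD in
 * cyc_account_status_out() above, the new budget is the full cpu_time
 * plus the unused slack from the previous period (or minus the overrun),
 * applied on alternating periods via the "bonus" toggle. The same
 * arithmetic in isolation, mirroring the code above including its
 * implicit assumption that an overrun never exceeds cpu_time:
 */
static unsigned long long
rep_hard_budget(unsigned long long cpu_time, unsigned long long used)
{
        unsigned long long avail = cpu_time;    /* replenish to full */

        if (cpu_time > used)
                avail += cpu_time - used;       /* return unused slack */
        else if (cpu_time < used)
                avail -= used - cpu_time;       /* pay back the overrun */
        return avail;
        /* e.g. cpu_time = 10ms, used = 6ms  -> next budget 14ms;
         *      cpu_time = 10ms, used = 13ms -> next budget  7ms */
}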
+void +cyc_isr_operation (cabi_account_t cabi) +{ + + ENTER; + + if (unlikely(irqs_disabled())) { + printk(KERN_ERR "BUG2: CABI %s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + } + switch (cabi->pm.operation) { + case OP_BLOCK: + CABI_DBG; + cabi->opt_state = CABI_BLOCKED; + cabi->block_count++; + cabi_account_sleep_on(cabi); + break; + case OP_SIGNAL: + CABI_DBG; + if (cabi->opt_state != CABI_SIGNAL) + cabi->opt_state = CABI_SIGNAL; + if (!CABI_SIGFLAG(cabi)) { + if (!cabi->signal_block) { + cabi_send_signal( + cabi, current->pid, + CABI_SIGNUM(cabi)); + } + } else { + if (!cabi->signal_block) { + cabi_send_signal( + cabi, CABI_SIGPID(cabi), + CABI_SIGNUM(cabi)); + } + } + break; + default: + cabi_current_account = NULL_ACCOUNT; + CABI_DBG; + cabi_debug ("cabi_ret_with_reschedule:" + "cabi(0x%x) operation(%d)\n", + (int) cabi, cabi->pm.operation); + + break; + } + EXIT; +} + diff -urN ./linux-2.6.18.1/drivers/cabi/cabi_debug.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_debug.c --- ./linux-2.6.18.1/drivers/cabi/cabi_debug.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_debug.c 2007-06-17 00:25:02.000000000 +0900 @@ -0,0 +1,607 @@ +/* + * linux/drivers/cabi/cabi_debug.c + * + * CABI -- CPU Accounting and Blocking Interfaces. + * + * Copyright (C) OS Research Group, Waseda University + * Nakajima Laboratory. + * + * 2007-04 Midori Sugaya + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * For test + */ +int cabi_len = 0; +int cabi_debug_counter = 0; +unsigned long *start_address; +int cabi_dump_flag = 0; +int cabi_mask = 0x3ff; +int serial_number = 0; +/* + * 0x3f (63) + * 0x7f (127) + * 0xff (255) + * 0x1ff (511) + * 0x3ff (1023) + * 0xfff (4095) + * 0x13ff (5120) 1024*5 + */ +struct cabi_dump_struct +{ + int counter; + int logid; + unsigned long long tsc_time; + pid_t pid; + unsigned long cabi_id; +}; +struct cabi_dump_struct *cs; + +unsigned long buffer_counter = 0; +unsigned long serial_counter = 0; + +int cabi_dump_init (void) +{ + + cs = malloc ((cabi_mask+1) * (sizeof (struct cabi_dump_struct))); + + if (!cs) { + printk ("Cannot allocate memory!\n"); + return -1; + } + + bzero(cs, (cabi_mask+1) * (sizeof (struct cabi_dump_struct))); + buffer_counter = 0; + + cabi_dump_flag = 1; + return 0; +} + +void cabi_dump (int logid) +{ + + unsigned long long now; + unsigned long flag; + + if (cabi_dump_flag) { + cabi_spin_lock (flag); + (cs + buffer_counter)->counter = serial_counter++; + (cs + buffer_counter)->logid = logid; + cabi_eval_rdtsc(&now); + (cs + buffer_counter)->tsc_time = now; + (cs + buffer_counter)->pid = 0; + (cs + buffer_counter)->cabi_id = 0; + buffer_counter++; + buffer_counter &= cabi_mask; + cabi_spin_unlock (flag); + } +} + +void cabi_dump_sched (int logid, struct task_struct *tsk) +{ + + unsigned long long now; + unsigned long flag; + + cabi_account_t cabi = TASK_ACCOUNT(tsk); + + + if (cabi_dump_flag) { + cabi_spin_lock (flag); + (cs + buffer_counter)->counter = serial_counter++; + (cs + buffer_counter)->logid = logid; + cabi_eval_rdtsc(&now); + (cs + buffer_counter)->tsc_time = now; + (cs + buffer_counter)->pid = tsk->pid; + + if (cabi == NULL) { + (cs + buffer_counter)->cabi_id = 0; + } else { + (cs + buffer_counter)->cabi_id = cabi->cabi_id; + } + buffer_counter++; + buffer_counter &= cabi_mask; + cabi_spin_unlock (flag); + } +} + +void cabi_dump_end (void) +{ + unsigned long flag; + cabi_dump_flag = 0; + buffer_counter = 0; + + cabi_spin_lock(flag); + free (cs); + cabi_spin_unlock(flag); + +}
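/*
 * Editorial sketch (not part of the patch): the dump buffer above is a
 * power-of-two ring; "index & cabi_mask" replaces a modulo, which is why
 * cabi_mask must be of the form (2^n - 1). The same pattern, standalone:
 */
#define RING_MASK 0x3ff                 /* 1024 slots, as with cabi_mask */

struct ring {
        unsigned long head;             /* monotonically increasing */
        int slot[RING_MASK + 1];
};

static void ring_put(struct ring *r, int v)
{
        r->slot[r->head & RING_MASK] = v;   /* wraps automatically */
        r->head++;
}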
+void +cabi_dump_ex (int logid, struct task_struct *tsk, + cabi_account_t cabi) +{ + + unsigned long long now; + + cabi_eval_rdtsc(&now); + + if (cabi_dump_flag) { + (cs + buffer_counter)->counter = serial_counter++; + (cs + buffer_counter)->logid = logid; + (cs + buffer_counter)->tsc_time = now; + if (tsk) + (cs + buffer_counter)->pid = tsk->pid; + else + (cs + buffer_counter)->pid = 0; + if (cabi) + (cs + buffer_counter)->cabi_id = cabi->cabi_id; + else + (cs + buffer_counter)->cabi_id = 0; + + buffer_counter++; + buffer_counter &= cabi_mask; + } +} + +unsigned long read_counter; +void cabi_dump_read (int flag) +{ + + unsigned long temp_counter; + unsigned long local_counter; + unsigned long flags; + int i; + + cabi_dump_flag = 0; + local_counter = 0; + + cabi_spin_lock(flags); + + if (flag) { + temp_counter = buffer_counter; + temp_counter &= cabi_mask; + printk("logid = %d\n", (cs+temp_counter)->logid); + + if ((cs + temp_counter)->logid == 0) { + printk("READ_0\n"); + read_counter = 0; + } else { + printk("READ_1\n"); + read_counter = buffer_counter; + } + } + printk ("CABI-DUMP: LOOP %d\n", cabi_mask+1); + for (i = 0; i < cabi_mask+1; i++) { + local_counter++; + + if ((cs+read_counter)->pid) { + printk ("CABI-DMPRD:%lu %lu %d %llu %d %lu\n", + local_counter, + (cs + read_counter)->counter, + (cs + read_counter)->logid, + (cs + read_counter)->tsc_time, + (cs + read_counter)->pid, + (cs + read_counter)->cabi_id); + } else { + printk ("CABI-DMPRD:%lu %lu %d %llu\n", + local_counter, + (cs + read_counter)->counter, + (cs + read_counter)->logid, + (cs + read_counter)->tsc_time); + } + read_counter++; + read_counter &= cabi_mask; + } + cabi_spin_unlock(flags); +} + + + +void +__debug_timespecs (struct timespec *c, struct timespec *t) +{ + cabi_debug("[debug_timespec] c (%lu %lu) t (%lu %lu)\n", + (unsigned long)c->tv_sec, + (unsigned long)c->tv_nsec, + (unsigned long)t->tv_sec, + (unsigned long)t->tv_nsec); +} + +void +cabi_debug_timespecs (struct timespec *c, struct timespec *t) +{ + __debug_timespecs (c, t); +} + +void +cabi_debug_timespec (struct timespec *t) +{ + + cabi_debug ("[debug_timespec] t (%lu %lu)\n", + (unsigned long)t->tv_sec, + (unsigned long)t->tv_nsec); +} + + +void cabi_debug_ct(cpu_capacity_quad_t qc, cpu_capacity_quad_t qt) +{ + cabi_debug("[debug_ct] qc %llu qt %llu\n", qc, qt); +} + +void +cabi_debug_capacity (cpu_capacity_t capacity) +{ + cabi_debug ("[capacity] capacity %lu.%04lu\n", + CAPACITY_INT(capacity), + CAPACITY_FRAC(capacity)); +} + + +void +cabi_debug_sig_param (cabi_account_t cabi) +{ + cabi_debug ("[cabi_sig_param] signal : pid %d sig %d flag %d\n", + (int) CABI_SIGPID(cabi), (int) CABI_SIGNUM(cabi), + (int) CABI_SIGFLAG(cabi)); +} + +void +cabi_sanity_check_euid (struct task_struct *tsk) +{ + + cabi_debug("cabi_create: Permission denied.
EUID [%d], UID [%d]\n", + tsk->euid, tsk->uid); +} + +int +pid_check (pid_t pid) +{ + if (pid < 0) { + cabi_debug("[bind_pid]: invalid pid (%d)\n", + (int) pid ); + return 1; + } + return 0; +} + +int +pgid_check (pid_t pgid) +{ + if (pgid == 0 || pgid < 0) { + cabi_debug("[bind_pgid]: invalid pgid %d.\n", + (int) pgid); + return 1; + } + return 0; +} + +int +cabi_id_check (unsigned long cabi_id) +{ + if (cabi_id == 0 || cabi_id == OVERLOAD_CABI_ID) { + cabi_debug("invalid cabi id %d\n", + (int) cabi_id); + return 1; + } + return 0; +} + +void +bind_pid_error (pid_t pid) +{ + cabi_debug ("[bind_pid] error invalid pid(%d)\n", pid); +} + + +void +cabi_account_attached(void) +{ + + cabi_debug("account_attach_process: account already " + "attached pid(%d) cabi(0x%x) [%d]\n", + current->pid, (int) TASK_ACCOUNT(current), + (int) TASK_ACCOUNT(current)->cabi_id); +} + +int +cabi_sanity_check_ucabi (struct cabi_uaccount *user_cabi) +{ + + ENTER; + + + if (user_cabi->cpu_time.tv_sec <= 0 && + user_cabi->cpu_time.tv_nsec <= 0) { + cabi_debug("cpu_time sec %lu nsec %lu\n", + user_cabi->cpu_time.tv_sec, + user_cabi->cpu_time.tv_nsec); + return CABI_EINVAL; + } + + CABI_DBG; + if (user_cabi->cpu_period.tv_sec <= 0 && + user_cabi->cpu_period.tv_nsec <= 0) { + cabi_debug("period sec %lu nsec %lu\n", + user_cabi->cpu_period.tv_sec, + user_cabi->cpu_period.tv_nsec); + return CABI_EINVAL; + } + + CABI_DBG; + if (user_cabi->pm.operation == OP_BLOCK) { + if (user_cabi->pm.operation <= OP_NONE || + user_cabi->pm.operation >= OP_UNKNOWN) + return CABI_EINVAL; + + CABI_DBG; + if (user_cabi->pm.bind_proc_type < BIND_NORMAL_PROC || + user_cabi->pm.bind_proc_type > BIND_IDLE_PROC) + return CABI_EINVAL; + } + + if (user_cabi->pm.operation == CABI_SIGNAL) { + CABI_DBG; + if (user_cabi->pm.operation == OP_SIGNAL) { + if (user_cabi->pm.cabi_signal.pid < 0) + return CABI_EINVAL; + if (user_cabi->pm.cabi_signal.sig < 0 || + user_cabi->pm.cabi_signal.sig > 32) + return CABI_EINVAL; + } + } + EXIT; + return CABI_SUCCESS; +} + +int +cabi_sanity_check_operation (struct cabi_uaccount *ucabi) +{ + ENTER; + switch (ucabi->pm.operation) { + case OP_BLOCK: + CABI_DBG; + if (ucabi->pm.operation != OP_BLOCK) + return -1; + break; + case OP_SIGNAL: + CABI_DBG; + if (ucabi->pm.operation != OP_SIGNAL) + return -1; + + /* check signal validatations */ + if (!ucabi->pm.cabi_signal.pid) { + ucabi->pm.cabi_signal.flag = CABI_SEND_DEFL; + } else { + ucabi->pm.cabi_signal.flag = CABI_SEND_PID; + } + if (!ucabi->pm.cabi_signal.sig) { + /* if signal number is null, set default + signal which will do nothing. 
*/ + ucabi->pm.operation = OP_NONE; + } + break; + default: + break; + } + EXIT; + + return 0; +} + +int +cabi_sanity_check_timespec_c_and_t ( + struct timespec *cpu_time, + struct timespec *cpu_period) +{ + + long time_sec, time_nsec, period_sec, period_nsec; + + ENTER; + + /* Initialize and check the execution time parameter */ + time_sec = time_nsec = 0; + time_sec = cpu_time->tv_nsec / NANOSEC; + time_nsec = cpu_time->tv_nsec - time_sec * NANOSEC; + + if (time_sec > (long)(CABI_TIME_SEC_MAX - cpu_time->tv_sec)) { + cabi_debug("cpu_time is over %ld\n", time_sec); + return CABI_EINVAL; + } + time_sec += cpu_time->tv_sec; + + /* Initialize and check the period parameter */ + period_sec = period_nsec = 0; + period_sec = cpu_period->tv_nsec / NANOSEC; + period_nsec = cpu_period->tv_nsec - period_sec * NANOSEC; + + + if (period_sec > (long)(CABI_TIME_SEC_MAX - cpu_period->tv_sec)) { + cabi_debug ("cpu_period is over max %ld\n", period_sec); + return CABI_EINVAL; + } + period_sec += cpu_period->tv_sec; + + + if (time_sec > period_sec) { + cabi_debug ("time_sec > period_sec error.\n"); + return CABI_EINVAL; + } else if (time_sec == period_sec) { + if (time_nsec > period_nsec) { + cabi_debug ("time_nsec > period_nsec error.\n"); + return CABI_EINVAL; + + } + } + EXIT; + return CABI_SUCCESS; +} + +int +cabi_sanity_check_overload_id (void) +{ + + cabi_account_t cabi; + + if ((cabi = search_cabi(OVERLOAD_CABI_ID)) != NULL) { + cabi_debug ("[%d] idle cabi_id : %d\n", __LINE__, + (int)cabi->cabi_id); + return CABI_EINVAL; + } + return CABI_SUCCESS; +} + +int +cabi_sanity_check_sigoperation (struct cabi_uaccount *user_cabi) +{ + if (user_cabi->pm.operation != OP_SIGNAL) { + cabi_debug("invalid operation\n"); + return CABI_EINVAL; + } + cabi_debug ("[sys_create] idle cabi act : %d\n", + user_cabi->pm.operation); + return CABI_SUCCESS; +} + +void +cabi_sanity_check_parameter_to_copy (struct timespec *c, + struct timespec *t, struct cpu_param *p) +{ + cabi_debug ("[sys_create: %d] c(%d), t(%d), operation(%x)" + " bind_proc_type (%x)\n", + __LINE__, + (int) c->tv_nsec, + (int) t->tv_nsec, + p->operation, + p->bind_proc_type); +} + + +/* cabi account attach */ +void cabi_debug_info (cabi_account_t cabi) +{ + CABI_DBG; + cabi_debug("cabi %x, id %lu\n", + (int)cabi, cabi->cabi_id); +} + +void cabi_debug_state (cabi_account_t cabi) +{ + CABI_DBG; + cabi_debug ("cabi_cpu_state 0x%x\n", (int) cabi->cpu_state); +} + +void cabi_debug_detach (struct rs_proc_list *rs_proc) +{ + CABI_DBG; + cabi_debug("[detach] rs_proc(0x%x) pid (%d)\n", + (int) rs_proc, rs_proc->rs_proc_pid); +} + +/* replenish */ +void cabi_debug_ticks (cabi_account_t cabi) +{ + CABI_DBG; + cabi_debug ("[ticks] used %llu period %llu avail %llu cpu_time %llu\n", + cabi->cpu_period_used_ticks, + cabi->cpu_period_ticks, + cabi->cpu_period_available_ticks, + cabi->cpu_time_ticks); +} + +void cabi_debug_avail_ticks (cabi_account_t cabi) +{ + cabi_debug ("[avail_ticks] used %llu avail %llu cpu_time %llu\n", + cabi->cpu_period_used_ticks, + cabi->cpu_period_available_ticks, + cabi->cpu_time_ticks); +} + +void cabi_debug_tick (unsigned long long tick) +{ + cabi_debug("tick %llu\n", tick); +} + +/* cabi_sched.c */ +void cabi_debug_entities (cabi_account_t cabi01, + cabi_account_t cabi02) +{ + cabi_debug("cabi %x cabi %x\n", + (int) cabi01, (int) cabi02); +} + +/* cabi_signal.c */ +void cabi_debug_signal_pid (int pid) +{ + cabi_debug ("cabi_send_signal: invalid pid (%d).\n", pid); + cabi_debug ("No receiving process. (%d)\n", pid); +}
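/*
 * Editorial sketch (not part of the patch): the checks in
 * cabi_sanity_check_timespec_c_and_t() above fold an oversized tv_nsec
 * into whole seconds before comparing C against T. The same normalization
 * in isolation, with NANOSEC = 10^9 as the code above assumes:
 */
#define EX_NANOSEC 1000000000L

static void normalize_ts(long *sec, long *nsec)
{
        long carry = *nsec / EX_NANOSEC;  /* whole seconds hidden in nsec */

        *sec += carry;
        *nsec -= carry * EX_NANOSEC;
        /* e.g. {1s, 1500000000ns} becomes {2s, 500000000ns} */
}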
(%d)\n", pid); +} + +void cabi_debug_signal_attrs (int pid, int sig, struct task_struct *tsk) +{ + cabi_debug ("cabi_send_signal: pid %d sig %d \n", pid, sig); + cabi_debug ("pid %d status %d\n", + (int) tsk->pid, (int)tsk->state); + cabi_debug ("signal_struct: count %d\n", + (int)tsk->signal->count.counter); +} + +/* cabi_timer.c */ +void cabi_debug_times (struct timer_list *tmr, unsigned long jiffies) +{ + cabi_debug (" expires %lu, jiffies %lu)\n", + tmr->expires, jiffies); +} + +void cabi_debug_timer (struct timer_list *tmr) +{ + cabi_debug (" expires %lu \n", tmr->expires); +} + +/* kernel/sched.c */ +void cabi_debug_ksched (struct task_struct *prev, + struct task_struct *next) +{ + + cabi_account_t prev_cabi = TASK_ACCOUNT(prev); + cabi_account_t next_cabi = TASK_ACCOUNT(next); + + /* check */ + if (prev_cabi) { + cabi_debug ("\n[ksched:01] prev_cabi[%d] prev[%d] next[%d]\n", + (int) prev_cabi->cabi_id, + prev->pid, next->pid); + } + if (next_cabi) { + cabi_debug ("[ksched:02]next_cabi[%d] prev[%d] next[%d]\n", + (int) next_cabi->cabi_id, + prev->pid, next->pid); + } + + if (prev->pid == 0 || next->pid == 0) { + if (TASK_ACCOUNT(prev) || TASK_ACCOUNT(next)) { + cabi_debug ("[ksched:03] prev[%d] next[%d]\n", + prev->pid, next->pid); + } + } +} + +/* cabi dsv */ + +void check_sched_param (struct rs_proc_list *rs_proc) +{ + + struct sched_param param; + sys_sched_getparam (rs_proc->rs_proc_task->pid, ¶m); + cabi_debug ("[boost:] pid %d prio %d\n", + rs_proc->rs_proc_task->pid, + (int) param.sched_priority); +} + diff -urN ./linux-2.6.18.1/drivers/cabi/cabi_defsrv.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_defsrv.c --- ./linux-2.6.18.1/drivers/cabi/cabi_defsrv.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_defsrv.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,458 @@ +/* + * CABI -- CPU Accounting and Blocking Interfaces. + * + * Copyright (C) OS Research Group, Waseda University + * / Nakajima Laboratory. MontaVista Software, Inc. + * + * Midori Sugaya + * + */ + +#include +#include +#include +#include +#include + +/* + * -------------------------------------------------------------- + * Policy : Defferrable Server Algorithm + * --------------------------------------------------------------- + * Status : RUNNING running when the replenish timer expires. 
+ * : DEPLETED depleted when the replenish timer expires, + * : NULL + * --------------------------------------------------------------- + * Operation : BLOCK change status of the operation + * : SIGNAL change status of the operation + * : NONE nothing to do + * --------------------------------------------------------------- + * Priority : Boost until cpu_time + * --------------------------------------------------------------- + */ + +extern cabi_account_t cabi_current_account; +int dsv_is_running = 0; +extern int sched_setscheduler (struct task_struct *p, int policy, + struct sched_param *param); +extern spinlock_t cabi_lock; + + +/* prototypes */ +void dsv_replenish (cabi_account_t); +void dsv_replenish_capacity (cabi_account_t); +void dsv_account_status_in (cabi_account_t); +void dsv_account_status_out (cabi_account_t); +void dsv_statistic_calculation (cabi_account_t); +void dsv_replenish_operation (cabi_account_t); +void dsv_replenish_priority_boost (cabi_account_t); +void dsv_isr (cabi_account_t); +void dsv_isr_operation (cabi_account_t); +void dsv_isr_priority (cabi_account_t); +int setup_boost_priority (cabi_account_t); +int free_boost_priority (cabi_account_t); +int cabi_sched_setscheduler (pid_t, int policy, struct sched_param *); + + +/* operation table */ +struct cabi_account_operations dsv_ops = { + .replenish = dsv_replenish, + .isr = dsv_isr, + .isr_operation = dsv_isr_operation, + .isr_priority = dsv_isr_priority, +}; + +void +dsv_replenish (cabi_account_t cabi) +{ + ENTER; + dsv_account_status_in(cabi); + dsv_statistic_calculation (cabi); + dsv_replenish_capacity(cabi); + dsv_account_status_out (cabi); + EXIT; +} + +void +dsv_replenish_capacity (cabi_account_t cabi) +{ + ENTER; + + /* debug */ + cabi_debug_ticks(cabi); + cabi_debug_info(cabi); + + /* reset the used ticks */ + cabi->cpu_period_used_ticks = 0; + EXIT; +} + +void +dsv_stop_account (cabi_account_t cabi) +{ + cpu_tick_data_t now; + + ENTER; + cabi_rdticks(&now); + cabi_stop_account(cabi, &now); + dsv_is_running++; + EXIT; +} +void +dsv_account_status_in (cabi_account_t cabi) +{ + + ENTER; + /* it should be changed to the initial status, CABI_IS_NULL. */ + dsv_stop_account(cabi); + cabi->cpu_state &= ~CABI_IS_DEPLETED; + EXIT; +} + + + + +void +dsv_replenish_operation (cabi_account_t cabi) +{ + + ENTER; + + /* even if an operation is set, when the policy is dsv + only one operation should be performed.
*/ + + switch (cabi->pm.operation) { + case OP_BLOCK: + cabi_debug("[op_block] op: %d\n", + cabi->pm.operation); + break; + case OP_SIGNAL: + cabi_debug("[op_signal] op: %d\n", + cabi->pm.operation); + break; + case OP_NONE: + cabi_debug("[op_boost] op: %d\n", + cabi->pm.operation); + break; + default: + break; + } + EXIT; +} + +void +dsv_cpu_time_replenish_init (cabi_account_t cabi) +{ + + cabi->cpu_period_available_ticks = cabi->cpu_time_ticks; +} + +extern int bonus; + +void +dsv_account_status_out (cabi_account_t cabi) +{ + + unsigned long long avail = cabi->cpu_period_available_ticks; + unsigned long long used = cabi->cpu_period_prev_used_ticks; + unsigned long long cpu_time = cabi->cpu_time_ticks; + + ENTER; + + cabi->priority_boost = 1; + cabi->priority_depleted = 1; + + + if (avail > 0) { + switch (cabi->pm.rep_policy) { + case REP_SOFT: + CABI_DBG; + if (avail != cabi->cpu_time_ticks) { + cabi_debug_ticks (cabi); + dsv_cpu_time_replenish_init(cabi); + } + break; + case REP_HARD: + CABI_DBG; + if (bonus) { + bonus = 0; /* reset the toggle */ + dsv_cpu_time_replenish_init(cabi); + break; + } else { + bonus++; + if (cpu_time > used) { + CABI_DBG; + /* used less than the capacity */ + dsv_cpu_time_replenish_init(cabi); + cabi->cpu_period_available_ticks + += (cpu_time - used); + cabi_debug_avail_ticks (cabi); + } else if (cpu_time < used) { + CABI_DBG; + /* used more than the capacity */ + dsv_cpu_time_replenish_init(cabi); + cabi->cpu_period_available_ticks + -= (used - cpu_time); + cabi_debug_avail_ticks (cabi); + } else { + CABI_DBG; + /* avail and used are equal */ + cabi_debug_avail_ticks (cabi); + } + } + break; + default: + /* if nothing is set, + this path is taken. */ + break; + } + dsv_replenish_operation(cabi); + + } else { + CABI_DBG; + cabi_debug_avail_ticks(cabi); + dsv_cpu_time_replenish_init(cabi); + } + + if (dsv_is_running) { + cabi_start_account(cabi); + } + + + + EXIT; +} + +void +dsv_statistic_calculation(cabi_account_t cabi) +{ + + cpu_capacity_t c; + ENTER; + + /* update statistics */ + cabi->cpu_period_prev_used_ticks + = cabi->cpu_period_used_ticks; + cabi->cpu_total_used_ticks + += cabi->cpu_period_used_ticks; + + /* check capacity */ + c = capacity_of(cabi->cpu_period_used_ticks, + cabi->cpu_period_ticks); + + /* debug */ + cabi_debug_capacity(c); + + if (cabi->cpu_max_utilization < c) { + cabi->cpu_max_utilization = c; + + /* capacity exceeds the max utilization */ + cabi_debug_capacity(c); + + } else if (cabi->cpu_min_utilization > c + || cabi->cpu_min_utilization == 0) { + + cabi->cpu_min_utilization = c; + + /* capacity: less than the min util */ + cabi_debug_capacity(c); + + } + + /* summation for average */ + cabi->cpu_average.total_utils += c; + cabi->cpu_average.total_count++; + + EXIT; +}
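/*
 * Editorial sketch (not part of the patch): capacity_of() above yields a
 * scaled fixed-point utilization; the %lu.%04lu format used by the proc
 * code suggests a scale of 10000 (1.0000 == 100% of the period). Under
 * that assumption, a standalone equivalent (the my_ names are made up to
 * avoid colliding with the patch's own macros):
 */
typedef unsigned long my_capacity_t;

#define MY_CAPACITY_SCALE 10000UL
#define MY_CAPACITY_INT(c)  ((c) / MY_CAPACITY_SCALE)
#define MY_CAPACITY_FRAC(c) ((c) % MY_CAPACITY_SCALE)

static my_capacity_t my_capacity_of(unsigned long long used,
                                    unsigned long long period)
{
        /* guard the division; 3ms used of a 10ms period -> 3000 ("0.3000") */
        return period ? (my_capacity_t)(used * MY_CAPACITY_SCALE / period) : 0;
}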
+void +dsv_isr (cabi_account_t cabi) +{ + + ENTER; + dsv_isr_operation(cabi); + EXIT; +} + +void +dsv_isr_operation (cabi_account_t cabi) +{ + + ENTER; + /* whether or not an operation is set, we apply the + same operation to the process */ + + switch (cabi->pm.operation) { + case OP_NONE: + CABI_DBG; + break; + case OP_BLOCK: + CABI_DBG; + break; + case OP_SIGNAL: + CABI_DBG; + if (cabi->opt_state != CABI_SIGNAL) + cabi->opt_state = CABI_SIGNAL; + if (!CABI_SIGFLAG(cabi)) { + if (!cabi->signal_block) { + cabi_send_signal( + cabi, current->pid, + CABI_SIGNUM(cabi)); + } + } else { + if (!cabi->signal_block) { + cabi_send_signal( + cabi, CABI_SIGPID(cabi), + CABI_SIGNUM(cabi)); + } + } + break; + default: + cabi_current_account = NULL_ACCOUNT; + break; + } + dsv_isr_priority (cabi); + + EXIT; +} + +void +dsv_isr_priority (cabi_account_t cabi) +{ + + ENTER; + CABI_DBG; + + /* if this function is called from replenish_account, + the priority_boost bit is set, so we set up the + priority boost */ + if (cabi->priority_boost) { + setup_boost_priority(cabi); + cabi->priority_boost = 0; + + } + + /* if the cabi is depleted, the flag is set + the first time */ + if (cabi->priority_depleted) { + if (cabi->cpu_state & CABI_IS_DEPLETED) { + free_boost_priority (cabi); + cabi->priority_depleted = 0; + } + + } + EXIT; +} + +int +free_boost_priority (cabi_account_t cabi) +{ + + struct sched_param param = { .sched_priority = 0 }; + struct rs_proc_list *rs_proc; + struct list_head *proc_list; + + ENTER; + + cabi_debug_state(cabi); + if (!list_empty(&cabi->cpu_proc_list)) { + proc_list = cabi->cpu_proc_list.next; + while (proc_list != &cabi->cpu_proc_list) { + rs_proc = list_entry (proc_list, + struct rs_proc_list, rs_proc_list); + + /* if there are tasks in the list + * set static priority + */ + if (rs_proc->rs_proc_task) { + //check_sched_param (rs_proc->rs_proc_task); + cabi_sched_setscheduler( + rs_proc->rs_proc_task->pid, + SCHED_NORMAL, &param); + } + proc_list = proc_list->next; + } + } + EXIT; + return 0; + +} + +int +cabi_sched_setscheduler(pid_t pid, + int policy, struct sched_param *param) +{ + struct task_struct *p; + int retval; + + ENTER; + + if (!param || pid < 0) + return -EINVAL; + + p = __cabi_find_process_by_pid (pid); + if (!p) { + read_unlock_irq(&tasklist_lock); + return -ESRCH; + } + + retval = sched_setscheduler(p, policy, param); + + EXIT; + + return retval; +} + + +int +setup_boost_priority (cabi_account_t cabi) +{ + + struct rs_proc_list *rs_proc = NULL; + struct list_head *proc_list = NULL; + struct sched_param param; + + ENTER; + + if (!list_empty(&cabi->cpu_proc_list)) { + + proc_list = cabi->cpu_proc_list.next; + while (proc_list != &cabi->cpu_proc_list) { + rs_proc = list_entry (proc_list, + struct rs_proc_list, rs_proc_list); + /* if there are tasks in the list + * set static priority + */ + if (rs_proc->rs_proc_task) { + + //check_sched_param (rs_proc->rs_proc_task); + + param.sched_priority = + CABI_RT_MAX_PRIORITY; + + cabi_sched_setscheduler ( + rs_proc->rs_proc_task->pid, + SCHED_FIFO, &param); + } + proc_list = proc_list->next; + } + } + + EXIT; + return 0; +} +
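/*
 * Editorial sketch (not part of the patch): the deferrable server above
 * raises every bound task to SCHED_FIFO at replenish time and drops it
 * back to SCHED_NORMAL once the account is depleted. Reduced to the
 * in-kernel API pair it relies on (CABI_RT_MAX_PRIORITY is the patch's
 * own constant):
 */
static void ex_boost(struct task_struct *p)
{
        struct sched_param sp = { .sched_priority = CABI_RT_MAX_PRIORITY };

        sched_setscheduler(p, SCHED_FIFO, &sp);     /* enter the RT class */
}

static void ex_unboost(struct task_struct *p)
{
        struct sched_param sp = { .sched_priority = 0 };

        sched_setscheduler(p, SCHED_NORMAL, &sp);   /* back to timesharing */
}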
diff -urN ./linux-2.6.18.1/drivers/cabi/cabi_init.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_init.c --- ./linux-2.6.18.1/drivers/cabi/cabi_init.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_init.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,154 @@ + /* + * linux/drivers/cabi/cabi_init.c + * + * CABI -- CPU Accounting and Blocking Interfaces. + * + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * + * This software was developed by the Waseda University and Montavista Software, + * Inc. Funding for this project was provided by IPA (Information-technology + * Promotion Agency, Japan). This software may be used and distributed + * according to the terms of the GNU Public License, incorporated herein by + * reference. + * + * This project was developed under the direction of Dr. Tatsuo Nakajima. + * + * Authors: Midori Sugaya, Hirotaka Ishikawa + * + * Please send bug-reports/suggestions/comments to qos@dcl.info.waseda.ac.jp + * + * Further details about this project can be obtained at + * http://dcl.info.waseda.ac.jp/osrg/ + * + * This is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This software is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this software; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This file is derived from software distributed under the following terms: + */ + /* + * Real-time and Multimedia Systems Laboratory + * Copyright (c) 1999 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Real-Time and Multimedia Systems Laboratory + * Attn: Prof. Raj Rajkumar + * Electrical and Computer Engineering, and Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * or via email to raj@ece.cmu.edu + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */
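/*
 * Editorial sketch (not part of the patch): cabi_cpu_calibration_i386()
 * below measures CPU ticks per second by bracketing CHECK_TIME busy-waited
 * jiffies-seconds with two tick reads, after estimating the overhead of a
 * read from two back-to-back reads. The core computation in isolation:
 */
static unsigned long long
ex_calibrate(unsigned long long begin, unsigned long long end,
             unsigned long long overhead, unsigned int seconds)
{
        /* average ticks elapsed per second of jiffies-measured wall time */
        return (end - begin - overhead) / seconds;
}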
+ +#include +#include + +extern void cabi_account_init(void); +extern void cabi_proc_init(void); + +spinlock_t cabi_lock; +cpu_tick_data_t cabi_cpu_ticks_per_second; +cpu_tick_data_t cabi_cpu_ticks_per_jiffy; +cpu_tick_data_t cabi_timer_adjust; + +#ifdef CONFIG_X86 +#include +#define CHECK_TIME 5 +void +cabi_cpu_calibration_i386(void) +{ + + unsigned long first_jiffies, rem, ctps; + int i = CHECK_TIME; + cpu_tick_data_t begin, end, err, result; + + ENTER; + printk("Cabi_cpu_calibration: Calibrating cpu "); + + cabi_rdticks(&begin); + cabi_rdticks(&end); + err = end - begin; + +/* for debugging */ + /* wait for "start of" clock tick */ + first_jiffies = jiffies; + while (first_jiffies == jiffies) { + /* nothing */ + } + cabi_rdticks(&begin); + while (i-- > 0) { + first_jiffies = jiffies; + printk("."); + while (jiffies - first_jiffies < HZ) { + + } + } + cabi_rdticks(&end); + /* */ + + result = (end - begin - err); + cabi_cpu_ticks_per_second = result; + + /* use 64bit interface */ + ctps = div_long_long_rem(result, CHECK_TIME, &rem); + /* set the calculated result */ + cabi_cpu_ticks_per_second = (unsigned long long) ctps; + + + EXIT; +} +#else /* #ifdef CONFIG_X86 */ +void +cabi_cpu_calibration(void) +{ + printk("cabi initialize...\n"); + cabi_cpu_ticks_per_second = CABI_CLOCK; + { + cpu_tick_data_t t; + t = CABI_USECS_PER_JIFFY; + usec2tick(&t, &cabi_cpu_ticks_per_jiffy); + } +} +#endif /* #ifdef CONFIG_X86 */ + +void +cabi_init(void) +{ + spin_lock_init(&cabi_lock); + cabi_account_init(); +#ifdef CONFIG_X86 + cabi_cpu_calibration_i386(); +#else + cabi_cpu_calibration(); +#endif /* #ifdef CONFIG_X86 */ + +#ifdef CONFIG_PROC_FS + cabi_proc_init(); +#endif +} diff -urN ./linux-2.6.18.1/drivers/cabi/cabi_isr.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_isr.c --- ./linux-2.6.18.1/drivers/cabi/cabi_isr.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_isr.c 2007-06-17 00:26:20.000000000 +0900 @@ -0,0 +1,193 @@ +/* + * linux/drivers/cabi/cabi_isr.c + * + * CABI -- CPU Accounting and Blocking Interfaces. + * + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * + * This software was developed by the Waseda University and Montavista Software, + * Inc. Funding for this project was provided by IPA (Information-technology + * Promotion Agency, Japan). This software may be used and distributed + * according to the terms of the GNU Public License, incorporated herein by + * reference. + * + * This project was developed under the direction of Dr. Tatsuo Nakajima. + * Authors: Midori Sugaya, Hirotaka Ishikawa + * Please send bug-reports/suggestions/comments to doly@dcl.info.waseda.ac.jp + * Further details about this project can be obtained at + * http://dcl.info.waseda.ac.jp/osrg/ + * + * This is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This software is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this software; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This file is derived from software distributed under the following terms: + */ + /* + * Real-time and Multimedia Systems Laboratory + * Copyright (c) 1999 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Real-Time and Multimedia Systems Laboratory + * Attn: Prof. Raj Rajkumar + * Electrical and Computer Engineering, and Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * or via email to raj@ece.cmu.edu + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern cabi_account_t cabi_current_account; +/* + * Hooks + */ +int (*cabi_ret_with_reschedule_hook) (void); + +/* + * Checks + */ +void cabi_debug_current (cabi_account_t cabi_current) +{ + if (cabi_current) { + cabi_debug("current_account %x\n", + (int) cabi_current); + } +} +void +cabi_account_sleep_on (cabi_account_t cabi) +{ + + + if (unlikely(irqs_disabled())) { + printk(KERN_ERR "BUG1: CABI %s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + } + sleep_on (&cabi->depleted_wait); + + +} + +/* + * For "cabi_ret_with_reschedule_hook", cabi_ret_with_reschedule is called when + * a process exits from the kernel after processing a system call. 
+ */ + +asmlinkage int +cabi_ret_with_reschedule(void) +{ + cabi_account_t cabi; + cabi_current_account = (cabi_account_t)TASK_ACCOUNT(current); + + /* set current account */ + if ((cabi = cabi_current_account)) { + cabi->ops->isr (cabi); + } + return 0; +} + +void +cabi_enable_isr(void) +{ + unsigned long flags; + + ENTER; + cabi_debug(" cabi_enable_isr %x reschedule_hook %x reschedule\n", + (int) cabi_ret_with_reschedule_hook, + (int) cabi_ret_with_reschedule); + + if (cabi_ret_with_reschedule_hook == &cabi_ret_with_reschedule) { + cabi_debug("cabi_enable_isr : hook already set.\n"); + return; + } + + cabi_spin_lock(flags); + { + /* install hooks */ + cabi_ret_with_reschedule_hook = &cabi_ret_with_reschedule; + + cabi_debug ("enable_isr: set ret_with_reschedule_hook %x\n" + " <== address %x (reschedule)\n", + (int) cabi_ret_with_reschedule_hook, + (int) cabi_ret_with_reschedule); + + /* start cabi timer */ + cabi_timer_start(); + } + cabi_spin_unlock(flags); + + EXIT; +} + +void +cabi_disable_isr(void) +{ + unsigned long flags; + + ENTER; + if (cabi_ret_with_reschedule_hook == (void *) 0) + return; + + cabi_spin_lock(flags); + { + /* remove hooks */ + cabi_ret_with_reschedule_hook = (void *) 0; + } + cabi_spin_unlock(flags); + EXIT; +} + + +#ifdef CONFIG_PROC_FS + +int +cabi_account_read_block_proc(cabi_account_t cabi, char *buf) +{ + char *p = buf; + + if (cabi->pm.operation == OP_SIGNAL) { + p += sprintf(p, "%d\n%lu\n", cabi->opt_state, + cabi->signal_count); + } else { + p += sprintf(p, "%d\n%lu\n", cabi->opt_state, + cabi->block_count); + } + return (p - buf); +} + +#endif /* CONFIG_PROC_FS */ diff -urN ./linux-2.6.18.1/drivers/cabi/cabi_overload.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_overload.c --- ./linux-2.6.18.1/drivers/cabi/cabi_overload.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_overload.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,432 @@ +/* + * CABI -- CPU Accounting and Blocking Interfaces. + * + * Copyright (C) OS Research Group, Waseda University + * Nakajima Laboratory. MontaVista Software, Inc. + * + * Midori Sugaya + * + */ + +#include +#include +#include +#include + +/* + * -------------------------------------------------------------- + * Policy : Overload Monitoring + * --------------------------------------------------------------- + * Status : RUNNING running when the replenish timer expires. 
+ * : DEPLETED depleted when the replenish timer expires, + * : NULL + * --------------------------------------------------------------- + * Operation : BLOCK change status of the operation + * : SIGNAL change status of the operation + * --------------------------------------------------------------- + * Priority : Not controlled, use the user-defined priority + * --------------------------------------------------------------- + */ + +extern int overload_cabi; +extern cabi_account_t cabi_current_account; +extern cabi_account_t cabi_current_overload_account; +int is_ovl_running; + +void ovl_replenish (cabi_account_t); +void ovl_replenish_capacity(cabi_account_t); +void ovl_account_status_in(cabi_account_t); +void ovl_account_status_out(cabi_account_t); +void ovl_statistic_calculation(cabi_account_t); +void ovl_replenish_operation (cabi_account_t); +void ovl_isr (cabi_account_t); +void ovl_isr_operation (cabi_account_t); + + +struct cabi_account_operations ovl_ops = { + .replenish = ovl_replenish, + .isr = ovl_isr, + .isr_operation = ovl_isr_operation, +}; + +void +ovl_replenish (cabi_account_t cabi) +{ + + ENTER; + ovl_account_status_in(cabi); + ovl_statistic_calculation (cabi); + ovl_replenish_capacity(cabi); + ovl_account_status_out (cabi); + EXIT; + +} + +void +ovl_replenish_capacity (cabi_account_t cabi) +{ + + ENTER; + /* debug */ + cabi_debug_ticks(cabi); + cabi_debug_info(cabi); + + /* reset the used ticks */ + cabi->cpu_period_used_ticks = 0; + EXIT; +} + +void +ovl_stop_account (cabi_account_t cabi) +{ + cpu_tick_data_t now; + + ENTER; + cabi_rdticks(&now); + cabi_stop_account(cabi, &now); + is_ovl_running++; + EXIT; +} +void +ovl_account_status_in (cabi_account_t cabi) +{ + + ENTER; + switch (cabi->cpu_state) { + case CABI_IS_NULL: + /* 00 : no bound process */ + CABI_DBG; + cabi_debug_state(cabi); + break; + case CABI_IS_RUNNING: + /* 01 : passed the period */ + CABI_DBG; + cabi_debug_state(cabi); + ovl_stop_account(cabi); + break; + case CABI_IS_DEPLETED: + /* 10 : correctly accounted */ + CABI_DBG; + cabi_debug_state(cabi); + break; + case CABI_IS_RUNNING|CABI_IS_DEPLETED: + /* 11 : first time to account */ + CABI_DBG; + cabi_debug_state(cabi); + ovl_stop_account(cabi); + break; + default: + /* it should not happen */ + CABI_DBG; + cabi_debug_state(cabi); + break; + } + + /* it should be changed to the initial status, CABI_IS_NULL.
*/ + cabi->cpu_state &= ~CABI_IS_DEPLETED; + + /* check overload condition */ + if (overload_cabi) + cabi_account_check_overload(); + + EXIT; +} + +void +ovl_replenish_operation (cabi_account_t cabi) +{ + ENTER; + + switch (cabi->pm.operation) { + case OP_BLOCK: + cabi_debug("[op_block] op: %d\n", + cabi->pm.operation); + cabi->opt_state = CABI_UNBLOCK; + wake_up(&cabi->depleted_wait); + break; + case OP_SIGNAL: + cabi_debug("[op_signal] op: %d\n", + cabi->pm.operation); + break; + default: + break; + } + EXIT; +} + +int ovl_bonus; + +void +ovl_account_status_out (cabi_account_t cabi) +{ + + unsigned long long avail = cabi->cpu_period_available_ticks; + unsigned long long used = cabi->cpu_period_prev_used_ticks; + unsigned long long cpu_time = cabi->cpu_time_ticks; + + ENTER; + if (avail > 0) { + switch (cabi->pm.rep_policy) { + case REP_SOFT: + CABI_DBG; + if (avail != cabi->cpu_time_ticks) { + cabi_debug_ticks (cabi); + cpu_time_replenish_init(cabi); + } + break; + case REP_HARD: + CABI_DBG; + if (ovl_bonus) { + ovl_bonus = 0; /* reset the toggle */ + cpu_time_replenish_init(cabi); + break; + } else { + ovl_bonus++; + if (cpu_time > used) { + CABI_DBG; + /* used less than the capacity */ + cpu_time_replenish_init(cabi); + cabi->cpu_period_available_ticks + += (cpu_time - used); + cabi_debug_avail_ticks (cabi); + } else if (cpu_time < used) { + CABI_DBG; + /* used more than the capacity */ + cpu_time_replenish_init(cabi); + cabi->cpu_period_available_ticks + -= (used - cpu_time); + cabi_debug_avail_ticks (cabi); + } else { + CABI_DBG; + /* avail and used are equal */ + cabi_debug_avail_ticks (cabi); + } + } + break; + default: + /* if nothing is set, this path is taken. */ + break; + } + ovl_replenish_operation(cabi); + + } else { + CABI_DBG; + cabi_debug_avail_ticks(cabi); + cpu_time_replenish_init(cabi); + + if (is_ovl_running) + cabi_start_account(cabi); + + } + EXIT; +} + + +void +ovl_statistic_calculation(cabi_account_t cabi) +{ + + cpu_capacity_t c; + + ENTER; + /* update statistics */ + cabi->cpu_period_prev_used_ticks = + cabi->cpu_period_used_ticks; + cabi->cpu_total_used_ticks += + cabi->cpu_period_used_ticks; + + /* check capacity */ + c = capacity_of(cabi->cpu_period_used_ticks, + cabi->cpu_period_ticks); + + /* debug */ + cabi_debug_capacity(c); + + if (cabi->cpu_max_utilization < c) { + cabi->cpu_max_utilization = c; + + /* capacity exceeds the max utilization */ + cabi_debug_capacity(c); + + } else if (cabi->cpu_min_utilization > c + || cabi->cpu_min_utilization == 0) { + + cabi->cpu_min_utilization = c; + + /* capacity: less than the min util */ + cabi_debug_capacity(c); + + } + + /* summation for average */ + cabi->cpu_average.total_utils += c; + cabi->cpu_average.total_count++; + + EXIT; +} + + +void +ovl_isr (cabi_account_t cabi) +{ + + ENTER; + if (cabi_account_overload(cabi)) + ovl_isr_operation(cabi); + + EXIT; +} + +void +ovl_isr_operation (cabi_account_t cabi) +{ + + ENTER; + switch (cabi->pm.operation) { + case OP_SIGNAL: + CABI_DBG; + if (cabi->opt_state != CABI_SIGNAL) + cabi->opt_state = CABI_SIGNAL; + if (!cabi->signal_block) { + cabi_send_signal( + cabi, CABI_SIGPID(cabi), + CABI_SIGNUM(cabi)); + } + break; + default: +
cabi_current_account = NULL_ACCOUNT; + CABI_DBG; + cabi_debug ("cabi_ret_with_reschedule:" + "cabi(0x%x) operation(%d)\n", + (int) cabi, cabi->pm.operation); + + break; + } + EXIT; +} + + + + +/* + * Name: capacity_overload + * + * This function returns the new_qc value as a cpu time within + * a period. + */ +void +capacity_overload(struct timespec *c, struct timespec * t, + cpu_capacity_quad_t * overload_qc) +{ + + cpu_capacity_quad_t qc, qt, new_nano_qc; + cpu_capacity_t capacity; + + /* recalculate requested capacity */ + qc = (c->tv_sec * NANOSEC) + c->tv_nsec; + qt = (t->tv_sec * NANOSEC) + t->tv_nsec; + + /* current requested capacity */ + capacity = capacity_of(qc, qt); + cabi_debug("[over] capacity(%lu.%04lu)\n", + CAPACITY_INT(capacity), CAPACITY_FRAC(capacity)); + + new_nano_qc = qt - qc; + cabi_debug("[over] new nano_qc %llu\n", + new_nano_qc); + *overload_qc = new_nano_qc; + cabi_debug("[over] overload_qc (%p),\n", + (cpu_capacity_quad_t *)overload_qc); + + +} + +int setup_overload_id (cabi_account_t cabi) +{ + + + if (!cabi_current_overload_account) { + /* + * set reserved id for the overload cabi + */ + cabi->cabi_id = OVERLOAD_CABI_ID; + cabi_current_overload_account = cabi; + + cabi_debug("[overload] overload id set %ld %x\n", + cabi->cabi_id, (int) cabi_current_overload_account); + + } else { + /* + * if an overload cabi already exists, + * return with an error. + */ + cabi_debug("[ovl_set:02] overload already exists %x. error return.\n", + (int) cabi_current_overload_account); + + free (cabi); + return CABI_ERROR; + } + + return CABI_SUCCESS; +} + +int +cabi_account_check_overload(void) +{ + cabi_account_t cabi = NULL_ACCOUNT; + + /* search overload cabi */ + if (!cabi_current_overload_account) { + return CABI_ENOEXIST; + } else { + cabi = cabi_current_overload_account; + } + + if (cabi != cabi_current_account) + return CABI_ENOAVLE; + + /* compare the total used ticks with available ticks. */ + if (cabi->cpu_period_used_ticks < cabi->cpu_period_available_ticks) { + + cabi_debug("cabi_account_check_overload:" + " cabi(0x%x) used(%lu) < available(%lu)\n", + (int) cabi, + (unsigned long) cabi->cpu_period_used_ticks, + (unsigned long) cabi->cpu_period_available_ticks); + + if (cabi->cabi_id == OVERLOAD_CABI_ID) { + cabi->overload |= CABI_IS_OVERLOAD; + + cabi_debug("cabi state overload(0x%x), signal sent.\n", + (int) cabi->overload); + + cabi_send_signal(cabi, CABI_SIGPID(cabi), + CABI_SIGNUM(cabi)); + } + } else { + if (cabi->cabi_id == OVERLOAD_CABI_ID) { + + cabi_debug("overload(0x%x), no signal sent.\n", + (int) cabi->overload); + + cabi->overload = CABI_IS_NULL; + } + } + return CABI_SUCCESS; +} +
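/*
 * Editorial sketch (not part of the patch): capacity_overload() above must
 * write through its output pointer; reassigning the pointer parameter
 * itself (as an earlier draft of such code might) only changes a local
 * copy, so the caller would never see new_nano_qc. The general pattern:
 */
static void ex_out_param(unsigned long long *out)
{
        unsigned long long local = 42;

        *out = local;           /* visible to the caller */
        /* out = &local;        -- would be lost on return (and dangle) */
}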
diff -urN ./linux-2.6.18.1/drivers/cabi/cabi_procfs.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_procfs.c --- ./linux-2.6.18.1/drivers/cabi/cabi_procfs.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_procfs.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,209 @@ + /* + * linux/drivers/cabi/cabi_procfs.c - implements the interfaces for procfs. + * + * CABI -- CPU Accounting and Blocking Interfaces. + * + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * + * This software was developed by the Waseda University and Montavista Software, + * Inc. Funding for this project was provided by IPA (Information-technology + * Promotion Agency, Japan). This software may be used and distributed + * according to the terms of the GNU Public License, incorporated herein by + * reference. + * + * This project was developed under the direction of Dr. Tatsuo Nakajima. + * + * Authors: Midori Sugaya, Hirotaka Ishikawa + * + * Please send bug-reports/suggestions/comments to doly@dcl.info.waseda.ac.jp + * + * Further details about this project can be obtained at + * http://dcl.info.waseda.ac.jp/osrg/ + * + * This is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This software is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this software; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This file is derived from software distributed under the following terms: + */ + /* + * Real-time and Multimedia Systems Laboratory + * Copyright (c) 1999 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Real-Time and Multimedia Systems Laboratory + * Attn: Prof. Raj Rajkumar + * Electrical and Computer Engineering, and Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * or via email to raj@ece.cmu.edu + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes.
+ */ +#include +#include +#include + +#include + +#define MAX_LEN 10 +#define NR_CABI 1000 +#define ENTRIES 4 + + +#ifdef CONFIG_PROC_FS + +/* proc root directory */ +static struct proc_dir_entry *proc_cabi_dir; +/* proc object_id directories */ +static struct proc_dir_entry *cabi_id_dir [NR_CABI]; + +static int +proc_cabi_status_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + cabi_account_t cabi = (cabi_account_t) data; + return cabi_account_read_proc(cabi, page); +} + +static int +proc_cabi_base_status_read (char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + char *p = page; + extern int cabi_account_base_status_proc(char *); + + p += cabi_account_base_status_proc(p); + + return (p - page); +} + +static int +proc_cabi_block_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + cabi_account_t cabi = (cabi_account_t) data; + return cabi_account_read_block_proc(cabi, page); +} + +static int +proc_cabi_bindpid_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + cabi_account_t cabi = (cabi_account_t) data; + return cabi_account_read_bindpid_proc (cabi, page); +} + +static int +proc_cabi_time_read (char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + cabi_account_t cabi = (cabi_account_t) data; + return cabi_account_read_time_proc(cabi, page); +} + +void __init +cabi_proc_init(void) +{ + struct proc_dir_entry *proc_cabi_status; + + /* proc directory entry */ + proc_cabi_dir = create_proc_entry("cabi", S_IFDIR, 0); + + if (!proc_cabi_dir) { + printk("Cannot create /proc/cabi\n"); + return; + } + + proc_cabi_status = create_proc_entry("cabi_status", S_IFREG | S_IRUGO, + proc_cabi_dir); + if (proc_cabi_status) + proc_cabi_status->read_proc = proc_cabi_base_status_read; +} + + +struct cabi_entry_list { + const char *name; + int (*f)(char *, char **, off_t, int, int *, void *); +} CABI_PROC_ENTRY[] = { + { "status", proc_cabi_status_read}, + { "term_act", proc_cabi_block_read}, + { "bind_pid", proc_cabi_bindpid_read}, + { "time_set", proc_cabi_time_read}, +}; + +void +cabi_register_proc_account(cabi_account_t cabi) +{ + int i; + struct proc_dir_entry *entry; + char string[MAX_LEN]; + + sprintf(string, "%d", (int) cabi->cabi_id); + cabi_id_dir[cabi->cabi_id] = proc_mkdir (string, proc_cabi_dir); + + if (!cabi_id_dir[cabi->cabi_id]) { + printk("Cannot create /proc/cabi/%d\n", (int)cabi->cabi_id); + return; + } + + for (i = 0; i < ENTRIES; i++) { + entry = create_proc_entry (CABI_PROC_ENTRY[i].name, + S_IFREG|S_IRUGO, cabi_id_dir[cabi->cabi_id]); + + if (!entry) { + printk("Cannot create /proc/cabi/%d/%s\n", + (int) cabi->cabi_id, CABI_PROC_ENTRY[i].name); + continue; + } + + entry->nlink = 1; + entry->data = cabi; + entry->read_proc = CABI_PROC_ENTRY[i].f; + } +} + + +void +cabi_proc_account_create(cabi_account_t cabi) +{ + + cabi_debug ("cabi_proc_account_create: cabi(0x%x) cabi_id (%d)\n", + (int) cabi, (int) cabi->cabi_id); + cabi_register_proc_account(cabi); + + +} + +void +cabi_proc_account_destroy(cabi_account_t cabi) +{ + char buf[16]; + + cabi_debug ("cabi_proc_account_destroy: cabi(%d)\n", + (int) cabi->cabi_id); + sprintf(buf, "%d", (int) cabi->cabi_id); + remove_proc_entry(buf, proc_cabi_dir); +} + +#endif /* CONFIG_PROC_FS */
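/*
 * Editorial sketch (not part of the patch): the read_proc handlers above
 * return their whole output on every call and ignore off/count/eof, which
 * only works while the output fits in a single page. A handler that also
 * terminates the read explicitly under the 2.6 read_proc contract would
 * set *eof, e.g.:
 */
static int
ex_proc_read_once(char *page, char **start, off_t off,
                  int count, int *eof, void *data)
{
        int len;

        if (off > 0) {          /* everything went out on the first call */
                *eof = 1;
                return 0;
        }
        len = sprintf(page, "example\n");
        *eof = 1;               /* nothing more to read */
        return len;
}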
diff -urN ./linux-2.6.18.1/drivers/cabi/cabi_sched.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_sched.c --- ./linux-2.6.18.1/drivers/cabi/cabi_sched.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_sched.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,158 @@ + /* + * linux/drivers/cabi/cabi_sched.c + * + * CABI -- CPU Accounting and Blocking Interfaces. + * + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * + * This software was developed by the Waseda University and Montavista Software, + * Inc. Funding for this project was provided by IPA (Information-technology + * Promotion Agency, Japan). This software may be used and distributed + * according to the terms of the GNU Public License, incorporated herein by + * reference. + * + * This project was developed under the direction of Dr. Tatsuo Nakajima. + * Authors: Midori Sugaya, Hirotaka Ishikawa + * Please send bug-reports/suggestions/comments to qos@dcl.info.waseda.ac.jp + * + * Further details about this project can be obtained at + * http://dcl.info.waseda.ac.jp/osrg/ + * + * This is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This software is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this software; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This file is derived from software distributed under the following terms: + */ + /* + * Real-time and Multimedia Systems Laboratory + * Copyright (c) 1999 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Real-Time and Multimedia Systems Laboratory + * Attn: Prof. Raj Rajkumar + * Electrical and Computer Engineering, and Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * or via email to raj@ece.cmu.edu + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ + +#include + + /**/ +/* #undef DEBUG_CABI_SCHED */ + /**/ +extern cabi_account_t cabi_current_account; + +void (*cabi_schedule_hook) (struct task_struct *, struct task_struct *); + +void +cabi_schedule_cpu(struct task_struct *prev, struct task_struct *next) +{ + cpu_tick_data_t now; + cabi_account_t prev_cabi = TASK_ACCOUNT(prev); + cabi_account_t next_cabi = TASK_ACCOUNT(next); + + + if (prev_cabi || next_cabi) { + cabi_debug + ("pid(%d) cabi(0x%x)" + " -> pid(%d) cabi(0x%x)\n", + prev->pid, (int) prev_cabi, + next->pid, (int) next_cabi); + } + + /* + * At this point, if the process belongs to a different + * accounting object (cabi), the enforce timer should be + * stopped.
+ */ + + if (prev_cabi != next_cabi) { + //cabi_enforce_timer_cancel(); + /* if there is a current cabi, stop it */ + + /* debug */ + cabi_debug_entities(cabi_current_account, prev_cabi); + + /* + * if the previously scheduled process belongs to + * prev_cabi, the accounting object (cabi) must be + * checked for depletion through the function + * cabi_stop_account (cabi_account_check_enforce) + */ + if (prev_cabi == cabi_current_account) { + + /* debug */ + cabi_debug_entities (cabi_current_account, + prev_cabi); + + if (cabi_current_account) { + + /* get time and stop accounting */ + cabi_rdticks(&now); + cabi_stop_account(cabi_current_account, &now); + } + } + + /* if there is a cabi for next, start it */ + if ((cabi_current_account = next_cabi)) { + + + /* debug */ + cabi_debug_entities (cabi_current_account, + next_cabi); + /* start accounting */ + cabi_start_account(next_cabi); + } + } + //cabi_current_account = next_cabi; +} + +void +cabi_enable_schedule_cpu(void) +{ + if (!cabi_schedule_hook) { + cabi_schedule_hook = cabi_schedule_cpu; + cabi_debug + ("cabi_enable_schedule_cpu: cabi_schedule_hook enabled\n"); + } +} + +void +cabi_disable_schedule_cpu(void) +{ + if (cabi_schedule_hook) { + cabi_schedule_hook = (void *) 0; + + cabi_debug + ("cabi_disable_schedule_cpu: cabi_schedule_hook disabled\n"); + } +} diff -urN ./linux-2.6.18.1/drivers/cabi/cabi_signal.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_signal.c --- ./linux-2.6.18.1/drivers/cabi/cabi_signal.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_signal.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,118 @@ + /* + * linux/drivers/cabi/cabi_signal.c + * + * CABI -- CPU Accounting and Blocking Interfaces. + * + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * + * This software was developed by Waseda University and MontaVista Software, + * Inc. Funding for this project was provided by IPA (Information-technology + * Promotion Agency, Japan). This software may be used and distributed + * according to the terms of the GNU Public License, incorporated herein by + * reference. + * + * This project was developed under the direction of Dr. Tatsuo Nakajima. + * + * Authors: Midori Sugaya, Hirotaka Ishikawa + * + * Please send bug-reports/suggestions/comments to qos@dcl.info.waseda.ac.jp + * + * Further details about this project can be obtained at + * http://dcl.info.waseda.ac.jp/osrg/ + * + * This is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This software is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this software; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This file is derived from software distributed under the following terms: + */ + /* + * Real-time and Multimedia Systems Laboratory + * Copyright (c) 1999 Carnegie Mellon University + * All Rights Reserved.
+ * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Real-Time and Multimedia Systems Laboratory + * Attn: Prof. Raj Rajkumar + * Electrical and Computer Engineering, and Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * or via email to raj@ece.cmu.edu + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ + +#include +#include + +int +cabi_send_signal(cabi_account_t cabi, int pid, int sig) +{ + + int error; + struct siginfo info; + struct task_struct *tsk; + + /* Set the siginfo */ + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_KERNEL; + info.si_pid = 0; + info.si_uid = 0; + + if (sig <= 0) + return -EINVAL; + + /* This is only valid for a normal process (not the idle process). */ + if (pid <= 0) + return -EINVAL; + + /* search process by pid */ + if (!(tsk = __cabi_find_process_by_pid(pid))) { + + /* debug */ + cabi_debug_signal_pid (pid); + + /* return */ + error = -ESRCH; + return error; + } else { + + /* debug */ + cabi_debug_signal_attrs (pid, sig, tsk); + + /* send signal */ + error = send_sig_info(sig, &info, tsk); + + if (error > 0) { + cabi->signal_block = SIGNAL_ON; + cabi->signal_count++; + } + + } + return error; +} diff -urN ./linux-2.6.18.1/drivers/cabi/cabi_syscalls.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_syscalls.c --- ./linux-2.6.18.1/drivers/cabi/cabi_syscalls.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_syscalls.c 2007-06-17 00:34:48.000000000 +0900 @@ -0,0 +1,597 @@ +/* + * linux/drivers/cabi/cabi_syscalls.c + * + * CABI -- CPU Accounting and Blocking Interfaces. + * + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * This software was developed by Waseda University and MontaVista Software, + * Inc. Funding for this project was provided by IPA (Information-technology + * Promotion Agency, Japan). This software may be used and distributed + * according to the terms of the GNU Public License, incorporated herein by + * reference. + * + * 2007 Modified by Midori Sugaya to change interfaces. + * 2006-9-12. Modified by Midori Sugaya to fix bugs in tick calculation + * and make adjustments for 2.6. + * 2006-2. Ported to kernel 2.6 by Takeharu KATO (ARM, SH) + * 2005-2. New release based on LinuxRK, by Midori SUGAYA, Hirotaka + * ISHIKAWA. + * + * Further details about this project can be obtained at + * http://dcl.info.waseda.ac.jp/osrg/ + * + * This is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version.
+ * + * This software is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * This file is derived from software distributed under the following terms: + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +/* + * - struct rglist - is used to manipulate a group of + * processes, in particular during the procedure that + * binds a process group. + */ +struct rglist { + int count; + pid_t pid; + struct task_struct *tsk; + struct list_head rg_link; +}; +typedef struct rglist *rglist_t; +struct list_head rglist_head; + +/* + * Linux proc file system interface + */ +#include +#include + +extern int cabi_sanity_check_euid(struct task_struct *tsk); + +/* + * Name: sys_cabi_account_create + * + * Create an accounting object (cabi). Mainly sanity checks + * of the parameters. + * + */ +asmlinkage int +sys_cabi_account_create(struct cabi_uaccount *ucabi) +{ + struct cabi_uaccount user_cabi; + cabi_account_t cabi; + unsigned long ret; + + /* permission check */ + if (current->euid) { + cabi_sanity_check_euid (current); + goto error_return; + } + + CABI_DBG; + /* parameter copy */ + if (copy_from_user(&user_cabi, ucabi, + sizeof (struct cabi_uaccount))) + return CABI_EINVAL; + + /* parameter check */ + if ((ret = cabi_sanity_check_ucabi (&user_cabi)) > 0) { + goto error_return; + } + + /* cpu_time and cpu_period should be checked. */ + if ((ret = cabi_sanity_check_timespec_c_and_t + (&user_cabi.cpu_time, + &user_cabi.cpu_period)) > 0) { + goto error_return; + } + + /* check policy integrity */ + cabi_sanity_check_operation (&user_cabi); + + /* + * for overload cabi, check parameters and set a new cpu_time. + */ + if (user_cabi.pm.bind_proc_type == BIND_IDLE_PROC) { + cabi_sanity_check_overload_id (); + cabi_sanity_check_sigoperation (&user_cabi); + + } + + /* parameter check */ + cabi_sanity_check_parameter_to_copy + (&user_cabi.cpu_time, + &user_cabi.cpu_period, + &user_cabi.pm); + + /* call create API */ + if (!(cabi = cabi_account_create(&user_cabi.cpu_time, + &user_cabi.cpu_period, + &user_cabi.pm))) + goto error_return; + + /* copy the resulting value back to the caller */ + ret = copy_to_user(&ucabi->cabi_id, &cabi->cabi_id, + sizeof (cabi_object_t)); + + if (ret) + goto error_return; + + return CABI_SUCCESS; + +error_return: + EXIT; + + + return CABI_CREATE_ERR; + +} + + +/* + * Name: sys_cabi_account_set + * + * Set the parameters of a specified accounting object (cabi). + */ +asmlinkage int +sys_cabi_account_set(unsigned long cabi_id, struct cabi_uaccount *ucabi) +{ + struct cabi_uaccount user_cabi; + cabi_account_t cabi; + int ret; + + /* permission check */ + if (current->euid) { + cabi_sanity_check_euid (current); + return CABI_EACCESS; + } + + /* null pointer check */ + if (!ucabi) + return CABI_EINVAL; + + /* parameter copy */ + if (copy_from_user(&user_cabi, ucabi, + sizeof (struct cabi_uaccount))) + return CABI_EINVAL; + + /* parameter check */ + if ((ret = cabi_sanity_check_ucabi (&user_cabi)) > 0) { + goto error_return; + } + + /* cpu_time and cpu_period should be checked.
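+ * (This is the same check performed in sys_cabi_account_create + * above: both values must be positive and cpu_time must not exceed + * cpu_period.)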
*/ + if ((ret = cabi_sanity_check_timespec_c_and_t + (&user_cabi.cpu_time, + &user_cabi.cpu_period)) > 0) { + goto error_return; + } + + if ((cabi = search_cabi(cabi_id)) == NULL) { + return CABI_ENOEXIST; + } else { + if (user_cabi.pm.bind_proc_type != + cabi->pm.bind_proc_type) + return CABI_EINVAL; + } + + /* + * for overload cabi, check parameters and set a new cpu_time. + */ + if (user_cabi.pm.bind_proc_type == BIND_IDLE_PROC) { + if ((cabi = search_cabi(OVERLOAD_CABI_ID)) == NULL) { + return CABI_EINVAL; + } + if (user_cabi.pm.operation != OP_SIGNAL) { + return CABI_EINVAL; + } + } + + /* pass the new parameters to the cabi */ + ret = cabi_account_set(cabi_id, + &user_cabi.cpu_time, + &user_cabi.cpu_period, + &user_cabi.pm); + + if (ret < 0) { + return CABI_ERROR; + } else + return CABI_SUCCESS; +error_return: + EXIT; + + TEXIT; + return CABI_ERROR; +} + + +/* + * Name: sys_cabi_account_get + * + * Get the parameters from a specified accounting object (cabi). + */ + +asmlinkage int +sys_cabi_account_get(unsigned long cabi_id, struct cabi_uaccount *ucabi) +{ + struct list_head *proc_list; + struct rs_proc_list *rs_proc = NULL; + unsigned long ret; + cabi_account_t cabi = NULL_ACCOUNT; + cpu_capacity_quad_t rev_qc; + + /* null pointer check */ + if (!ucabi) + return CABI_EINVAL; + + /* find cabi address */ + if (!(cabi = search_cabi(cabi_id))) + return CABI_ENOEXIST; + + proc_list = cabi->cpu_proc_list.next; + while (proc_list != &cabi->cpu_proc_list) { + /* get a rs_proc */ + rs_proc = + list_entry(proc_list, + struct rs_proc_list, rs_proc_list); + + /* next element */ + proc_list = proc_list->next; + } + + /* if this is the overload cabi, re-calculate the parameters. */ + if (cabi->pm.bind_proc_type == BIND_IDLE_PROC){ + capacity_overload(&cabi->cpu_time, + &cabi->cpu_period, &rev_qc); + ret = copy_to_user(&ucabi->cpu_time, &rev_qc, + sizeof (ucabi->cpu_time)); + } else { + ret = copy_to_user(&ucabi->cpu_time, &cabi->cpu_time, + sizeof(ucabi->cpu_time)); + } + + ret = copy_to_user(&ucabi->cpu_period, &cabi->cpu_period, + sizeof (ucabi->cpu_period)); + ret = copy_to_user(&ucabi->pm, &cabi->pm, + sizeof (ucabi->pm)); + ret = copy_to_user(&ucabi->cabi_id, &cabi->cabi_id, + sizeof (ucabi->cabi_id)); + + if (ret) + return CABI_ERROR; + + return CABI_SUCCESS; +} + + +/* + * Name: sys_cabi_account_destroy + * + * Destroy an accounting object. + * + */ +asmlinkage int +sys_cabi_account_destroy(unsigned long cabi_id) +{ + int ret; + + /* permission check */ + if (current->euid) { + cabi_sanity_check_euid (current); + return CABI_EACCESS; + } + if (cabi_id <= 0) + return CABI_EINVAL; /* invalid argument */ + + ret = cabi_account_destroy(cabi_id); + return ret; +} + + +/* + * Name: sys_cabi_account_bind_pid + * + * Bind a process to an accounting object. + * + */ +asmlinkage int +sys_cabi_account_bind_pid(unsigned long cabi_id, pid_t pid) +{ + + int ret; + struct task_struct *tsk; + + /* permission check */ + if (current->euid) { + cabi_sanity_check_euid (current); + return CABI_EACCESS; + } + + /* parameter check */ + if (pid_check(pid)) { + return CABI_EINVAL; /* invalid argument */ + } + + /* find overload cabi address. */ + if (cabi_id == OVERLOAD_CABI_ID) { + if (pid != IDLE_PROCESS) { + bind_pid_error(pid); + return CABI_EINVAL; + } + if (!(tsk = __cabi_find_idle_process(pid))) { + bind_pid_error(pid); + return CABI_EPNOEXIST; /* PID(0) does not exist. */ + } + /* idle process attached.
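+ * This branch is taken only for cabi_id == OVERLOAD_CABI_ID with + * pid == IDLE_PROCESS; any other pid is looked up with + * __cabi_find_process_by_pid() below.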
*/ + } else { + if (!(tsk = __cabi_find_process_by_pid(pid))) { + bind_pid_error(pid); + return CABI_EPNOEXIST; /* PID does not exist. */ + } + } + + /* if the task is already registered, return an error. */ + if (TASK_ACCOUNT(tsk)) { + cabi_account_attached (); + if (cabi_id == TASK_ACCOUNT(tsk)->cabi_id) { + /* PID is already registered in this AO. */ + return CABI_EREGIST; + } else { + /* PID is registered in another AO. */ + return CABI_ENOAVLE; + } + } + + /* call a function to attach */ + ret = cabi_account_attach(cabi_id, tsk); + return ret; + +} + + +/* + * Name: sys_cabi_account_bind_pgid + * + * Bind a group of processes to an accounting object. + * + */ +asmlinkage int +sys_cabi_account_bind_pgid(unsigned long cabi_id, pid_t pgid) +{ + + cabi_account_t cabi; + struct rglist *rg, *tmp; + struct list_head *reg_list; + struct task_struct *tsk; + int i, ret; + int already_registered, shouldbe_registered, + group_member, rglist_count; + + + /* permission check */ + if (current->euid) { + cabi_sanity_check_euid (current); + return CABI_EACCESS; + } + + /* if pgid is 0 or < 0, error return */ + if (pgid_check (pgid)) { + return CABI_EINVAL; + } + + already_registered = shouldbe_registered = + group_member = rglist_count = 0; + + INIT_LIST_HEAD(&rglist_head); + + /* check cabi_id */ + if (cabi_id_check(cabi_id)) { + return CABI_EINVAL; /* invalid argument */ + } + + /* current(caller) check */ + if (TASK_ACCOUNT(current)) { + cabi = TASK_ACCOUNT(current); + if (cabi->cabi_id == cabi_id) { + already_registered++; + } else { + /* If the caller is already bound to another AO, + * return an error. */ + return CABI_ENOAVLE; + } + } else { + /* + * If the caller process is not bound to any AO, it + * should be bound to the requested AO. We do nothing + * here, because the for_each_process() loop below picks + * it up as a member of the target process group. + */ + } + + /* + * Every process belonging to the group should be bound to + * the requested AO. + */ + read_lock(&tasklist_lock); + for_each_process(tsk) { + if (process_group(tsk) == pgid) { + group_member++; + printk("bind_pgid: tsk->pgid %d group_member %d\n", + process_group(tsk), group_member); + /* + * A process with the right pgid was found. + * Next, check whether this process is already + * bound to another cabi; if so, return an error. + */ + if (TASK_ACCOUNT(tsk)) { + printk("bind_pgid: this account already " + "attached to this process. " + " pid(%d) cabi(0x%x)\n", + tsk->pid, (int) TASK_ACCOUNT(tsk)); + /* + * If the process is already registered + * with the requested cabi, count it; + * otherwise return an error. + */ + cabi = TASK_ACCOUNT(tsk); + if (cabi->cabi_id == cabi_id) { + already_registered++; + } else { + read_unlock(&tasklist_lock); + return CABI_ENOAVLE; + } + } else { + /* If the task has not been registered, + * we should bind it. Here we only + * queue it on a list.
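+ * The queued entries are handed to cabi_account_attach() + * further below, once tasklist_lock has been dropped.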
+ */ + if((rg = malloc(sizeof + (struct rglist))) == NULL) { + read_unlock(&tasklist_lock); + return CABI_ENOMEM; + } + bzero(rg, sizeof (struct rglist)); + INIT_LIST_HEAD(&rg->rg_link); + + rg->pid = tsk->pid; + rg->tsk = tsk; + + list_add(&rg->rg_link, &rglist_head); + rglist_count++; + shouldbe_registered++; + } + } + } + read_unlock(&tasklist_lock); + + if (group_member == 0) { + return CABI_EPGNOEXIST; + } else if (group_member == already_registered) { + return CABI_EREGIST; + } else { + /* register the queued processes */ + reg_list = &rglist_head; + for (i = 0; i < shouldbe_registered + 1; i++) { + if (reg_list == &rglist_head) { + reg_list = reg_list->next; + } else { + tmp = list_entry(reg_list, + struct rglist, rg_link); + if ((ret = + cabi_account_attach(cabi_id, + tmp->tsk))!= CABI_SUCCESS) { + return ret; + } + reg_list = reg_list->next; + } + } + } + + TEXIT; + + return CABI_SUCCESS; +} + + +/* + * Name: sys_cabi_account_unbind + * + * Unbind a process which has been attached to a specified + * accounting object. + * + */ +asmlinkage int +sys_cabi_account_unbind(pid_t pid) +{ + + struct task_struct *tsk; + int ret; + + /* permission check */ + if (current->euid) { + cabi_sanity_check_euid (current); + return CABI_EACCESS; + } + + /* pid check */ + if (pid_check(pid)) { + return CABI_EINVAL; /* Invalid argument */ + } + + /* find overload cabi address. */ + if (pid == IDLE_PROCESS) { + if (!(tsk = __cabi_find_idle_process(pid))) { + printk("account_unbind: idle (%d) failed.\n", pid); + return CABI_EPNOEXIST; /* PID(0) does not exist. */ + } + } else { + if (!(tsk = __cabi_find_process_by_pid(pid))) { + printk("account_unbind: invalid pid(%d)\n", pid); + return CABI_EPNOEXIST; /* PID does not exist. */ + } + } + + if (!TASK_ACCOUNT(tsk)) { + printk("account_unbind: no account attached to " + "this process. pid(%d)\n", tsk->pid); + return CABI_ENOBIND; + } + + ret = cabi_account_detach(tsk); + return ret; +} + +asmlinkage int +sys_cabi_account_eval (int eventtype) +{ + + int flag; + + switch (eventtype) { + case 0: + cabi_dump_init(); + break; + case 1: /* this is the first time */ + flag = 1; + cabi_dump_read(flag); + break; + case 2: + flag = 0; + cabi_dump_read(flag); + break; + case 3: + cabi_dump_end(); + break; + default: + break; + + } + return 0; +} + diff -urN ./linux-2.6.18.1/drivers/cabi/cabi_timer.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_timer.c --- ./linux-2.6.18.1/drivers/cabi/cabi_timer.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/cabi_timer.c 2007-06-17 00:37:01.000000000 +0900 @@ -0,0 +1,255 @@ + /* + * linux/drivers/cabi/cabi_timer.c + * + * CABI -- CPU Accounting and Blocking Interfaces. + * + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * + * This software was developed by Waseda University and MontaVista Software, + * Inc. Funding for this project was provided by IPA (Information-technology + * Promotion Agency, Japan). This software may be used and distributed + * according to the terms of the GNU Public License, incorporated herein by + * reference. + * + * This project was developed under the direction of Dr. Tatsuo Nakajima.
+ * + * Authors: Midori Sugaya, Hirotaka Ishikawa + * + * Please send bug-reports/suggestions/comments to qos@dcl.info.waseda.ac.jp + * + * Futher details about this project can be obtained at + * http://dcl.info.waseda.ac.jp/osrg/ + * + * This is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This software is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this software; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This file is derived from software distributed under the following terms: + */ + /* + * Real-time and Multimedia Systems Laboratory + * Copyright (c) 1999 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Real-Time and Multimedia Systems Laboratory + * Attn: Prof. Raj Rajkumar + * Electrical and Computer Engineering, and Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * or via email to raj@ece.cmu.edu + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +__inline__ void +cabi_tstojiffie(struct timespec *tp, unsigned long *jiff) +{ + *jiff = HZ * tp->tv_sec; + *jiff += (tp->tv_nsec + (NANOSEC / HZ)) / (NANOSEC / HZ); +} +extern cabi_account_t cabi_current_account; + +/* + * cabi_replenish_timer_process(struct hrtimer *tmr) + */ +enum hrtimer_restart +cabi_replenish_timer_process(struct hrtimer *tmr) +{ + int ret; + struct timespec res; + cabi_account_t cabi; + + ENTER; + cabi = (cabi_account_t ) tmr->data; + + /* replenish it, and set timer expiration `ticks' later */ + cabi_account_replenish(cabi); + + /* debug parameters */ + cabi_debug_info (cabi); + hrtimer_get_res (CLOCK_REALTIME, &res); + cabi_debug("[rep_process] hrtimer_get_res %llu %llu \n", + (unsigned long long) res.tv_sec, + (unsigned long long) res.tv_nsec); + + tmr->expires.tv64 = (cabi->cpu_period.tv_sec * NANOSEC) + + cabi->cpu_period.tv_nsec; + cabi_debug("[rep_process] expire %llu\n", + tmr->expires.tv64); + + /* if the signal_block bit is set, clear it */ + if (cabi->pm.operation == OP_SIGNAL && cabi->signal_block) + cabi->signal_block = SIGNAL_OFF; + + //ret = hrtimer_forward (tmr, tmr->get_time(), tmr->expires); + ret = hrtimer_start (tmr, tmr->expires, HRTIMER_REL); + cabi_debug ("[rep_pros_tmr] hrtimer_start %d %d.\n", + tmr->state, ret); + + EXIT; + return tmr->state; +} + +/* + * cabi_replenish_timer_init(): + * Create a timer to replenish an account and add it to the queue. + * Set a Linux timer if necessary. + */ +#include +#define INIT_TIMEOUT 10000000 +void +cabi_replenish_timer_init(cabi_account_t cabi, cpu_tick_t ticks) +{ + + //struct timer_list *tmr = &cabi->cpu_replenish_tmr; + struct hrtimer *tmr = &cabi->cpu_replenish_hrtmr; + struct timespec res; + int ret; + + ENTER; + hrtimer_init(tmr, CLOCK_REALTIME, HRTIMER_REL); + hrtimer_get_res (CLOCK_REALTIME, &res); + tmr->expires.tv64 = INIT_TIMEOUT; + tmr->function = cabi_replenish_timer_process; + tmr->data = (char *)cabi; + + cabi_debug_info (cabi); + + /* Enqueue the timer */ + ret = hrtimer_start(tmr, tmr->expires, HRTIMER_REL); + EXIT; +} + +void +cabi_replenish_timer_cancel(cabi_account_t cabi) +{ + struct hrtimer *tmr = &cabi->cpu_replenish_hrtmr; + hrtimer_cancel(tmr); +} + +/* + * Enforcement timer management + */ +extern void cabi_enforce_timer_process (struct hrtimer *tmr); +static struct hrtimer cabi_enforce_hrtmr; + +/* called from "cabi->cpu_ops->start_account()" */ +void +cabi_enforce_timer_start(cabi_account_t cabi, cpu_tick_t next_available_ticks) +{ + struct hrtimer *tmr = &cabi_enforce_hrtmr; + struct timespec ts; + unsigned long flags; + int ret; + + ENTER; + + /* debug */ + cabi_debug_tick(*next_available_ticks); + + /* convert tick to timespec */ + tick2ts(next_available_ticks, &ts); + + /* set the parameters to tmr */ + cabi_spin_lock(flags); + tmr->expires.tv64 = (ts.tv_sec*NANOSEC + ts.tv_nsec); + tmr->data = (char *)cabi; + + if (tmr->expires.tv64 <= 0) { + if (cabi->cpu_state != CABI_IS_DEPLETED) + cabi->cpu_state &= CABI_IS_DEPLETED; + } + ret = hrtimer_start (tmr, tmr->expires, HRTIMER_REL); + cabi_spin_unlock (flags); + + EXIT; +} + +/* + * Cancel the enforce timer and set the timer to the next jiffy + */ +void +cabi_enforce_timer_cancel(void) +{ + struct hrtimer *tmr = &cabi_enforce_hrtmr; + int ret; + + ENTER; + ret = hrtimer_try_to_cancel(&cabi_enforce_hrtmr); + EXIT; +} + +/* + * Process enforce timer expiration.
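+ * Runs in hrtimer context: if the expired timer still belongs to + * cabi_current_account, the account is enforced (depleted) via + * cabi_account_enforce(); a stale expiry is ignored.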
+ */ +void +cabi_enforce_timer_process(struct hrtimer *tmr) +{ + + cabi_account_t cabi = (cabi_account_t) tmr->data; + + ENTER; + /* make sure if cabi corresponds to the current cabi */ + if (cabi_current_account == cabi) { + + /* debug */ + cabi_debug_entities (cabi_current_account, cabi); + + /* enforce the account */ + cabi_account_enforce(cabi_current_account); + } + tmr->state = HRTIMER_NORESTART; + EXIT; +} + +void +cabi_timer_start(void) +{ + ENTER; + hrtimer_init(&cabi_enforce_hrtmr, CLOCK_REALTIME, HRTIMER_REL); + cabi_enforce_hrtmr.function = cabi_enforce_timer_process; + EXIT; +} + +int +cabi_account_read_time_proc(cabi_account_t cabi, char *buf) +{ + char *p = buf; + p += sprintf(p, "timespec \n"); + return (p - buf); +} + diff -urN ./linux-2.6.18.1/drivers/cabi/examples/Makefile linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/Makefile --- ./linux-2.6.18.1/drivers/cabi/examples/Makefile 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/Makefile 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,69 @@ +CABIDIR = ../.. +CC = $(CROSS_COMPILE)gcc -g +CFLAGS = -O2 -Wall +LIBS = -L../lib -lcabi +INCLUDES = -I../../../include -I/usr/include + +OBJS = cabi_create cabi_destroy cabi_bind cabi_unbind cabi_set cabi_get \ + cabi_overload_create cabi_overload_destroy \ + cabi_create_bind cabi_exec_bind cabi_fifo_crebid \ + cabi_defsrv_crebid cabi_ts_crebid cabi_fifo_crebid_soft \ + cabi_fifo_crebid_hard cabi_ctimer cabi_create_loop\ + +all: cabi_create cabi_destroy cabi_bind cabi_unbind cabi_set cabi_get cabi_overload_create cabi_overload_destroy cabi_create_bind cabi_exec_bind cabi_fifo_crebid cabi_defsrv_crebid cabi_ts_crebid cabi_fifo_crebid_soft cabi_fifo_crebid_hard cabi_ctimer cabi_create_loop \ + +cabi_create: cb_create.c + $(CC) -o cabi_create cb_create.c $(CFLAGS) $(INCLUDES) $(LIBS) + +cabi_destroy: cb_destroy.c + $(CC) -o cabi_destroy cb_destroy.c $(CFLAGS) $(INCLUDES) $(LIBS) + +cabi_bind: cb_bind.c + $(CC) -o cabi_bind cb_bind.c $(CFLAGS) $(INCLUDES) $(LIBS) + +cabi_unbind: cb_unbind.c + $(CC) -o cabi_unbind cb_unbind.c $(CFLAGS) $(INCLUDES) $(LIBS) + +cabi_set: cb_set.c + $(CC) -o cabi_set cb_set.c $(CFLAGS) $(INCLUDES) $(LIBS) + +cabi_get: cb_get.c + $(CC) -o cabi_get cb_get.c $(CFLAGS) $(INCLUDES) $(LIBS) + +cabi_overload_create: cb_overload_create.c + $(CC) -o cabi_overload_create cb_overload_create.c $(CFLAGS) $(INCLUDES) $(LIBS) + +cabi_overload_destroy: cb_overload_destroy.c + $(CC) -o cabi_overload_destroy cb_overload_destroy.c $(CFLAGS) $(INCLUDES) $(LIBS) + +cabi_create_bind: cb_create_bind.c + $(CC) -o cabi_create_bind cb_create_bind.c $(CFLAGS) $(INCLUDES) $(LIBS) + +cabi_exec_bind: cb_exec_bind.c + $(CC) -o cabi_exec_bind cb_exec_bind.c $(CFLAGS) $(INCLUDES) $(LIBS) + +cabi_fifo_crebid_soft: cb_fifo_crebid_soft.c + $(CC) -o cabi_fifo_crebid_soft cb_fifo_crebid_soft.c $(CFLAGS) $(INCLUDES) $(LIBS) + +cabi_fifo_crebid_hard: cb_fifo_crebid_hard.c + $(CC) -o cabi_fifo_crebid_hard cb_fifo_crebid_hard.c $(CFLAGS) $(INCLUDES) $(LIBS) + +cabi_fifo_crebid: cb_fifo_crebid.c + $(CC) -o cabi_fifo_crebid cb_fifo_crebid.c $(CFLAGS) $(INCLUDES) $(LIBS) + +cabi_defsrv_crebid: cb_defsrv_crebid.c + $(CC) -o cabi_defsrv_crebid cb_defsrv_crebid.c $(CFLAGS) $(INCLUDES) $(LIBS) + +cabi_ts_crebid: cb_ts_crebid.c + $(CC) -o cabi_ts_crebid cb_ts_crebid.c $(CFLAGS) $(INCLUDES) $(LIBS) + +cabi_ctimer: cb_timer.c + $(CC) -o cabi_timer cb_timer.c $(CFLAGS) $(INCLUDES) $(LIBS) + +cabi_create_loop: cb_create_loop.c + $(CC) -o cabi_create_loop 
cb_create_loop.c $(CFLAGS) $(INCLUDES) $(LIBS) + +clean: + rm -rf $(OBJS) *~ + + diff -urN ./linux-2.6.18.1/drivers/cabi/examples/cb_bind.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_bind.c --- ./linux-2.6.18.1/drivers/cabi/examples/cb_bind.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_bind.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,56 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void usage(void) { + printf ("Usage: cabi_bind [object_id] [pid]\n"); + printf ("--------------------------------------------------\n"); + printf (" [object_id] : cabi object id\n"); + printf (" [pid] : bind process id\n"); +} + +int main (int argc, char *argv[]) +{ + int ret; + pid_t pid; + + unsigned long cabi_id; + + if (argc != 3) { + usage(); + return 0; + } + + /* set accounting object id */ + cabi_id = atol (argv[1]); + pid = (pid_t) atoi (argv[2]); + + switch (cabi_id) { + case 0: + printf ("object id is 0.\n"); + return 1; + case 1: + printf ("[object_id]=1 is only for overload."); + return 1; + default: + printf ("cabi bind pid\n"); + break; + } + + // Attach this process to the resource set + if ((ret = cabi_account_bind_pid(cabi_id, pid)) != CABI_SUCCESS) { + printf ("cabi_account_bind_pid() faild.(%d)\n", ret); + } else { + printf ("cabi_account_bind_pid: object_id(%d) pid[%d]\n", + (int)cabi_id, pid); + } + + return 0; +} diff -urN ./linux-2.6.18.1/drivers/cabi/examples/cb_create.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_create.c --- ./linux-2.6.18.1/drivers/cabi/examples/cb_create.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_create.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,119 @@ +/* + * This is sample program for CABI system. + * create accounting object. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ONE_SEC 1000000 + +void usage(void) { + printf ("cabi_create:\n"); + printf (" cabi terminate action [block]:\n"); + printf (" Usage: cabi_create [cpu_time(us)] [cpu_period(us)]\n"); + printf (" cabi terminate action [signal]:\n"); + printf (" Usage: cabi_create [cpu_time(us)] [cpu_period(us)] [pid] [sig] [flag]\n"); + printf ("--------------------------------------------------\n"); + printf (" [cpu_time(us)] : cpu performance time (usec)\n"); + printf (" [cpu_period(us)] : cpu cycle time (usec)\n"); + printf (" [pid] : signal receive pid\n"); + printf (" [sig] : send signal number\n"); + printf (" [flag] : default(current) or else\n"); +} + +int main (int argc, char *argv[]) +{ + int ret; + struct cabi_uaccount *ucabi; + long long cpu_time, cpu_period; + int operation, sig, flag; + pid_t pid; + + /* set accounting object id */ + switch (argc) { + case 3: + operation = OP_BLOCK; + cpu_time = atoll (argv[1]); + cpu_period = atoll (argv[2]); + pid = 0; + sig = 0; + flag = 0; + break; + case 6: + operation = OP_SIGNAL; + cpu_time = atoll (argv[1]); + cpu_period = atoll (argv[2]); + pid = atoi (argv[3]); + sig = atoi (argv[4]); + flag = atoi (argv[5]); + break; + default: + usage(); + return 1; + } + printf ("Create CABI object...\n"); + if (cpu_time <= 0) { + printf ("Invalid parameter. cpu_time = %lld nsec\n", cpu_time); + return 1; + } + if (cpu_period <= 0) { + printf ("Invalid parameter. cpu_period = %lld nsec\n", cpu_period); + return 1; + } + if (cpu_time > cpu_period) { + printf ("Invalid parameter. 
cpu_time > cpu_period\n"); + return 1; + } + + /* create a user cabi */ + if(!(ucabi = (cabi_account_t) malloc (sizeof(struct cabi_uaccount)))) { + printf ("cabi_create: Memory allocation error.\n"); + return 1; + } + + ucabi->pm.policy = PO_CYCLIC; + ucabi->pm.operation = operation; + ucabi->pm.bind_proc_type = BIND_NORMAL_PROC; + + ucabi->cpu_time.tv_sec = 0; + ucabi->cpu_time.tv_nsec = 0; + + ucabi->cpu_period.tv_sec = 0; + ucabi->cpu_period.tv_nsec = 0; + + ucabi->cpu_time.tv_sec = (cpu_time / ONE_SEC); + ucabi->cpu_time.tv_nsec = (cpu_time % ONE_SEC) * 1000; + + ucabi->cpu_period.tv_sec = (cpu_period / ONE_SEC); + ucabi->cpu_period.tv_nsec = (cpu_period % ONE_SEC) * 1000; + + ucabi->pm.cabi_signal.pid = pid; + ucabi->pm.cabi_signal.sig = sig; + ucabi->pm.cabi_signal.flag = flag; + + if ((ret = cabi_account_create (ucabi)) == CABI_SUCCESS) { + printf ("account set create. operation (%d) object_id [%d]\n", + (int)ucabi->pm.operation, + (int)ucabi->cabi_id); + } else { + printf ("cabi_account_create failed.(%d)\n", ret); + } + + if (ucabi->cabi_id == 0) { + printf ("cabi_account_create faild on cpu %x\n", (int)ucabi); + free(ucabi); + return 1; + } + + free(ucabi); + + return 0; +} diff -urN ./linux-2.6.18.1/drivers/cabi/examples/cb_create_bind.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_create_bind.c --- ./linux-2.6.18.1/drivers/cabi/examples/cb_create_bind.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_create_bind.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,50 @@ +/* + * This is sample program for CABI system. + * create accounting object. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + + +int main (int argc, char *argv[]) +{ + int ret; + pid_t pid; + + struct cabi_uaccount *ucabi; + + pid = getpid(); + ucabi = (cabi_account_t) malloc (sizeof(struct cabi_uaccount)); + + memset (ucabi, 0x00, sizeof(struct cabi_uaccount)); + /* set scheduling and enforce mode */ + ucabi->pm.operation = OP_BLOCK; + ucabi->cpu_time.tv_sec = 0; + ucabi->cpu_time.tv_nsec = 1000 * 1000 * 20; + + ucabi->cpu_period.tv_sec = 0; + ucabi->cpu_period.tv_nsec = 1000 * 1000 * 100; + + if (!(ret = cabi_create_bind (ucabi, pid))) { + printf("account set create. 
object_id [%d]\n", + (int)ucabi->cabi_id); + } else { + printf("cabi_account_create failed.\n"); + } + + if (ucabi->cabi_id == 0) { + printf("cabi_account_create faild on cpu %x\n", (int)ucabi); + return 1; + } + + free(ucabi); + + return 0; +} diff -urN ./linux-2.6.18.1/drivers/cabi/examples/cb_defsrv_crebid.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_defsrv_crebid.c --- ./linux-2.6.18.1/drivers/cabi/examples/cb_defsrv_crebid.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_defsrv_crebid.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,70 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +int main ( + int argc, + char *argv[]) +{ + int i, ret; + pid_t pid; + int execution, period; + + //struct sched_param *sched_param; + struct cabi_uaccount *ucabi; + + if (argc < 3) { + printf("Usage:./program [execution] [period]\n"); + return 0; + } + execution = atoi(argv[1]); + period = atoi(argv[2]); + + /* get selef identity */ + pid = getpid(); + ucabi = malloc (sizeof(struct cabi_uaccount)); + + memset(ucabi, 0x00, sizeof(struct cabi_uaccount)); + + //ucabi->pm.term_act = CABI_TERM_BLOCK; + ucabi->pm.policy = PO_DEFSRV; + //ucabi->pm.operation = OP_BOOST; + ucabi->cpu_time.tv_sec = 0; + ucabi->cpu_time.tv_nsec = 1000 * 1000 * execution; + + ucabi->cpu_period.tv_sec = 0; + ucabi->cpu_period.tv_nsec = 1000 * 1000 * period; + + printf("make [DSV] process.\n"); + + /* set accounting object id */ + if (!(ret = cabi_create_bind (ucabi, pid))) { + printf("account set create. object_id [%d]\n", + (int)ucabi->cabi_id); + } else { + printf("cabi_account_create failed.\n"); + } + + if (ucabi->cabi_id == 0) { + printf("cabi_account_create failed on cpu %x\n", (int) ucabi); + return 1; + } + // do some work + for (; ;) + { + for (i = 1; i < 100000000; i++) { + i =+ i; + //printf("count %d\n", i); + } + } + +} + + diff -urN ./linux-2.6.18.1/drivers/cabi/examples/cb_destroy.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_destroy.c --- ./linux-2.6.18.1/drivers/cabi/examples/cb_destroy.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_destroy.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,41 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void usage(void) { + printf ("Usage: cabi_destroy [object_id]\n"); + printf ("--------------------------------------------------\n"); + printf (" [object_id] : cabi object id\n"); +} + +int main (int argc, char *argv[]) +{ + int ret; + unsigned long cabi_id; + + if (argc != 2) { + usage(); + return 0; + } + + /* set the object id */ + cabi_id = atol (argv[1]); + + if ((ret = cabi_account_destroy(cabi_id)) != CABI_SUCCESS) { + printf ("destroy faild.(%d)\n", ret); + } else { + printf ("destroy succeed.\n"); + } + + return 1; + + +} + + diff -urN ./linux-2.6.18.1/drivers/cabi/examples/cb_exec_bind.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_exec_bind.c --- ./linux-2.6.18.1/drivers/cabi/examples/cb_exec_bind.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_exec_bind.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,58 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern int errno; + +void usage(void) { + printf ("Usage: cabi_exec_bind [object_id] [program...]\n"); + printf 
("--------------------------------------------------\n"); + printf (" [object_id] : cabi object id\n"); + printf (" [program...] : execute program\n"); +} + +int main (int argc, char *argv[]) +{ + unsigned long cabi_id; + pid_t pid; + int status; + int ret; + + if (argc < 3) { + usage(); + return 0; + } + + cabi_id = atoi (argv[1]); + + if ((pid = fork()) == -1) { + perror("fork()"); + return 1; + } else if (pid > 0) { + if ((ret = cabi_account_bind_pid(cabi_id, pid)) != CABI_SUCCESS) { + printf("exec_bind : cabi_account_bind_pid: faild. (%d)\n", ret); + return 1; + } else { + printf("exec_bind : cabi_account_bind_pid: Account ID(%d)[%d]\n", + (int)cabi_id, pid); + } + waitpid(-1, &status, WNOHANG); + } else { + if (execvp(argv[2], &argv[2]) == -1) { + perror("execv"); + return 1; + } + } + + return 0; +} diff -urN ./linux-2.6.18.1/drivers/cabi/examples/cb_fifo_crebid.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_fifo_crebid.c --- ./linux-2.6.18.1/drivers/cabi/examples/cb_fifo_crebid.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_fifo_crebid.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,103 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RT_PRIORITY 99 + +int main ( + int argc, + char *argv[]) +{ + int i, j, ret; + pid_t pid; + int execution, period; + struct timeval tv1, tv2; + double sec1, sec2, sec; + + struct sched_param *sched_param; + struct cabi_uaccount *ucabi; + + if (argc < 3) { + printf("Usage:./program [execution] [period]\n"); + return 0; + } + execution = atoi(argv[1]); + period = atoi(argv[2]); + + /* get selef identity */ + pid = getpid(); + ucabi = malloc (sizeof(struct cabi_uaccount)); + + memset(ucabi, 0x00, sizeof(struct cabi_uaccount)); + + ucabi->pm.operation = OP_BLOCK; + ucabi->pm.policy = PO_CYCLIC; + ucabi->cpu_time.tv_sec = (execution/1000); + ucabi->cpu_time.tv_nsec = 1000 * 1000 * (execution % 1000); + + ucabi->cpu_period.tv_sec = (period/1000); + ucabi->cpu_period.tv_nsec = 1000 * 1000 * (period % 1000); + + printf("[cpu_time] sec %ld nsec %ld\n", + ucabi->cpu_time.tv_sec, + ucabi->cpu_time.tv_nsec); + printf("[cpu_period] sec %ld nsec %ld\n", + ucabi->cpu_period.tv_sec, + ucabi->cpu_period.tv_nsec); + + /* make this process real-time one */ + sched_param = malloc(sizeof(struct sched_param)); + sched_param->sched_priority = RT_PRIORITY; + sched_setscheduler(pid, SCHED_FIFO, sched_param); + printf("make RT[FIFO] process.\n"); + + cabi_account_eval (0); + /* set accounting object id */ + if (!(ret = cabi_create_bind (ucabi, pid))) { + printf("account set create. 
object_id [%d]\n", + (int)ucabi->cabi_id); + } else { + printf("cabi_account_create failed.\n"); + } + + if (ucabi->cabi_id == 0) { + printf("cabi_account_create failed on cpu %x\n", (int) ucabi); + return 1; + } + + gettimeofday (&tv1, NULL); + for (j = 0;j < 100000 ; j++) + { + for (i = 1; i < 100000; i++) { + i =+ i; + //printf("count %d\n", i); + } + } + gettimeofday (&tv2, NULL); + /* + * Get evaluation result + */ + cabi_account_eval(1); + sleep (2); + for (i = 0; i < 8; i++) { + cabi_account_eval(2); + sleep (2); + } + cabi_account_eval(3); + + sec1 = tv1.tv_sec + tv1.tv_usec*1000; + sec2 = tv2.tv_sec + tv2.tv_usec*1000; + sec = sec2 - sec1; + printf ("time = %10.3f\n", sec); + + + +} + + diff -urN ./linux-2.6.18.1/drivers/cabi/examples/cb_get.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_get.c --- ./linux-2.6.18.1/drivers/cabi/examples/cb_get.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_get.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,79 @@ +/* + * This is sample program for CABI system. + * create accounting object. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void usage(void) { + printf ("Usage: cabi_get [object_id]\n"); + printf ("--------------------------------------------------\n"); + printf (" [object_id] : cabi object id\n"); +} + +int main (int argc, char *argv[]) +{ + struct cabi_uaccount *ucabi; + unsigned long cabi_id; + unsigned long long cpu_time, cpu_period; + float cpu_ratio; + int ret; + + if (argc != 2) { + usage(); + return 0; + } + + /* set accounting object id */ + cabi_id = atoi (argv[1]); + + if (cabi_id == 0) { + printf("object id is 0\n"); + return 1; + } + + /* create a user cabi */ + ucabi = (struct cabi_uaccount *) malloc (sizeof(struct cabi_uaccount)); + + if ((ret = cabi_account_get (cabi_id, ucabi)) == CABI_SUCCESS) { + printf("object_id (%d)\n", + (int)ucabi->cabi_id); + } else { + if (ret == CABI_ENOEXIST) { + printf("object id(%ld) not found.\n", cabi_id); + } else { + printf("cabi_account_get failed.(%d)\n", ret); + } + return 1; + } + if (!ucabi) { + printf("cabi_account_get faild on cpu %p\n", ucabi); + return 1; + } + + cpu_time = ucabi->cpu_time.tv_sec * 1000 + ucabi->cpu_time.tv_nsec / 1000000; + cpu_period = ucabi->cpu_period.tv_sec * 1000 + ucabi->cpu_period.tv_nsec / 1000000; + cpu_ratio = (float)cpu_time / (float)cpu_period * 100.0; + + printf ("OPERATION (%x)\n", ucabi->pm.operation); + printf ("CPU_TIME (%lu sec %02lu nsec) /CPU_PERIOD (%lu sec %02lu nsec)\n", + ucabi->cpu_time.tv_sec, + ucabi->cpu_time.tv_nsec, + ucabi->cpu_period.tv_sec, + ucabi->cpu_period.tv_nsec); + printf ("CPU_RATIO (%5.2f %%)\n", cpu_ratio); + + free(ucabi); + + return 0; +} + + diff -urN ./linux-2.6.18.1/drivers/cabi/examples/cb_loop.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_loop.c --- ./linux-2.6.18.1/drivers/cabi/examples/cb_loop.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_loop.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,41 @@ +#include +#include +#include +#include +#include + +#include +#include +#include + +#define LOOP 10 +#define DELAY_SEC 1 +#define DELAY_USEC 1 + + +int main (int argc, char **argv) +{ + + struct sigaction act, oldact; + struct itimerval value, ovalue; + int i, ret, end; + + /* waiting */ + while (count < LOOP); + + /* release the interrupt */ + setitimer (ITIMER_REAL, &ovalue, &value); + + ret = cabi_account_eval(end); + 
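+ /* + * cabi_account_eval() selects a trace-dump phase in the kernel: + * 0 = init, 1 = first read, 2 = subsequent reads, 3 = end (see + * sys_cabi_account_eval() in drivers/cabi/cabi_syscalls.c). + */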
+ /* clear the action */ + sigaction (SIGALRM, &oldact, NULL); + + /* print the result */ + for (i = 0; i +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ONE_SEC 1000000 + +int count = 0; + +void usage(void) { + printf ("Usage: cabi_overload_create [overload_time(us)] [ovlerload_period(us)]\n"); + printf ("--------------------------------------------------\n"); + printf (" [overload_time(us)] : overload cpu performance time (usec)\n"); + printf (" [overload_period(us)] : overload cycle time (usec)\n"); + printf (" [overload_time] / [overload_period] * 100 : overload limit persents.\n"); +} + +void sig(int sig) +{ + printf ("signal %d received. count(%d)\n", sig, count); + count++; +} + + +void loop(void) +{ + int i; + signal(SIGCONT, sig); + for (;; i++) { + /* printf("%02d: prog running...\n", i); */ + sleep (2); + } +} + +int main (int argc, char *argv[]) +{ + int ret; + pid_t admin; + long long cpu_time, cpu_period; + + struct cabi_uaccount *ucabi; + cabi_account_t cabi = NULL_ACCOUNT; + int status; + + if (argc != 3) { + usage(); + return 0; + } + cpu_time = atoll (argv[1]); + cpu_period = atoll (argv[2]); + + if (cpu_time <= 0) { + printf ("Invalid parameter. cpu_time = %lld nsec\n", cpu_time); + return 1; + } + if (cpu_period <= 0) { + printf ("Invalid parameter. cpu_period = %lld nsec\n", cpu_period); + return 1; + } + if (cpu_time > cpu_period) { + printf ("Invalid parameter. cpu_time > cpu_period\n"); + return 1; + } + + /* create a resource set */ + ucabi = malloc (sizeof(struct cabi_uaccount)); + + /* make admin process */ + if ((admin = fork()) == -1) { + perror("fork()"); + return 1; + } + else if (admin > IDLE_PROCESS) { + /* this is parent */ + printf ("AO PID = %d\n", getpid()); + printf ("Admin PID = %d\n", admin); + waitpid(-1, &status, WNOHANG); + } else { + printf ("Admin start..pid %d\n", getpid()); + loop(); + } + + printf ("Overload Create!\n"); + + /* set scheduling and enforce mode */ + ucabi->pm.operation = OP_SIGNAL; + ucabi->pm.cabi_signal.pid = admin; + ucabi->pm.cabi_signal.sig = SIGCONT; + + ucabi->cpu_time.tv_sec = 0; + ucabi->cpu_time.tv_nsec = 0; + + ucabi->cpu_period.tv_sec = 0; + ucabi->cpu_period.tv_nsec = 0; + + ucabi->cpu_time.tv_sec = (cpu_time / ONE_SEC); + ucabi->cpu_time.tv_nsec = (cpu_time % ONE_SEC) * 1000; + + ucabi->cpu_period.tv_sec = (cpu_period / ONE_SEC); + ucabi->cpu_period.tv_nsec = (cpu_period % ONE_SEC) * 1000; + + if ((ret = cabi_overload_create (ucabi)) == CABI_SUCCESS) { + printf ("account set create[SIGNAL]. 
object_id (%d)" + "cabi object address (0x%x) , user cabi address(0x%x)\n", + (int)ucabi->cabi_id, (int)cabi, (int)ucabi); + } else { + kill(admin, SIGKILL); + printf ("Kill admin process (pid = %d)\n", admin); + printf ("cabi_account_create failed.(%d)\n", ret); + return 1; + } + + if (ucabi->cabi_id != OVERLOAD_CABI_ID) { + printf ("cabi_account_create faild on cpu %p\n", cabi); + return 1; + } + + return 0; +} diff -urN ./linux-2.6.18.1/drivers/cabi/examples/cb_overload_destroy.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_overload_destroy.c --- ./linux-2.6.18.1/drivers/cabi/examples/cb_overload_destroy.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_overload_destroy.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,18 @@ +#include +#include +#include +#include + +extern int cabi_overload_destroy(void); + +int main (void) +{ + int ret; + + if ((ret = cabi_overload_destroy()) == CABI_SUCCESS) { + printf("overload destroy success.\n"); + } else { + printf("overload destroy faild. (%d)\n", ret); + } + return ret; +} diff -urN ./linux-2.6.18.1/drivers/cabi/examples/cb_set.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_set.c --- ./linux-2.6.18.1/drivers/cabi/examples/cb_set.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_set.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,145 @@ +/* + * This is sample program for CABI system. + * create accounting object. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ONE_SEC 1000000 + +void usage(void) { + printf ("cabi_set:\n"); + printf (" only cpu ratio change:\n"); + printf (" Usage: cabi_set [object_id] [cpu_time(us)] [cpu_period(us)]\n"); + printf (" cabi terminate action [signel] -> [block]:\n"); + printf (" Usage: cabi_set [object_id] [cpu_time(us)] [cpu_period(us)] [term_act]\n"); + printf (" cabi terminate action [block] -> [signal]:\n"); + printf (" Usage: cabi_set [object_id] [cpu_time(us)] [cpu_period(us)] [term_act] [pid] [sig] [flag]\n"); + printf ("--------------------------------------------------\n"); + printf (" [object_id] : cabi object id\n"); + printf (" [cpu_time(us)] : cpu performance time (usec)\n"); + printf (" [cpu_period(us)] : cpu cycle time (usec)\n"); + printf (" [term_act] : terminate action (block=1, signal=2)\n"); + printf (" [pid] : signal receive process id\n"); + printf (" [sig] : send signal number\n"); + printf (" [flag] : default(current) or else\n"); +} + +int main (int argc, char *argv[]) +{ + int ret; + struct cabi_uaccount *ucabi; + unsigned long cabi_id; + long long cpu_time, cpu_period; + int operation, sig, flag; + pid_t pid; + + sig = flag = 0; + pid = 0; + /* set accounting object id */ + switch (argc) { + case 4: + operation = OP_BLOCK; + break; + case 5: + operation = atoi(argv[4]); + if(operation != OP_BLOCK) { + usage(); + return 1; + } + break; + case 8: + operation = atoi(argv[4]); + break; + default: + usage(); + return 1; + } + cabi_id = atoi (argv[1]); + cpu_time = atoll (argv[2]); + cpu_period = atoll (argv[3]); + + switch (operation) { + case OP_BLOCK: + break; + case OP_SIGNAL: + pid = atoi (argv[5]); + sig = atoi (argv[6]); + flag = atoi (argv[7]); + break; + default: + printf ("Invalid parameter.operation = %d\n", operation); + return 1; + } + printf ("Setting...\n"); + if (cabi_id == 0) { + return 1; + } + if (cpu_time <= 0) { + printf ("Invalid parameter. 
cpu_time = %lld nsec\n", cpu_time); + return 1; + } + if (cpu_period <= 0) { + printf ("Invalid parameter. cpu_period = %lld nsec\n", cpu_period); + return 1; + } + if (cpu_time > cpu_period) { + printf ("Invalid parameter. cpu_time > cpu_period\n"); + return 1; + } + + /* create a user cabi */ + ucabi = (cabi_account_t) malloc (sizeof(struct cabi_uaccount)); + + if ((ret = cabi_account_get (cabi_id, ucabi)) != CABI_SUCCESS) { + printf ("cabi_account_set : cabi_id(%ld) no exist. (%d)\n", cabi_id, ret); + return 1; + } + + /* set time to the timespec */ + ucabi->cpu_time.tv_sec = 0; + ucabi->cpu_time.tv_nsec = 0; + ucabi->cpu_period.tv_sec = 0; + ucabi->cpu_period.tv_nsec = 0; + + ucabi->cpu_time.tv_sec = (cpu_time / ONE_SEC); + ucabi->cpu_time.tv_nsec = (cpu_time % ONE_SEC) * 1000; + + ucabi->cpu_period.tv_sec = (cpu_period / ONE_SEC); + ucabi->cpu_period.tv_nsec = (cpu_period % ONE_SEC) * 1000; + + if (argc != 4) { + ucabi->pm.operation = operation; + } + if (operation == OP_SIGNAL) { + ucabi->pm.cabi_signal.pid = pid; + ucabi->pm.cabi_signal.sig = sig; + ucabi->pm.cabi_signal.flag = flag; + } + + if ((ret = cabi_account_set (cabi_id, ucabi)) == CABI_SUCCESS) { + printf ("account set. object_id (%d)", (int)cabi_id); + } else { + printf ("cabi_id = %ld : cabi_account_set failed. (%d)\n", cabi_id, ret); + return 1; + } + + printf ("TERM_ACT (%d)\n", ucabi->pm.operation); + printf ("CPU_TIME (%lu sec %02lu nsec) /CPU_PERIOD (%lu sec %02lu nsec)\n", + ucabi->cpu_time.tv_sec, + ucabi->cpu_time.tv_nsec, + ucabi->cpu_period.tv_sec, + ucabi->cpu_period.tv_nsec); + + free(ucabi); + + return 0; +} diff -urN ./linux-2.6.18.1/drivers/cabi/examples/cb_timer.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_timer.c --- ./linux-2.6.18.1/drivers/cabi/examples/cb_timer.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_timer.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,69 @@ +#include +#include +#include +#include +#include + +#include +#include +#include + +#define LOOP 10 +#define DELAY_SEC 1 +#define DELAY_USEC 1 + + +volatile int count = 0; +struct timeval tv[LOOP]; +struct timezone tz; + + +void sig_action_handler () +{ + + //system("./ctcreate"); + gettimeofday (tv+count, &tz); + count++; +} + +int main () +{ + + struct sigaction act, oldact; + struct itimerval value, ovalue; + int i, ret, end; + end = 1; + + /* set interrupt handler */ + act.sa_handler = sig_action_handler; + act.sa_flags = 0; + sigaction (SIGALRM, &act, &oldact); + + /* set interrupt */ + /* the first interrupt ->it_value */ + /* after the second -> it_interval */ + value.it_value.tv_sec = DELAY_SEC; + value.it_value.tv_usec = DELAY_USEC; + value.it_interval.tv_sec = DELAY_SEC; + value.it_interval.tv_usec = DELAY_USEC; + setitimer (ITIMER_REAL, &value, &ovalue); + + /* waiting */ + while (count < LOOP); + + /* release the interrupt */ + setitimer (ITIMER_REAL, &ovalue, &value); + + ret = cabi_account_eval(end); + + /* clear the action */ + sigaction (SIGALRM, &oldact, NULL); + + /* print the result */ + for (i = 0; i +#include +#include +#include +#include +#include +#include +#include +#include + +#define RT_PRIORITY 50 + +int main ( + int argc, + char *argv[]) +{ + int i, ret; + pid_t pid; + int execution, period; + + struct cabi_uaccount *ucabi; + + if (argc < 3) { + printf("Usage:./program [execution] [period]\n"); + return 0; + } + execution = atoi(argv[1]); + period = atoi(argv[2]); + + /* get selef identity */ + pid = getpid(); + ucabi = malloc 
(sizeof(struct cabi_uaccount)); + + memset(ucabi, 0x00, sizeof(struct cabi_uaccount)); + + ucabi->pm.operation = OP_BLOCK; + ucabi->pm.policy = PO_CYCLIC; + ucabi->cpu_time.tv_sec = 0; + ucabi->cpu_time.tv_nsec = 1000 * 1000 * execution; + + ucabi->cpu_period.tv_sec = 0; + ucabi->cpu_period.tv_nsec = 1000 * 1000 * period; + + /* leave this process as a normal time-sharing one */ + printf("make TS[NORMAL] process.\n"); + + /* set accounting object id */ + if (!(ret = cabi_create_bind (ucabi, pid))) { + printf("account set create. object_id [%d]\n", + (int)ucabi->cabi_id); + } else { + printf("cabi_account_create failed.\n"); + } + + if (ucabi->cabi_id == 0) { + printf("cabi_account_create failed on cpu %p\n", (void *) ucabi); + return 1; + } + // do some work + for (; ;) + { + for (i = 1; i < 100000000; i++) { + i += 1; + //printf("count %d\n", i); + } + } + +} + + diff -urN ./linux-2.6.18.1/drivers/cabi/examples/cb_unbind.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_unbind.c --- ./linux-2.6.18.1/drivers/cabi/examples/cb_unbind.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/examples/cb_unbind.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,38 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void usage(void) { + printf ("Usage: cabi_unbind [pid]\n"); + printf ("--------------------------------------------------\n"); + printf (" [pid] : unbind process id\n"); +} + +int main (int argc, char *argv[]) +{ + int ret; + pid_t pid; + + if (argc != 2) { + usage(); + return 0; + } + + pid = (pid_t) atoi (argv[1]); + printf ("cabi unbind pid: %d\n", pid); + + // Detach this process from the accounting object + if ((ret = cabi_account_unbind(pid)) != CABI_SUCCESS) { + printf ("unbind failed.(%d)\n", ret); + } else { + printf ("unbind succeeded.\n"); + } + + return 0; +} diff -urN ./linux-2.6.18.1/drivers/cabi/lib/Makefile linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/Makefile --- ./linux-2.6.18.1/drivers/cabi/lib/Makefile 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/Makefile 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,29 @@ +INSTALL = /usr/bin/install -c -m 644 +DESTDIR = +KERNELDIR = ../../.. +#KERNELDIR = /usr/src/debug +INCLUDEDIR = /usr +CC = $(CROSS_COMPILE)gcc -g +AR = $(CROSS_COMPILE)ar +RANLIB = $(CROSS_COMPILE)ranlib +SOURCES = $(wildcard *.c) +OBJS = $(SOURCES:.c=.o) +CFLAGS = -DCONFIG_CABI -O3 -Wall -I$(KERNELDIR)/include -I$(INCLUDEDIR)/include + +all: libcabi.so libcabi.a + +libcabi.a: $(OBJS) + $(AR) -r libcabi.a $(OBJS) + $(RANLIB) libcabi.a + +libcabi.so: $(OBJS) + $(CC) -fPIC $(OBJS) -shared -o libcabi.so + +clean: + rm -rf $(OBJS) libcabi.a libcabi.so + +install: libcabi.a libcabi.so + $(INSTALL) libcabi.a $(DESTDIR)/lib/libcabi.a + $(INSTALL) libcabi.so $(DESTDIR)/lib/libcabi.so + $(INSTALL) $(KERNELDIR)/include/cabi/cabi.h $(DESTDIR)/usr/include/cabi/cabi.h + $(INSTALL) $(KERNELDIR)/include/cabi/cabi_error.h $(DESTDIR)/usr/include/cabi/cabi_error.h diff -urN ./linux-2.6.18.1/drivers/cabi/lib/README.cabi linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/README.cabi --- ./linux-2.6.18.1/drivers/cabi/lib/README.cabi 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/README.cabi 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,11 @@ +README file for the cabi library. +=================================== +This file gives brief instructions for building and installing the +cabi library.
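+ +The library wraps the CABI system calls (cabi_account_create, +cabi_account_bind_pid, and so on); the sample programs under +../examples link against it with -lcabi. A typical compile line +(paths here are illustrative) looks like: + +#gcc -o cb_create cb_create.c -I../../../include -L../lib -lcabi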
+ +Before running make, please add links to the kernel header +directories as follows. + +#ln -sf /usr/src/linux/include/linux /usr/include/linux +#ln -sf /usr/src/linux/include/asm-i386 /usr/include/asm + diff -urN ./linux-2.6.18.1/drivers/cabi/lib/cabi_account_bind_pgid.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_account_bind_pgid.c --- ./linux-2.6.18.1/drivers/cabi/lib/cabi_account_bind_pgid.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_account_bind_pgid.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,17 @@ +/* + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * All Rights Reserved. + * any improvements or extensions that they make and grant Waseda University + * the rights to redistribute these changes. + * + */ + +#define __LIBRARY__ +#include +#include +/* + * int sys_cabi_account_bind_pgid(unsigned long cabi_id, pid_t pgid) + */ + +_syscall2(int, cabi_account_bind_pgid, unsigned long, cabi_id, pid_t, pgid) diff -urN ./linux-2.6.18.1/drivers/cabi/lib/cabi_account_bind_pid.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_account_bind_pid.c --- ./linux-2.6.18.1/drivers/cabi/lib/cabi_account_bind_pid.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_account_bind_pid.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,17 @@ +/* + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * All Rights Reserved. + * any improvements or extensions that they make and grant Waseda University + * the rights to redistribute these changes. + * + */ + +#define __LIBRARY__ +#include +#include +/* + * int sys_cabi_account_bind_pid(unsigned long cabi_id, pid_t pid) + * + */ +_syscall2(int, cabi_account_bind_pid, unsigned long, cabi_id, pid_t, pid) diff -urN ./linux-2.6.18.1/drivers/cabi/lib/cabi_account_create.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_account_create.c --- ./linux-2.6.18.1/drivers/cabi/lib/cabi_account_create.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_account_create.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,19 @@ +/* + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * All Rights Reserved. + * any improvements or extensions that they make and grant Waseda University + * the rights to redistribute these changes. + * + */ + +#define __LIBRARY__ +#include +#include + +/* + * int sys_cabi_account_create(cabi_object_t objectid, cabi_account_t cpu) + */ +_syscall1(int, cabi_account_create, struct cabi_uaccount *, ucabi) + + diff -urN ./linux-2.6.18.1/drivers/cabi/lib/cabi_account_destroy.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_account_destroy.c --- ./linux-2.6.18.1/drivers/cabi/lib/cabi_account_destroy.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_account_destroy.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,17 @@ +/* + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * All Rights Reserved. + * any improvements or extensions that they make and grant Waseda University + * the rights to redistribute these changes.
+ * + */ + +#define __LIBRARY__ +#include +#include + +/* + * int sys_cabi_account_destroy (unsigned long cabi_id) + */ +_syscall1(int, cabi_account_destroy, unsigned long, cabi_id) diff -urN ./linux-2.6.18.1/drivers/cabi/lib/cabi_account_eval.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_account_eval.c --- ./linux-2.6.18.1/drivers/cabi/lib/cabi_account_eval.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_account_eval.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,19 @@ +/* + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * All Rights Reserved. + * any improvements or extensions that they make and grant Waseda University + * the rights to redistribute these changes. + * + */ + +#define __LIBRARY__ +#include +#include + +/* + * int sys_cabi_account_create(cabi_object_t objectid, cabi_account_t cpu) + */ +_syscall1(int, cabi_account_eval, int, eventtype) + + diff -urN ./linux-2.6.18.1/drivers/cabi/lib/cabi_account_get.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_account_get.c --- ./linux-2.6.18.1/drivers/cabi/lib/cabi_account_get.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_account_get.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,17 @@ +/* + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * All Rights Reserved. + * any improvements or extensions that they make and grant Waseda University + * the rights to redistribute these changes. + * + */ + +#define __LIBRARY__ +#include +#include + +/* + * cabi_uaccount_t sys_cabi_account_get (unsigned long cabi_id, cabi_uaccount_t, ucabi) + */ +_syscall2(int, cabi_account_get, unsigned long, cabi_id, cabi_uaccount_t, ucabi) diff -urN ./linux-2.6.18.1/drivers/cabi/lib/cabi_account_set.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_account_set.c --- ./linux-2.6.18.1/drivers/cabi/lib/cabi_account_set.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_account_set.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,17 @@ +/* + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * All Rights Reserved. + * any improvements or extensions that they make and grant Waseda University + * the rights to redistribute these changes. + * + */ + +#define __LIBRARY__ +#include +#include + +/* + * int sys_cabi_account_set (unsigned long cabi_id, struct cabi_uaccount_t *cpu) + */ +_syscall2(int, cabi_account_set, unsigned long, cabi_id, struct cabi_uaccount *, ucabi) diff -urN ./linux-2.6.18.1/drivers/cabi/lib/cabi_account_unbind.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_account_unbind.c --- ./linux-2.6.18.1/drivers/cabi/lib/cabi_account_unbind.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_account_unbind.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,17 @@ +/* + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * All Rights Reserved. + * any improvements or extensions that they make and grant Waseda University + * the rights to redistribute these changes. 
+ * + */ + +#define __LIBRARY__ +#include +#include + +/* + * int sys_cabi_account_unbind (pid_t pid) + */ +_syscall1(int, cabi_account_unbind, pid_t, pid) diff -urN ./linux-2.6.18.1/drivers/cabi/lib/cabi_create_bind.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_create_bind.c --- ./linux-2.6.18.1/drivers/cabi/lib/cabi_create_bind.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_create_bind.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,72 @@ +/* + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * All Rights Reserved. + * any improvements or extensions that they make and grant Waseda University + * the rights to redistribute these changes. + * + */ + +#define __LIBRARY__ +#include +#include +#include +/* + * int cabi_overload_create(cabi_uaccount_t ucabi) + */ + +extern int cabi_account_create (struct cabi_uaccount *); +extern int cabi_account_bind_pid(unsigned long cabi_id, pid_t pid); +extern int cabi_account_destroy(unsigned long cabi_id); + +int +cabi_create_bind (struct cabi_uaccount *ucabi, pid_t pid) +{ + + int retval = -EPERM; + + /* If ucabi has not valid address. */ + if (!ucabi) { + return CABI_EINVAL; + } + + /* call cabi_account_create() */ + if (!(retval = cabi_account_create (ucabi))) { +#ifdef DEBUG_CABILIB + fprintf(stderr, "create_bind: account set create. object_id (%d)" + "user cabi address (0x%x)\n", + (int) ucabi->cabi_id, (int)ucabi); +#endif + } else { +#ifdef DEBUG_CABILIB + fprintf(stderr, "create_bind failed. (%d)\n", retval); +#endif + goto error; + } + + if (pid < 0) { +#ifdef DEBUG_CABILIB + fprintf(stderr, "create_bind: invalid pid(%d)\n", + (int)pid); +#endif + retval = CABI_EINVAL; /* Invalid argument */ + goto error; + } + + /* call cabi_account_bind_pid */ + if (!(retval = cabi_account_bind_pid(ucabi->cabi_id, pid))) { +#ifdef DEBUG_CABILIB + fprintf(stderr, "cabi_account_bind_pid: Account ID(%d)[%d]\n", + (int) ucabi->cabi_id, pid); +#endif + return CABI_SUCCESS; + } else { +#ifdef DEBUG_CABILIB + fprintf(stderr, "cabi_account_bind_pid() failed.\n"); +#endif + cabi_account_destroy(ucabi->cabi_id); + } + +error: + return retval; +} diff -urN ./linux-2.6.18.1/drivers/cabi/lib/cabi_overload_create.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_overload_create.c --- ./linux-2.6.18.1/drivers/cabi/lib/cabi_overload_create.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_overload_create.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,84 @@ +/* + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * All Rights Reserved. + * any improvements or extensions that they make and grant Waseda University + * the rights to redistribute these changes. + * + */ + +#define __LIBRARY__ +#include +#include +#include + +#define DEBUG_CABILIB 1 + +/* + * int cabi_overload_create(cabi_uaccount_t ucabi) + */ + +extern int cabi_account_create (struct cabi_uaccount *); +extern int cabi_account_bind_pid (unsigned long cabi_id, pid_t pid); + +int +cabi_overload_create (struct cabi_uaccount *ucabi) +{ + + pid_t pid; + int ret; + + /* If ucabi has not valid address. 
*/ + if (!ucabi) { + return CABI_EINVAL; + } + + ucabi->pm.bind_proc_type = BIND_IDLE_PROC; + +#ifdef DEBUG_CABILIB + printf ("pm.bind_proc_type (%x)\n", + ucabi->pm.bind_proc_type); +#endif + + if ((ret = cabi_account_create (ucabi)) == CABI_SUCCESS) { +#ifdef DEBUG_CABILIB + fprintf(stderr, "overload account set create. object_id (%d) " + "user cabi address (0x%x)\n", + (int) ucabi->cabi_id, (int)ucabi); +#endif + } else { +#ifdef DEBUG_CABILIB + fprintf(stderr, "overload cabi_account_create failed. (%d)\n", + ret); +#endif + return ret; + } + + /* if the system call succeeded, bind the idle process. */ + if (ucabi->cabi_id != OVERLOAD_CABI_ID) { +#ifdef DEBUG_CABILIB + fprintf (stderr, "cabi object_id(%d) was not set. error return.\n", (int) ucabi->cabi_id); +#endif + return CABI_EINVAL; + } + + /* set the idle process */ + pid = IDLE_PROCESS; + + if ((ret = cabi_account_bind_pid(ucabi->cabi_id, pid)) + == CABI_SUCCESS) { +#ifdef DEBUG_CABILIB + fprintf(stderr, "cabi_account_bind_pid: Account ID(%d)[%d]\n", + (int) ucabi->cabi_id, pid); +#endif + return CABI_SUCCESS; + } else { +#ifdef DEBUG_CABILIB + fprintf(stderr, "cabi_account_bind_pid() [idle] failed.\n"); +#endif + return ret; + } +} + diff -urN ./linux-2.6.18.1/drivers/cabi/lib/cabi_overload_destroy.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_overload_destroy.c --- ./linux-2.6.18.1/drivers/cabi/lib/cabi_overload_destroy.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/lib/cabi_overload_destroy.c 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,42 @@ +/* + * Copyright (C) OS Research Group, Waseda University Nakajima Laboratory. + * MontaVista Software, Inc. + * All Rights Reserved. + * any improvements or extensions that they make and grant Waseda University + * the rights to redistribute these changes. + * + */ + +#define __LIBRARY__ +#include +#include +#include + +/* + * int cabi_overload_destroy(void) + */ + +extern int cabi_account_unbind(pid_t pid); +extern int cabi_account_destroy(unsigned long cabi_id); + +int +cabi_overload_destroy(void) +{ + pid_t pid = IDLE_PROCESS; + int ret; + + if ((ret = cabi_account_unbind(pid)) != CABI_SUCCESS) { +#ifdef DEBUG_CABILIB + fprintf (stderr, "cabi_overload_destroy : unbind error. (%d)\n", ret); +#endif + return ret; + } + if ((ret = cabi_account_destroy(OVERLOAD_CABI_ID)) != CABI_SUCCESS) { +#ifdef DEBUG_CABILIB + fprintf (stderr, "cabi_overload_destroy : destroy error.
(%d)\n", ret); +#endif + return ret; + } + + return ret; +} diff -urN ./linux-2.6.18.1/drivers/cabi/udivdi3.S linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/udivdi3.S --- ./linux-2.6.18.1/drivers/cabi/udivdi3.S 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/cabi/udivdi3.S 2007-05-20 14:14:28.000000000 +0900 @@ -0,0 +1,396 @@ + .file "libgcc2.c" + .version "01.01" +gcc2_compiled.: +.section .rodata + .type __clz_tab,@object +__clz_tab: +.byte 0 +.byte 1 +.byte 2 +.byte 2 +.byte 3 +.byte 3 +.byte 3 +.byte 3 +.byte 4 +.byte 4 +.byte 4 +.byte 4 +.byte 4 +.byte 4 +.byte 4 +.byte 4 +.byte 5 +.byte 5 +.byte 5 +.byte 5 +.byte 5 +.byte 5 +.byte 5 +.byte 5 +.byte 5 +.byte 5 +.byte 5 +.byte 5 +.byte 5 +.byte 5 +.byte 5 +.byte 5 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 6 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 7 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 +.byte 8 + .size __clz_tab,256 +.text + .align 4 +.globl __udivdi3 + .type __udivdi3,@function +__udivdi3: + subl $28,%esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + movl 56(%esp),%ebx + movl 60(%esp),%edi + movl 48(%esp),%eax + movl %eax,40(%esp) + movl 52(%esp),%edx + movl %edx,16(%esp) + testl %edi,%edi + jne .L25 + cmpl %edx,%ebx + jbe .L26 +#APP + divl %ebx +#NO_APP + movl %eax,%esi + jmp .L30 + .align 4 +.L26: + cmpl $0,56(%esp) + jne .L28 + movl $1,%eax + xorl %edx,%edx + divl %ebx + movl %eax,%ebx +.L28: + movl 16(%esp),%eax + xorl %edx,%edx +#APP + divl %ebx +#NO_APP + movl %eax,%edi + movl %edx,16(%esp) + movl 40(%esp),%eax +#APP + divl %ebx +#NO_APP + movl %eax,%esi + jmp .L30 + .align 4 +.L25: + cmpl %edi,16(%esp) + jae .L31 + xorl %edi,%edi + movl 
%edi,%esi + jmp .L30 + .align 4 +.L31: +#APP + bsrl %edi,%eax +#NO_APP + movl %eax,%esi + xorl $31,%esi + jne .L38 + cmpl %edi,16(%esp) + ja .L39 + cmpl %ebx,40(%esp) + jb .L45 +.L39: + movl $1,%esi + jmp .L45 + .align 4 +.L38: + movl $32,%ebp + subl %esi,%ebp + movl %esi,%ecx + sall %cl,%edi + movl %edi,20(%esp) + movl %ebx,%eax + movl %ebp,%ecx + shrl %cl,%eax + orl %eax,%edi + movl %esi,%ecx + sall %cl,%ebx + movl 16(%esp),%eax + movl %ebp,%ecx + shrl %cl,%eax + movl %eax,36(%esp) + movl 16(%esp),%eax + movl %esi,%ecx + sall %cl,%eax + movl %eax,20(%esp) + movl 40(%esp),%eax + movl %ebp,%ecx + shrl %cl,%eax + movl 20(%esp),%edx + orl %eax,%edx + movl %edx,16(%esp) + movl %esi,%ecx + sall %cl,40(%esp) + movl %edx,%eax + movl 36(%esp),%edx +#APP + divl %edi +#NO_APP + movl %eax,%esi + movl %edx,16(%esp) +#APP + mull %ebx +#NO_APP + movl %eax,%ebx + movl %edx,%edi + cmpl %edi,16(%esp) + jb .L44 + jne .L45 + cmpl %ebx,40(%esp) + jae .L45 +.L44: + decl %esi +.L45: + xorl %edi,%edi +.L30: + movl %esi,28(%esp) + movl %edi,32(%esp) + movl 28(%esp),%eax + movl 32(%esp),%edx + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $28,%esp + ret +.Lfe1: + .size __udivdi3,.Lfe1-__udivdi3 + .ident "GCC: (GNU) 2.8.1" diff -urN ./linux-2.6.18.1/drivers/char/Kconfig linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/Kconfig --- ./linux-2.6.18.1/drivers/char/Kconfig 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/Kconfig 2007-05-19 23:58:35.000000000 +0900 @@ -741,6 +741,46 @@ To compile this driver as a module, choose M here: the module will be called rtc. +config RTC_HISTOGRAM + bool "Real Time Clock Histogram Support" + default n + depends on RTC + ---help--- + If you say Y here then the kernel will track the delivery and + wakeup latency of /dev/rtc using tasks and will report a + histogram to the kernel log when the application closes /dev/rtc. + +config BLOCKER + tristate "Priority Inheritance Debugging (Blocker) Device Support" + depends on X86 + default y + ---help--- + If you say Y here then a device will be created that the userspace + pi_test suite uses to test and measure kernel locking primitives. + +config LPPTEST + tristate "Parallel Port Based Latency Measurement Device" + depends on !PARPORT && X86 + default y + ---help--- + If you say Y here then a device will be created that the userspace + testlpp utility uses to measure IRQ latencies of a target system + from an independent measurement system. + + NOTE: this code assumes x86 PCs and that the parallel port is + bidirectional and is on IRQ 7. + + to use the device, both the target and the source system needs to + run a kernel with CONFIG_LPPTEST enabled. To measure latencies, + use the scripts/testlpp utility in your kernel source directory, + and run it (as root) on the source system - it will start printing + out the latencies it took to get a response from the target system: + + Latency of response: 12.2 usecs (121265 cycles) + + then generate various workloads on the target system to see how + (worst-case-) latencies are impacted. 
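+ + For orientation, the measurement side boils down to a single + ioctl against the lpptest character device. A minimal sketch of + the userspace caller is shown below; the device node is assumed + to be created by hand (mknod /dev/lpptest c 245 0), and the + request code mirrors the LPPTEST_TEST definition in + drivers/char/lpptest.c: + + #include <stdio.h> + #include <fcntl.h> + #include <unistd.h> + #include <sys/ioctl.h> + + #define LPPTEST_TEST _IOR(245, 1, unsigned long long) + + int main(void) + { + unsigned long long cycles = 0; + int fd = open("/dev/lpptest", O_RDWR); + + if (fd < 0) + return 1; + /* one request/response round trip; 0 cycles means timeout */ + if (ioctl(fd, LPPTEST_TEST, &cycles) == 0) + printf("response took %llu cycles\n", cycles); + close(fd); + return 0; + }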
+ config SGI_DS1286 tristate "SGI DS1286 RTC support" depends on SGI_IP22 diff -urN ./linux-2.6.18.1/drivers/char/Makefile linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/Makefile --- ./linux-2.6.18.1/drivers/char/Makefile 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/Makefile 2007-05-19 23:58:35.000000000 +0900 @@ -89,6 +89,9 @@ obj-$(CONFIG_TANBAC_TB0219) += tb0219.o obj-$(CONFIG_TELCLOCK) += tlclk.o +obj-$(CONFIG_BLOCKER) += blocker.o +obj-$(CONFIG_LPPTEST) += lpptest.o + obj-$(CONFIG_WATCHDOG) += watchdog/ obj-$(CONFIG_MWAVE) += mwave/ obj-$(CONFIG_AGP) += agp/ diff -urN ./linux-2.6.18.1/drivers/char/blocker.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/blocker.c --- ./linux-2.6.18.1/drivers/char/blocker.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/blocker.c 2007-05-19 23:58:35.000000000 +0900 @@ -0,0 +1,107 @@ +/* + * priority inheritance testing device + */ + +#include +#include + +#define BLOCKER_MINOR 221 + +#define BLOCK_IOCTL 4245 +#define BLOCK_SET_DEPTH 4246 + +#define BLOCKER_MAX_LOCK_DEPTH 10 + +void loop(int loops) +{ + int i; + + for (i = 0; i < loops; i++) + get_cycles(); +} + +static spinlock_t blocker_lock[BLOCKER_MAX_LOCK_DEPTH]; + +static unsigned int lock_depth = 1; + +void do_the_lock_and_loop(unsigned int args) +{ + int i, max; + + if (rt_task(current)) + max = lock_depth; + else if (lock_depth > 1) + max = (current->pid % lock_depth) + 1; + else + max = 1; + + /* Always lock from the top down */ + for (i = max-1; i >= 0; i--) + spin_lock(&blocker_lock[i]); + loop(args); + for (i = 0; i < max; i++) + spin_unlock(&blocker_lock[i]); +} + +static int blocker_open(struct inode *in, struct file *file) +{ + printk(KERN_INFO "blocker_open called\n"); + + return 0; +} + +static long blocker_ioctl(struct file *file, + unsigned int cmd, unsigned long args) +{ + switch(cmd) { + case BLOCK_IOCTL: + do_the_lock_and_loop(args); + return 0; + case BLOCK_SET_DEPTH: + if (args >= BLOCKER_MAX_LOCK_DEPTH) + return -EINVAL; + lock_depth = args; + return 0; + default: + return -EINVAL; + } +} + +static struct file_operations blocker_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .unlocked_ioctl = blocker_ioctl, + .open = blocker_open, +}; + +static struct miscdevice blocker_dev = +{ + BLOCKER_MINOR, + "blocker", + &blocker_fops +}; + +static int __init blocker_init(void) +{ + int i; + + if (misc_register(&blocker_dev)) + return -ENODEV; + + for (i = 0; i < BLOCKER_MAX_LOCK_DEPTH; i++) + spin_lock_init(blocker_lock + i); + + return 0; +} + +void __exit blocker_exit(void) +{ + printk(KERN_INFO "blocker device uninstalled\n"); + misc_deregister(&blocker_dev); +} + +module_init(blocker_init); +module_exit(blocker_exit); + +MODULE_LICENSE("GPL"); + diff -urN ./linux-2.6.18.1/drivers/char/hangcheck-timer.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/hangcheck-timer.c --- ./linux-2.6.18.1/drivers/char/hangcheck-timer.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/hangcheck-timer.c 2007-05-19 23:58:35.000000000 +0900 @@ -117,7 +117,7 @@ __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks); #endif /* not MODULE */ -#if defined(CONFIG_X86_64) || defined(CONFIG_S390) +#ifdef CONFIG_S390 # define HAVE_MONOTONIC # define TIMER_FREQ 1000000000ULL #elif defined(CONFIG_IA64) diff -urN ./linux-2.6.18.1/drivers/char/hpet.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/hpet.c --- ./linux-2.6.18.1/drivers/char/hpet.c 2006-10-14 12:34:03.000000000 
+0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/hpet.c 2007-05-19 23:58:35.000000000 +0900 @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -50,8 +51,34 @@ #define HPET_RANGE_SIZE 1024 /* from HPET spec */ +#if BITS_PER_LONG == 64 +#define write_counter(V, MC) writeq(V, MC) +#define read_counter(MC) readq(MC) +#else +#define write_counter(V, MC) writel(V, MC) +#define read_counter(MC) readl(MC) +#endif + static u32 hpet_nhpet, hpet_max_freq = HPET_USER_FREQ; +static void __iomem *hpet_mc_ptr; + +static cycle_t read_hpet(void) +{ + return (cycle_t)read_counter((void __iomem *)hpet_mc_ptr); +} + +static struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 300, + .read = read_hpet, + .mask = 0xffffffffffffffffLL, + .mult = 0, /*to be caluclated*/ + .shift = 10, + .is_continuous = 1, +}; +static struct clocksource *hpet_clocksource_p; + /* A lock for concurrent access by app and isr hpet activity. */ static DEFINE_SPINLOCK(hpet_lock); /* A lock for concurrent intermodule access to hpet and isr hpet activity. */ @@ -78,7 +105,7 @@ struct hpets *hp_next; struct hpet __iomem *hp_hpet; unsigned long hp_hpet_phys; - struct time_interpolator *hp_interpolator; + struct clocksource *hp_clocksource; unsigned long long hp_tick_freq; unsigned long hp_delta; unsigned int hp_ntimer; @@ -93,13 +120,6 @@ #define HPET_PERIODIC 0x0004 #define HPET_SHARED_IRQ 0x0008 -#if BITS_PER_LONG == 64 -#define write_counter(V, MC) writeq(V, MC) -#define read_counter(MC) readq(MC) -#else -#define write_counter(V, MC) writel(V, MC) -#define read_counter(MC) readl(MC) -#endif #ifndef readq static inline unsigned long long readq(void __iomem *addr) @@ -736,27 +756,6 @@ static struct ctl_table_header *sysctl_header; -static void hpet_register_interpolator(struct hpets *hpetp) -{ -#ifdef CONFIG_TIME_INTERPOLATION - struct time_interpolator *ti; - - ti = kzalloc(sizeof(*ti), GFP_KERNEL); - if (!ti) - return; - - ti->source = TIME_SOURCE_MMIO64; - ti->shift = 10; - ti->addr = &hpetp->hp_hpet->hpet_mc; - ti->frequency = hpetp->hp_tick_freq; - ti->drift = HPET_DRIFT; - ti->mask = -1; - - hpetp->hp_interpolator = ti; - register_time_interpolator(ti); -#endif -} - /* * Adjustment for when arming the timer with * initial conditions. 
That is, main counter @@ -908,7 +907,16 @@ } hpetp->hp_delta = hpet_calibrate(hpetp); - hpet_register_interpolator(hpetp); + + if (!hpet_clocksource_p) { +#ifdef CONFIG_IA64 + clocksource_hpet.fsys_mmio_ptr = hpet_mc_ptr = &hpetp->hp_hpet->hpet_mc; +#endif + clocksource_hpet.mult = clocksource_hz2mult(hpetp->hp_tick_freq, + clocksource_hpet.shift); + clocksource_register(&clocksource_hpet); + hpet_clocksource_p = hpetp->hp_clocksource = &clocksource_hpet; + } return 0; } @@ -994,7 +1002,7 @@ static int hpet_acpi_remove(struct acpi_device *device, int type) { - /* XXX need to unregister interpolator, dealloc mem, etc */ + /* XXX need to unregister clocksource, dealloc mem, etc */ return -EINVAL; } diff -urN ./linux-2.6.18.1/drivers/char/lpptest.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/lpptest.c --- ./linux-2.6.18.1/drivers/char/lpptest.c 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/lpptest.c 2007-05-19 23:58:35.000000000 +0900 @@ -0,0 +1,179 @@ +/* + * /dev/lpptest device: test IRQ handling latencies over parallel port + * + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar + * + * licensed under the GPL + * + * You need to have CONFIG_PARPORT disabled for this device, it is a + * completely self-contained device that assumes sole ownership of the + * parallel port. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * API wrappers so that the code can be shared with the -rt tree: + */ +#ifndef local_irq_disable +# define local_irq_disable local_irq_disable +# define local_irq_enable local_irq_enable +#endif + +#ifndef IRQ_NODELAY +# define IRQ_NODELAY 0 +# define IRQF_NODELAY 0 +#endif + +/* + * Driver: + */ +#define LPPTEST_CHAR_MAJOR 245 +#define LPPTEST_DEVICE_NAME "lpptest" + +#define LPPTEST_IRQ 7 + +#define LPPTEST_TEST _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long) +#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long) +#define LPPTEST_ENABLE _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long) + +static char dev_id[] = "lpptest"; + +#define INIT_PORT() outb(0x04, 0x37a) +#define ENABLE_IRQ() outb(0x10, 0x37a) +#define DISABLE_IRQ() outb(0, 0x37a) + +static unsigned char out = 0x5a; + +/** + * Interrupt handler. Flip a bit in the reply. 
+ */ +static int lpptest_irq (int irq, void *dev_id, struct pt_regs *regs) +{ + out ^= 0xff; + outb(out, 0x378); + + return IRQ_HANDLED; +} + +static cycles_t test_response(void) +{ + cycles_t now, end; + unsigned char in; + int timeout = 0; + + local_irq_disable(); + in = inb(0x379); + inb(0x378); + outb(0x08, 0x378); + now = get_cycles(); + while(1) { + if (inb(0x379) != in) + break; + if (timeout++ > 1000000) { + outb(0x00, 0x378); + local_irq_enable(); + + return 0; + } + } + end = get_cycles(); + outb(0x00, 0x378); + local_irq_enable(); + + return end - now; +} + +static int lpptest_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static int lpptest_close(struct inode *inode, struct file *file) +{ + return 0; +} + +int lpptest_ioctl(struct inode *inode, struct file *file, unsigned int ioctl_num, unsigned long ioctl_param) +{ + int retval = 0; + + switch (ioctl_num) { + + case LPPTEST_DISABLE: + DISABLE_IRQ(); + break; + + case LPPTEST_ENABLE: + ENABLE_IRQ(); + break; + + case LPPTEST_TEST: { + + cycles_t diff = test_response(); + if (copy_to_user((void *)ioctl_param, (void*) &diff, sizeof(diff))) + goto errcpy; + break; + } + default: retval = -EINVAL; + } + + return retval; + + errcpy: + return -EFAULT; +} + +static struct file_operations lpptest_dev_fops = { + .ioctl = lpptest_ioctl, + .open = lpptest_open, + .release = lpptest_close, +}; + +static int __init lpptest_init (void) +{ + if (register_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME, &lpptest_dev_fops)) + { + printk(KERN_NOTICE "Can't allocate major number %d for lpptest.\n", + LPPTEST_CHAR_MAJOR); + return -EAGAIN; + } + + if (request_irq (LPPTEST_IRQ, lpptest_irq, 0, "lpptest", dev_id)) { + printk (KERN_WARNING "lpptest: irq %d in use. Unload parport module!\n", LPPTEST_IRQ); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); + return -EAGAIN; + } + irq_desc[LPPTEST_IRQ].status |= IRQ_NODELAY; + irq_desc[LPPTEST_IRQ].action->flags |= IRQF_NODELAY | IRQF_DISABLED; + + INIT_PORT(); + ENABLE_IRQ(); + + return 0; +} +module_init (lpptest_init); + +static void __exit lpptest_exit (void) +{ + DISABLE_IRQ(); + + free_irq(LPPTEST_IRQ, dev_id); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); +} +module_exit (lpptest_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("lpp test module"); + diff -urN ./linux-2.6.18.1/drivers/char/random.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/random.c --- ./linux-2.6.18.1/drivers/char/random.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/random.c 2007-05-19 23:58:35.000000000 +0900 @@ -580,8 +580,11 @@ preempt_disable(); /* if over the trickle threshold, use only 1 in 4096 samples */ if (input_pool.entropy_count > trickle_thresh && - (__get_cpu_var(trickle_count)++ & 0xfff)) - goto out; + (__get_cpu_var(trickle_count)++ & 0xfff)) { + preempt_enable(); + return; + } + preempt_enable(); sample.jiffies = jiffies; sample.cycles = get_cycles(); @@ -626,9 +629,6 @@ if(input_pool.entropy_count >= random_read_wakeup_thresh) wake_up_interruptible(&random_read_wait); - -out: - preempt_enable(); } void add_input_randomness(unsigned int type, unsigned int code, diff -urN ./linux-2.6.18.1/drivers/char/rtc.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/rtc.c --- ./linux-2.6.18.1/drivers/char/rtc.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/rtc.c 2007-05-20 00:11:12.000000000 +0900 @@ -82,10 +82,36 @@ #include #include +#ifdef CONFIG_MIPS +# include +#endif + #if 
defined(__i386__) #include #endif +#ifdef CONFIG_RTC_HISTOGRAM + +static cycles_t last_interrupt_time; + +#include + +#define CPU_MHZ (cpu_khz / 1000) + +#define HISTSIZE 10000 +static int histogram[HISTSIZE]; + +static int rtc_state; + +enum rtc_states { + S_STARTUP, /* First round - let the application start */ + S_IDLE, /* Waiting for an interrupt */ + S_WAITING_FOR_READ, /* Signal delivered. waiting for rtc_read() */ + S_READ_MISSED, /* Signal delivered, read() deadline missed */ +}; + +#endif + #ifdef __sparc__ #include #include @@ -218,7 +244,146 @@ return uip; } +#ifndef RTC_IRQ +# undef CONFIG_RTC_HISTOGRAM +#endif + +static inline void rtc_open_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i; + + last_interrupt_time = 0; + rtc_state = S_STARTUP; + rtc_irq_data = 0; + + for (i = 0; i < HISTSIZE; i++) + histogram[i] = 0; +#endif +} + +static inline void rtc_wake_event(void) +{ +#ifndef CONFIG_RTC_HISTOGRAM + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); +#else + if (!(rtc_status & RTC_IS_OPEN)) + return; + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + break; + /* Waiting for an interrupt */ + case S_IDLE: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + last_interrupt_time = get_cycles(); + rtc_state = S_WAITING_FOR_READ; + break; + + /* Signal has been delivered. waiting for rtc_read() */ + case S_WAITING_FOR_READ: + /* + * Well foo. The usermode application didn't + * schedule and read in time. + */ + last_interrupt_time = get_cycles(); + rtc_state = S_READ_MISSED; + printk("Read missed before next interrupt\n"); + break; + /* Signal has been delivered, read() deadline was missed */ + case S_READ_MISSED: + /* + * Not much we can do here. We're waiting for the usermode + * application to read the rtc + */ + last_interrupt_time = get_cycles(); + break; + } +#endif +} + +static inline void rtc_read_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + cycles_t now = get_cycles(); + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + rtc_state = S_IDLE; + break; + + /* Waiting for an interrupt */ + case S_IDLE: + printk("bug in rtc_read(): called in state S_IDLE!\n"); + break; + case S_WAITING_FOR_READ: /* + * Signal has been delivered. + * waiting for rtc_read() + */ + /* + * Well done + */ + case S_READ_MISSED: /* + * Signal has been delivered, read() + * deadline was missed + */ + /* + * So, you finally got here. + */ + if (!last_interrupt_time) + printk("bug in rtc_read(): last_interrupt_time = 0\n"); + rtc_state = S_IDLE; + { + cycles_t latency = now - last_interrupt_time; + unsigned long delta; /* Microseconds */ + + delta = latency; + delta /= CPU_MHZ; + + if (delta > 1000 * 1000) { + printk("rtc: eek\n"); + } else { + unsigned long slot = delta; + if (slot >= HISTSIZE) + slot = HISTSIZE - 1; + histogram[slot]++; + if (delta > 2000) + printk("wow! That was a " + "%ld millisec bump\n", + delta / 1000); + } + } + rtc_state = S_IDLE; + break; + } +#endif +} + +static inline void rtc_close_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i = 0; + unsigned long total = 0; + + for (i = 0; i < HISTSIZE; i++) + total += histogram[i]; + if (!total) + return; + + printk("\nrtc latency histogram of {%s/%d, %lu samples}:\n", + current->comm, current->pid, total); + for (i = 0; i < HISTSIZE; i++) { + if (histogram[i]) + printk("%d %d\n", i, histogram[i]); + } +#endif +} + #ifdef RTC_IRQ + /* * A very tiny interrupt handler. 
It runs with IRQF_DISABLED set, * but there is possibility of conflicting with the set_rtc_mmss() @@ -262,7 +427,7 @@ if (rtc_callback) rtc_callback->func(rtc_callback->private_data); spin_unlock(&rtc_task_lock); - wake_up_interruptible(&rtc_wait); + wake_up_interruptible(&rtc_wait); kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); @@ -376,6 +541,8 @@ schedule(); } while (1); + rtc_read_event(); + if (count == sizeof(unsigned int)) retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int); else @@ -608,6 +775,11 @@ save_freq_select = CMOS_READ(RTC_FREQ_SELECT); CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + /* + * Make CMOS date writes nonpreemptible even on PREEMPT_RT. + * There's a limit to everything! =B-) + */ + preempt_disable(); #ifdef CONFIG_MACH_DECSTATION CMOS_WRITE(real_yrs, RTC_DEC_YEAR); #endif @@ -617,6 +789,7 @@ CMOS_WRITE(hrs, RTC_HOURS); CMOS_WRITE(min, RTC_MINUTES); CMOS_WRITE(sec, RTC_SECONDS); + preempt_enable(); CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); @@ -715,6 +888,7 @@ if(rtc_status & RTC_IS_OPEN) goto out_busy; + rtc_open_event(); rtc_status |= RTC_IS_OPEN; rtc_irq_data = 0; @@ -770,6 +944,7 @@ rtc_irq_data = 0; rtc_status &= ~RTC_IS_OPEN; spin_unlock_irq (&rtc_lock); + rtc_close_event(); return 0; } @@ -1153,6 +1328,7 @@ printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq); /* Now we have new data */ + rtc_wake_event(); wake_up_interruptible(&rtc_wait); kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); diff -urN ./linux-2.6.18.1/drivers/char/rtc.c.orig linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/rtc.c.orig --- ./linux-2.6.18.1/drivers/char/rtc.c.orig 1970-01-01 09:00:00.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/rtc.c.orig 2007-05-19 23:58:35.000000000 +0900 @@ -0,0 +1,1557 @@ +/* + * Real Time Clock interface for Linux + * + * Copyright (C) 1996 Paul Gortmaker + * + * This driver allows use of the real time clock (built into + * nearly all computers) from user space. It exports the /dev/rtc + * interface supporting various ioctl() and also the + * /proc/driver/rtc pseudo-file for status information. + * + * The ioctls can be used to set the interrupt behaviour and + * generation rate from the RTC via IRQ 8. Then the /dev/rtc + * interface can be used to make use of these timer interrupts, + * be they interval or alarm based. + * + * The /dev/rtc interface will block on reads until an interrupt + * has been received. If a RTC interrupt has already happened, + * it will output an unsigned long and then block. The output value + * contains the interrupt status in the low byte and the number of + * interrupts since the last read in the remaining high bytes. The + * /dev/rtc interface can also be used with the select(2) call. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Based on other minimal char device drivers, like Alan's + * watchdog, Ted's random, etc. etc. + * + * 1.07 Paul Gortmaker. + * 1.08 Miquel van Smoorenburg: disallow certain things on the + * DEC Alpha as the CMOS clock is also used for other things. + * 1.09 Nikita Schmidt: epoch support and some Alpha cleanup. 
+ * 1.09a Pete Zaitcev: Sun SPARC + * 1.09b Jeff Garzik: Modularize, init cleanup + * 1.09c Jeff Garzik: SMP cleanup + * 1.10 Paul Barton-Davis: add support for async I/O + * 1.10a Andrea Arcangeli: Alpha updates + * 1.10b Andrew Morton: SMP lock fix + * 1.10c Cesar Barros: SMP locking fixes and cleanup + * 1.10d Paul Gortmaker: delete paranoia check in rtc_exit + * 1.10e Maciej W. Rozycki: Handle DECstation's year weirdness. + * 1.11 Takashi Iwai: Kernel access functions + * rtc_register/rtc_unregister/rtc_control + * 1.11a Daniele Bellucci: Audit create_proc_read_entry in rtc_init + * 1.12 Venkatesh Pallipadi: Hooks for emulating rtc on HPET base-timer + * CONFIG_HPET_EMULATE_RTC + * 1.12a Maciej W. Rozycki: Handle memory-mapped chips properly. + * 1.12ac Alan Cox: Allow read access to the day of week register + */ + +#define RTC_VERSION "1.12ac" + +/* + * Note that *all* calls to CMOS_READ and CMOS_WRITE are done with + * interrupts disabled. Due to the index-port/data-port (0x70/0x71) + * design of the RTC, we don't want two different things trying to + * get to it at once. (e.g. the periodic 11 min sync from time.c vs. + * this driver.) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_MIPS +# include +#endif + +#if defined(__i386__) +#include +#endif + +#ifdef CONFIG_RTC_HISTOGRAM + +static cycles_t last_interrupt_time; + +#include + +#define CPU_MHZ (cpu_khz / 1000) + +#define HISTSIZE 10000 +static int histogram[HISTSIZE]; + +static int rtc_state; + +enum rtc_states { + S_STARTUP, /* First round - let the application start */ + S_IDLE, /* Waiting for an interrupt */ + S_WAITING_FOR_READ, /* Signal delivered. waiting for rtc_read() */ + S_READ_MISSED, /* Signal delivered, read() deadline missed */ +}; + +#endif + +#ifdef __sparc__ +#include +#include +#ifdef __sparc_v9__ +#include +#endif + +static unsigned long rtc_port; +static int rtc_irq = PCI_IRQ_NONE; +#endif + +#ifdef CONFIG_HPET_RTC_IRQ +#undef RTC_IRQ +#endif + +#ifdef RTC_IRQ +static int rtc_has_irq = 1; +#endif + +#ifndef CONFIG_HPET_EMULATE_RTC +#define is_hpet_enabled() 0 +#define hpet_set_alarm_time(hrs, min, sec) 0 +#define hpet_set_periodic_freq(arg) 0 +#define hpet_mask_rtc_irq_bit(arg) 0 +#define hpet_set_rtc_irq_bit(arg) 0 +#define hpet_rtc_timer_init() do { } while (0) +#define hpet_rtc_dropped_irq() 0 +static inline irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) {return 0;} +#else +extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs); +#endif + +/* + * We sponge a minor off of the misc major. No need slurping + * up another valuable major dev number for this. If you add + * an ioctl, make sure you don't conflict with SPARC's RTC + * ioctls. 
+ */ + +static struct fasync_struct *rtc_async_queue; + +static DECLARE_WAIT_QUEUE_HEAD(rtc_wait); + +#ifdef RTC_IRQ +static struct timer_list rtc_irq_timer; +#endif + +static ssize_t rtc_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos); + +static int rtc_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); + +#ifdef RTC_IRQ +static unsigned int rtc_poll(struct file *file, poll_table *wait); +#endif + +static void get_rtc_alm_time (struct rtc_time *alm_tm); +#ifdef RTC_IRQ +static void rtc_dropped_irq(unsigned long data); + +static void set_rtc_irq_bit_locked(unsigned char bit); +static void mask_rtc_irq_bit_locked(unsigned char bit); + +static inline void set_rtc_irq_bit(unsigned char bit) +{ + spin_lock_irq(&rtc_lock); + set_rtc_irq_bit_locked(bit); + spin_unlock_irq(&rtc_lock); +} + +static void mask_rtc_irq_bit(unsigned char bit) +{ + spin_lock_irq(&rtc_lock); + mask_rtc_irq_bit_locked(bit); + spin_unlock_irq(&rtc_lock); +} +#endif + +static int rtc_proc_open(struct inode *inode, struct file *file); + +/* + * Bits in rtc_status. (6 bits of room for future expansion) + */ + +#define RTC_IS_OPEN 0x01 /* means /dev/rtc is in use */ +#define RTC_TIMER_ON 0x02 /* missed irq timer active */ + +/* + * rtc_status is never changed by rtc_interrupt, and ioctl/open/close is + * protected by the big kernel lock. However, ioctl can still disable the timer + * in rtc_status and then with del_timer after the interrupt has read + * rtc_status but before mod_timer is called, which would then reenable the + * timer (but you would need to have an awful timing before you'd trip on it) + */ +static unsigned long rtc_status = 0; /* bitmapped status byte. */ +static unsigned long rtc_freq = 0; /* Current periodic IRQ rate */ +static unsigned long rtc_irq_data = 0; /* our output to the world */ +static unsigned long rtc_max_user_freq = 64; /* > this, need CAP_SYS_RESOURCE */ + +#ifdef RTC_IRQ +/* + * rtc_task_lock nests inside rtc_lock. + */ +static DEFINE_SPINLOCK(rtc_task_lock); +static rtc_task_t *rtc_callback = NULL; +#endif + +/* + * If this driver ever becomes modularised, it will be really nice + * to make the epoch retain its value across module reload... + */ + +static unsigned long epoch = 1900; /* year corresponding to 0x00 */ + +static const unsigned char days_in_mo[] = +{0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}; + +/* + * Returns true if a clock update is in progress + */ +static inline unsigned char rtc_is_updating(void) +{ + unsigned char uip; + + spin_lock_irq(&rtc_lock); + uip = (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP); + spin_unlock_irq(&rtc_lock); + return uip; +} + +#ifndef RTC_IRQ +# undef CONFIG_RTC_HISTOGRAM +#endif + +static inline void rtc_open_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i; + + last_interrupt_time = 0; + rtc_state = S_STARTUP; + rtc_irq_data = 0; + + for (i = 0; i < HISTSIZE; i++) + histogram[i] = 0; +#endif +} + +static inline void rtc_wake_event(void) +{ +#ifndef CONFIG_RTC_HISTOGRAM + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); +#else + if (!(rtc_status & RTC_IS_OPEN)) + return; + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + break; + /* Waiting for an interrupt */ + case S_IDLE: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + last_interrupt_time = get_cycles(); + rtc_state = S_WAITING_FOR_READ; + break; + + /* Signal has been delivered. waiting for rtc_read() */ + case S_WAITING_FOR_READ: + /* + * Well foo. 
The usermode application didn't + * schedule and read in time. + */ + last_interrupt_time = get_cycles(); + rtc_state = S_READ_MISSED; + printk("Read missed before next interrupt\n"); + break; + /* Signal has been delivered, read() deadline was missed */ + case S_READ_MISSED: + /* + * Not much we can do here. We're waiting for the usermode + * application to read the rtc + */ + last_interrupt_time = get_cycles(); + break; + } +#endif +} + +static inline void rtc_read_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + cycles_t now = get_cycles(); + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + rtc_state = S_IDLE; + break; + + /* Waiting for an interrupt */ + case S_IDLE: + printk("bug in rtc_read(): called in state S_IDLE!\n"); + break; + case S_WAITING_FOR_READ: /* + * Signal has been delivered. + * waiting for rtc_read() + */ + /* + * Well done + */ + case S_READ_MISSED: /* + * Signal has been delivered, read() + * deadline was missed + */ + /* + * So, you finally got here. + */ + if (!last_interrupt_time) + printk("bug in rtc_read(): last_interrupt_time = 0\n"); + rtc_state = S_IDLE; + { + cycles_t latency = now - last_interrupt_time; + unsigned long delta; /* Microseconds */ + + delta = latency; + delta /= CPU_MHZ; + + if (delta > 1000 * 1000) { + printk("rtc: eek\n"); + } else { + unsigned long slot = delta; + if (slot >= HISTSIZE) + slot = HISTSIZE - 1; + histogram[slot]++; + if (delta > 2000) + printk("wow! That was a " + "%ld millisec bump\n", + delta / 1000); + } + } + rtc_state = S_IDLE; + break; + } +#endif +} + +static inline void rtc_close_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i = 0; + unsigned long total = 0; + + for (i = 0; i < HISTSIZE; i++) + total += histogram[i]; + if (!total) + return; + + printk("\nrtc latency histogram of {%s/%d, %lu samples}:\n", + current->comm, current->pid, total); + for (i = 0; i < HISTSIZE; i++) { + if (histogram[i]) + printk("%d %d\n", i, histogram[i]); + } +#endif +} + +#ifdef RTC_IRQ + +/* + * A very tiny interrupt handler. It runs with IRQF_DISABLED set, + * but there is possibility of conflicting with the set_rtc_mmss() + * call (the rtc irq and the timer irq can easily run at the same + * time in two different CPUs). So we need to serialize + * accesses to the chip with the rtc_lock spinlock that each + * architecture should implement in the timer code. + * (See ./arch/XXXX/kernel/time.c for the set_rtc_mmss() function.) + */ + +irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + /* + * Can be an alarm interrupt, update complete interrupt, + * or a periodic interrupt. We store the status in the + * low byte and the number of interrupts received since + * the last read in the remainder of rtc_irq_data. + */ + + spin_lock (&rtc_lock); + rtc_irq_data += 0x100; + rtc_irq_data &= ~0xff; + if (is_hpet_enabled()) { + /* + * In this case it is HPET RTC interrupt handler + * calling us, with the interrupt information + * passed as arg1, instead of irq. 
+ */ + rtc_irq_data |= (unsigned long)irq & 0xF0; + } else { + rtc_irq_data |= (CMOS_READ(RTC_INTR_FLAGS) & 0xF0); + } + + if (rtc_status & RTC_TIMER_ON) + mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); + + spin_unlock (&rtc_lock); + + /* Now do the rest of the actions */ + spin_lock(&rtc_task_lock); + if (rtc_callback) + rtc_callback->func(rtc_callback->private_data); + spin_unlock(&rtc_task_lock); + wake_up_interruptible(&rtc_wait); + + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + + return IRQ_HANDLED; +} +#endif + +/* + * sysctl-tuning infrastructure. + */ +static ctl_table rtc_table[] = { + { + .ctl_name = 1, + .procname = "max-user-freq", + .data = &rtc_max_user_freq, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +static ctl_table rtc_root[] = { + { + .ctl_name = 1, + .procname = "rtc", + .maxlen = 0, + .mode = 0555, + .child = rtc_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table dev_root[] = { + { + .ctl_name = CTL_DEV, + .procname = "dev", + .maxlen = 0, + .mode = 0555, + .child = rtc_root, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header *sysctl_header; + +static int __init init_sysctl(void) +{ + sysctl_header = register_sysctl_table(dev_root, 0); + return 0; +} + +static void __exit cleanup_sysctl(void) +{ + unregister_sysctl_table(sysctl_header); +} + +/* + * Now all the various file operations that we export. + */ + +static ssize_t rtc_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ +#ifndef RTC_IRQ + return -EIO; +#else + DECLARE_WAITQUEUE(wait, current); + unsigned long data; + ssize_t retval; + + if (rtc_has_irq == 0) + return -EIO; + + /* + * Historically this function used to assume that sizeof(unsigned long) + * is the same in userspace and kernelspace. This lead to problems + * for configurations with multiple ABIs such a the MIPS o32 and 64 + * ABIs supported on the same kernel. So now we support read of both + * 4 and 8 bytes and assume that's the sizeof(unsigned long) in the + * userspace ABI. + */ + if (count != sizeof(unsigned int) && count != sizeof(unsigned long)) + return -EINVAL; + + add_wait_queue(&rtc_wait, &wait); + + do { + /* First make it right. Then make it fast. Putting this whole + * block within the parentheses of a while would be too + * confusing. And no, xchg() is not the answer. */ + + __set_current_state(TASK_INTERRUPTIBLE); + + spin_lock_irq (&rtc_lock); + data = rtc_irq_data; + rtc_irq_data = 0; + spin_unlock_irq (&rtc_lock); + + if (data != 0) + break; + + if (file->f_flags & O_NONBLOCK) { + retval = -EAGAIN; + goto out; + } + if (signal_pending(current)) { + retval = -ERESTARTSYS; + goto out; + } + schedule(); + } while (1); + + rtc_read_event(); + + if (count == sizeof(unsigned int)) + retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int); + else + retval = put_user(data, (unsigned long __user *)buf) ?: sizeof(long); + if (!retval) + retval = count; + out: + current->state = TASK_RUNNING; + remove_wait_queue(&rtc_wait, &wait); + + return retval; +#endif +} + +static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel) +{ + struct rtc_time wtime; + +#ifdef RTC_IRQ + if (rtc_has_irq == 0) { + switch (cmd) { + case RTC_AIE_OFF: + case RTC_AIE_ON: + case RTC_PIE_OFF: + case RTC_PIE_ON: + case RTC_UIE_OFF: + case RTC_UIE_ON: + case RTC_IRQP_READ: + case RTC_IRQP_SET: + return -EINVAL; + }; + } +#endif + + switch (cmd) { +#ifdef RTC_IRQ + case RTC_AIE_OFF: /* Mask alarm int. enab. 
bit */ + { + mask_rtc_irq_bit(RTC_AIE); + return 0; + } + case RTC_AIE_ON: /* Allow alarm interrupts. */ + { + set_rtc_irq_bit(RTC_AIE); + return 0; + } + case RTC_PIE_OFF: /* Mask periodic int. enab. bit */ + { + unsigned long flags; /* can be called from isr via rtc_control() */ + spin_lock_irqsave (&rtc_lock, flags); + mask_rtc_irq_bit_locked(RTC_PIE); + if (rtc_status & RTC_TIMER_ON) { + rtc_status &= ~RTC_TIMER_ON; + del_timer(&rtc_irq_timer); + } + spin_unlock_irqrestore (&rtc_lock, flags); + return 0; + } + case RTC_PIE_ON: /* Allow periodic ints */ + { + unsigned long flags; /* can be called from isr via rtc_control() */ + /* + * We don't really want Joe User enabling more + * than 64Hz of interrupts on a multi-user machine. + */ + if (!kernel && (rtc_freq > rtc_max_user_freq) && + (!capable(CAP_SYS_RESOURCE))) + return -EACCES; + + spin_lock_irqsave (&rtc_lock, flags); + if (!(rtc_status & RTC_TIMER_ON)) { + rtc_irq_timer.expires = jiffies + HZ/rtc_freq + 2*HZ/100; + add_timer(&rtc_irq_timer); + rtc_status |= RTC_TIMER_ON; + } + set_rtc_irq_bit_locked(RTC_PIE); + spin_unlock_irqrestore (&rtc_lock, flags); + return 0; + } + case RTC_UIE_OFF: /* Mask ints from RTC updates. */ + { + mask_rtc_irq_bit(RTC_UIE); + return 0; + } + case RTC_UIE_ON: /* Allow ints for RTC updates. */ + { + set_rtc_irq_bit(RTC_UIE); + return 0; + } +#endif + case RTC_ALM_READ: /* Read the present alarm time */ + { + /* + * This returns a struct rtc_time. Reading >= 0xc0 + * means "don't care" or "match all". Only the tm_hour, + * tm_min, and tm_sec values are filled in. + */ + memset(&wtime, 0, sizeof(struct rtc_time)); + get_rtc_alm_time(&wtime); + break; + } + case RTC_ALM_SET: /* Store a time into the alarm */ + { + /* + * This expects a struct rtc_time. Writing 0xff means + * "don't care" or "match all". Only the tm_hour, + * tm_min and tm_sec are used. 
+ */ + unsigned char hrs, min, sec; + struct rtc_time alm_tm; + + if (copy_from_user(&alm_tm, (struct rtc_time __user *)arg, + sizeof(struct rtc_time))) + return -EFAULT; + + hrs = alm_tm.tm_hour; + min = alm_tm.tm_min; + sec = alm_tm.tm_sec; + + spin_lock_irq(&rtc_lock); + if (hpet_set_alarm_time(hrs, min, sec)) { + /* + * Fallthru and set alarm time in CMOS too, + * so that we will get proper value in RTC_ALM_READ + */ + } + if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || + RTC_ALWAYS_BCD) + { + if (sec < 60) BIN_TO_BCD(sec); + else sec = 0xff; + + if (min < 60) BIN_TO_BCD(min); + else min = 0xff; + + if (hrs < 24) BIN_TO_BCD(hrs); + else hrs = 0xff; + } + CMOS_WRITE(hrs, RTC_HOURS_ALARM); + CMOS_WRITE(min, RTC_MINUTES_ALARM); + CMOS_WRITE(sec, RTC_SECONDS_ALARM); + spin_unlock_irq(&rtc_lock); + + return 0; + } + case RTC_RD_TIME: /* Read the time/date from RTC */ + { + memset(&wtime, 0, sizeof(struct rtc_time)); + rtc_get_rtc_time(&wtime); + break; + } + case RTC_SET_TIME: /* Set the RTC */ + { + struct rtc_time rtc_tm; + unsigned char mon, day, hrs, min, sec, leap_yr; + unsigned char save_control, save_freq_select; + unsigned int yrs; +#ifdef CONFIG_MACH_DECSTATION + unsigned int real_yrs; +#endif + + if (!capable(CAP_SYS_TIME)) + return -EACCES; + + if (copy_from_user(&rtc_tm, (struct rtc_time __user *)arg, + sizeof(struct rtc_time))) + return -EFAULT; + + yrs = rtc_tm.tm_year + 1900; + mon = rtc_tm.tm_mon + 1; /* tm_mon starts at zero */ + day = rtc_tm.tm_mday; + hrs = rtc_tm.tm_hour; + min = rtc_tm.tm_min; + sec = rtc_tm.tm_sec; + + if (yrs < 1970) + return -EINVAL; + + leap_yr = ((!(yrs % 4) && (yrs % 100)) || !(yrs % 400)); + + if ((mon > 12) || (day == 0)) + return -EINVAL; + + if (day > (days_in_mo[mon] + ((mon == 2) && leap_yr))) + return -EINVAL; + + if ((hrs >= 24) || (min >= 60) || (sec >= 60)) + return -EINVAL; + + if ((yrs -= epoch) > 255) /* They are unsigned */ + return -EINVAL; + + spin_lock_irq(&rtc_lock); +#ifdef CONFIG_MACH_DECSTATION + real_yrs = yrs; + yrs = 72; + + /* + * We want to keep the year set to 73 until March + * for non-leap years, so that Feb, 29th is handled + * correctly. + */ + if (!leap_yr && mon < 3) { + real_yrs--; + yrs = 73; + } +#endif + /* These limits and adjustments are independent of + * whether the chip is in binary mode or not. + */ + if (yrs > 169) { + spin_unlock_irq(&rtc_lock); + return -EINVAL; + } + if (yrs >= 100) + yrs -= 100; + + if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) + || RTC_ALWAYS_BCD) { + BIN_TO_BCD(sec); + BIN_TO_BCD(min); + BIN_TO_BCD(hrs); + BIN_TO_BCD(day); + BIN_TO_BCD(mon); + BIN_TO_BCD(yrs); + } + + save_control = CMOS_READ(RTC_CONTROL); + CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + + /* + * Make CMOS date writes nonpreemptible even on PREEMPT_RT. + * There's a limit to everything! =B-) + */ + preempt_disable(); +#ifdef CONFIG_MACH_DECSTATION + CMOS_WRITE(real_yrs, RTC_DEC_YEAR); +#endif + CMOS_WRITE(yrs, RTC_YEAR); + CMOS_WRITE(mon, RTC_MONTH); + CMOS_WRITE(day, RTC_DAY_OF_MONTH); + CMOS_WRITE(hrs, RTC_HOURS); + CMOS_WRITE(min, RTC_MINUTES); + CMOS_WRITE(sec, RTC_SECONDS); + preempt_enable(); + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + + spin_unlock_irq(&rtc_lock); + return 0; + } +#ifdef RTC_IRQ + case RTC_IRQP_READ: /* Read the periodic IRQ rate. 
*/ + { + return put_user(rtc_freq, (unsigned long __user *)arg); + } + case RTC_IRQP_SET: /* Set periodic IRQ rate. */ + { + int tmp = 0; + unsigned char val; + unsigned long flags; /* can be called from isr via rtc_control() */ + + /* + * The max we can do is 8192Hz. + */ + if ((arg < 2) || (arg > 8192)) + return -EINVAL; + /* + * We don't really want Joe User generating more + * than 64Hz of interrupts on a multi-user machine. + */ + if (!kernel && (arg > rtc_max_user_freq) && (!capable(CAP_SYS_RESOURCE))) + return -EACCES; + + while (arg > (1<f_flags & FASYNC) { + rtc_fasync (-1, file, 0); + } +no_irq: +#endif + + spin_lock_irq (&rtc_lock); + rtc_irq_data = 0; + rtc_status &= ~RTC_IS_OPEN; + spin_unlock_irq (&rtc_lock); + rtc_close_event(); + return 0; +} + +#ifdef RTC_IRQ +/* Called without the kernel lock - fine */ +static unsigned int rtc_poll(struct file *file, poll_table *wait) +{ + unsigned long l; + + if (rtc_has_irq == 0) + return 0; + + poll_wait(file, &rtc_wait, wait); + + spin_lock_irq (&rtc_lock); + l = rtc_irq_data; + spin_unlock_irq (&rtc_lock); + + if (l != 0) + return POLLIN | POLLRDNORM; + return 0; +} +#endif + +/* + * exported stuffs + */ + +EXPORT_SYMBOL(rtc_register); +EXPORT_SYMBOL(rtc_unregister); +EXPORT_SYMBOL(rtc_control); + +int rtc_register(rtc_task_t *task) +{ +#ifndef RTC_IRQ + return -EIO; +#else + if (task == NULL || task->func == NULL) + return -EINVAL; + spin_lock_irq(&rtc_lock); + if (rtc_status & RTC_IS_OPEN) { + spin_unlock_irq(&rtc_lock); + return -EBUSY; + } + spin_lock(&rtc_task_lock); + if (rtc_callback) { + spin_unlock(&rtc_task_lock); + spin_unlock_irq(&rtc_lock); + return -EBUSY; + } + rtc_status |= RTC_IS_OPEN; + rtc_callback = task; + spin_unlock(&rtc_task_lock); + spin_unlock_irq(&rtc_lock); + return 0; +#endif +} + +int rtc_unregister(rtc_task_t *task) +{ +#ifndef RTC_IRQ + return -EIO; +#else + unsigned char tmp; + + spin_lock_irq(&rtc_lock); + spin_lock(&rtc_task_lock); + if (rtc_callback != task) { + spin_unlock(&rtc_task_lock); + spin_unlock_irq(&rtc_lock); + return -ENXIO; + } + rtc_callback = NULL; + + /* disable controls */ + if (!hpet_mask_rtc_irq_bit(RTC_PIE | RTC_AIE | RTC_UIE)) { + tmp = CMOS_READ(RTC_CONTROL); + tmp &= ~RTC_PIE; + tmp &= ~RTC_AIE; + tmp &= ~RTC_UIE; + CMOS_WRITE(tmp, RTC_CONTROL); + CMOS_READ(RTC_INTR_FLAGS); + } + if (rtc_status & RTC_TIMER_ON) { + rtc_status &= ~RTC_TIMER_ON; + del_timer(&rtc_irq_timer); + } + rtc_status &= ~RTC_IS_OPEN; + spin_unlock(&rtc_task_lock); + spin_unlock_irq(&rtc_lock); + return 0; +#endif +} + +int rtc_control(rtc_task_t *task, unsigned int cmd, unsigned long arg) +{ +#ifndef RTC_IRQ + return -EIO; +#else + unsigned long flags; + if (cmd != RTC_PIE_ON && cmd != RTC_PIE_OFF && cmd != RTC_IRQP_SET) + return -EINVAL; + spin_lock_irqsave(&rtc_task_lock, flags); + if (rtc_callback != task) { + spin_unlock_irqrestore(&rtc_task_lock, flags); + return -ENXIO; + } + spin_unlock_irqrestore(&rtc_task_lock, flags); + return rtc_do_ioctl(cmd, arg, 1); +#endif +} + + +/* + * The various file operations we support. 
+ */ + +static const struct file_operations rtc_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = rtc_read, +#ifdef RTC_IRQ + .poll = rtc_poll, +#endif + .ioctl = rtc_ioctl, + .open = rtc_open, + .release = rtc_release, + .fasync = rtc_fasync, +}; + +static struct miscdevice rtc_dev = { + .minor = RTC_MINOR, + .name = "rtc", + .fops = &rtc_fops, +}; + +static const struct file_operations rtc_proc_fops = { + .owner = THIS_MODULE, + .open = rtc_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#if defined(RTC_IRQ) && !defined(__sparc__) +static irqreturn_t (*rtc_int_handler_ptr)(int irq, void *dev_id, struct pt_regs *regs); +#endif + +static int __init rtc_init(void) +{ + struct proc_dir_entry *ent; +#if defined(__alpha__) || defined(__mips__) + unsigned int year, ctrl; + char *guess = NULL; +#endif +#ifdef __sparc__ + struct linux_ebus *ebus; + struct linux_ebus_device *edev; +#ifdef __sparc_v9__ + struct sparc_isa_bridge *isa_br; + struct sparc_isa_device *isa_dev; +#endif +#endif +#ifndef __sparc__ + void *r; +#endif + +#ifdef __sparc__ + for_each_ebus(ebus) { + for_each_ebusdev(edev, ebus) { + if(strcmp(edev->prom_node->name, "rtc") == 0) { + rtc_port = edev->resource[0].start; + rtc_irq = edev->irqs[0]; + goto found; + } + } + } +#ifdef __sparc_v9__ + for_each_isa(isa_br) { + for_each_isadev(isa_dev, isa_br) { + if (strcmp(isa_dev->prom_node->name, "rtc") == 0) { + rtc_port = isa_dev->resource.start; + rtc_irq = isa_dev->irq; + goto found; + } + } + } +#endif + printk(KERN_ERR "rtc_init: no PC rtc found\n"); + return -EIO; + +found: + if (rtc_irq == PCI_IRQ_NONE) { + rtc_has_irq = 0; + goto no_irq; + } + + /* + * XXX Interrupt pin #7 in Espresso is shared between RTC and + * PCI Slot 2 INTA# (and some INTx# in Slot 1). + */ + if (request_irq(rtc_irq, rtc_interrupt, IRQF_SHARED, "rtc", (void *)&rtc_port)) { + printk(KERN_ERR "rtc: cannot register IRQ %d\n", rtc_irq); + return -EIO; + } +no_irq: +#else + if (RTC_IOMAPPED) + r = request_region(RTC_PORT(0), RTC_IO_EXTENT, "rtc"); + else + r = request_mem_region(RTC_PORT(0), RTC_IO_EXTENT, "rtc"); + if (!r) { + printk(KERN_ERR "rtc: I/O resource %lx is not free.\n", + (long)(RTC_PORT(0))); + return -EIO; + } + +#ifdef RTC_IRQ + if (is_hpet_enabled()) { + rtc_int_handler_ptr = hpet_rtc_interrupt; + } else { + rtc_int_handler_ptr = rtc_interrupt; + } + + if(request_irq(RTC_IRQ, rtc_int_handler_ptr, IRQF_DISABLED, "rtc", NULL)) { + /* Yeah right, seeing as irq 8 doesn't even hit the bus. */ + printk(KERN_ERR "rtc: IRQ %d is not free.\n", RTC_IRQ); + if (RTC_IOMAPPED) + release_region(RTC_PORT(0), RTC_IO_EXTENT); + else + release_mem_region(RTC_PORT(0), RTC_IO_EXTENT); + return -EIO; + } + hpet_rtc_timer_init(); + +#endif + +#endif /* __sparc__ vs. others */ + + if (misc_register(&rtc_dev)) { +#ifdef RTC_IRQ + free_irq(RTC_IRQ, NULL); +#endif + release_region(RTC_PORT(0), RTC_IO_EXTENT); + return -ENODEV; + } + + ent = create_proc_entry("driver/rtc", 0, NULL); + if (!ent) { +#ifdef RTC_IRQ + free_irq(RTC_IRQ, NULL); +#endif + release_region(RTC_PORT(0), RTC_IO_EXTENT); + misc_deregister(&rtc_dev); + return -ENOMEM; + } + ent->proc_fops = &rtc_proc_fops; + +#if defined(__alpha__) || defined(__mips__) + rtc_freq = HZ; + + /* Each operating system on an Alpha uses its own epoch. + Let's try to guess which one we are using now. 
*/ + + if (rtc_is_updating() != 0) + msleep(20); + + spin_lock_irq(&rtc_lock); + year = CMOS_READ(RTC_YEAR); + ctrl = CMOS_READ(RTC_CONTROL); + spin_unlock_irq(&rtc_lock); + + if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) + BCD_TO_BIN(year); /* This should never happen... */ + + if (year < 20) { + epoch = 2000; + guess = "SRM (post-2000)"; + } else if (year >= 20 && year < 48) { + epoch = 1980; + guess = "ARC console"; + } else if (year >= 48 && year < 72) { + epoch = 1952; + guess = "Digital UNIX"; +#if defined(__mips__) + } else if (year >= 72 && year < 74) { + epoch = 2000; + guess = "Digital DECstation"; +#else + } else if (year >= 70) { + epoch = 1900; + guess = "Standard PC (1900)"; +#endif + } + if (guess) + printk(KERN_INFO "rtc: %s epoch (%lu) detected\n", guess, epoch); +#endif +#ifdef RTC_IRQ + if (rtc_has_irq == 0) + goto no_irq2; + + init_timer(&rtc_irq_timer); + rtc_irq_timer.function = rtc_dropped_irq; + spin_lock_irq(&rtc_lock); + rtc_freq = 1024; + if (!hpet_set_periodic_freq(rtc_freq)) { + /* Initialize periodic freq. to CMOS reset default, which is 1024Hz */ + CMOS_WRITE(((CMOS_READ(RTC_FREQ_SELECT) & 0xF0) | 0x06), RTC_FREQ_SELECT); + } + spin_unlock_irq(&rtc_lock); +no_irq2: +#endif + + (void) init_sysctl(); + + printk(KERN_INFO "Real Time Clock Driver v" RTC_VERSION "\n"); + + return 0; +} + +static void __exit rtc_exit (void) +{ + cleanup_sysctl(); + remove_proc_entry ("driver/rtc", NULL); + misc_deregister(&rtc_dev); + +#ifdef __sparc__ + if (rtc_has_irq) + free_irq (rtc_irq, &rtc_port); +#else + if (RTC_IOMAPPED) + release_region(RTC_PORT(0), RTC_IO_EXTENT); + else + release_mem_region(RTC_PORT(0), RTC_IO_EXTENT); +#ifdef RTC_IRQ + if (rtc_has_irq) + free_irq (RTC_IRQ, NULL); +#endif +#endif /* __sparc__ */ +} + +module_init(rtc_init); +module_exit(rtc_exit); + +#ifdef RTC_IRQ +/* + * At IRQ rates >= 4096Hz, an interrupt may get lost altogether. + * (usually during an IDE disk interrupt, with IRQ unmasking off) + * Since the interrupt handler doesn't get called, the IRQ status + * byte doesn't get read, and the RTC stops generating interrupts. + * A timer is set, and will call this function if/when that happens. + * To get it out of this stalled state, we just read the status. + * At least a jiffy of interrupts (rtc_freq/HZ) will have been lost. + * (You *really* shouldn't be trying to use a non-realtime system + * for something that requires a steady > 1KHz signal anyways.) + */ + +static void rtc_dropped_irq(unsigned long data) +{ + unsigned long freq; + + spin_lock_irq (&rtc_lock); + + if (hpet_rtc_dropped_irq()) { + spin_unlock_irq(&rtc_lock); + return; + } + + /* Just in case someone disabled the timer from behind our back... */ + if (rtc_status & RTC_TIMER_ON) + mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); + + rtc_irq_data += ((rtc_freq/HZ)<<8); + rtc_irq_data &= ~0xff; + rtc_irq_data |= (CMOS_READ(RTC_INTR_FLAGS) & 0xF0); /* restart */ + + freq = rtc_freq; + + spin_unlock_irq(&rtc_lock); + + printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq); + + /* Now we have new data */ + rtc_wake_event(); + wake_up_interruptible(&rtc_wait); + + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); +} +#endif + +/* + * Info exported via "/proc/driver/rtc". + */ + +static int rtc_proc_show(struct seq_file *seq, void *v) +{ +#define YN(bit) ((ctrl & bit) ? "yes" : "no") +#define NY(bit) ((ctrl & bit) ? 
"no" : "yes") + struct rtc_time tm; + unsigned char batt, ctrl; + unsigned long freq; + + spin_lock_irq(&rtc_lock); + batt = CMOS_READ(RTC_VALID) & RTC_VRT; + ctrl = CMOS_READ(RTC_CONTROL); + freq = rtc_freq; + spin_unlock_irq(&rtc_lock); + + + rtc_get_rtc_time(&tm); + + /* + * There is no way to tell if the luser has the RTC set for local + * time or for Universal Standard Time (GMT). Probably local though. + */ + seq_printf(seq, + "rtc_time\t: %02d:%02d:%02d\n" + "rtc_date\t: %04d-%02d-%02d\n" + "rtc_epoch\t: %04lu\n", + tm.tm_hour, tm.tm_min, tm.tm_sec, + tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, epoch); + + get_rtc_alm_time(&tm); + + /* + * We implicitly assume 24hr mode here. Alarm values >= 0xc0 will + * match any value for that particular field. Values that are + * greater than a valid time, but less than 0xc0 shouldn't appear. + */ + seq_puts(seq, "alarm\t\t: "); + if (tm.tm_hour <= 24) + seq_printf(seq, "%02d:", tm.tm_hour); + else + seq_puts(seq, "**:"); + + if (tm.tm_min <= 59) + seq_printf(seq, "%02d:", tm.tm_min); + else + seq_puts(seq, "**:"); + + if (tm.tm_sec <= 59) + seq_printf(seq, "%02d\n", tm.tm_sec); + else + seq_puts(seq, "**\n"); + + seq_printf(seq, + "DST_enable\t: %s\n" + "BCD\t\t: %s\n" + "24hr\t\t: %s\n" + "square_wave\t: %s\n" + "alarm_IRQ\t: %s\n" + "update_IRQ\t: %s\n" + "periodic_IRQ\t: %s\n" + "periodic_freq\t: %ld\n" + "batt_status\t: %s\n", + YN(RTC_DST_EN), + NY(RTC_DM_BINARY), + YN(RTC_24H), + YN(RTC_SQWE), + YN(RTC_AIE), + YN(RTC_UIE), + YN(RTC_PIE), + freq, + batt ? "okay" : "dead"); + + return 0; +#undef YN +#undef NY +} + +static int rtc_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, rtc_proc_show, NULL); +} + +void rtc_get_rtc_time(struct rtc_time *rtc_tm) +{ + unsigned long uip_watchdog = jiffies, flags; + unsigned char ctrl; +#ifdef CONFIG_MACH_DECSTATION + unsigned int real_year; +#endif + + /* + * read RTC once any update in progress is done. The update + * can take just over 2ms. We wait 20ms. There is no need to + * to poll-wait (up to 1s - eeccch) for the falling edge of RTC_UIP. + * If you need to know *exactly* when a second has started, enable + * periodic update complete interrupts, (via ioctl) and then + * immediately read /dev/rtc which will block until you get the IRQ. + * Once the read clears, read the RTC time (again via ioctl). Easy. + */ + + while (rtc_is_updating() != 0 && jiffies - uip_watchdog < 2*HZ/100) { + barrier(); + cpu_relax(); + } + + /* + * Only the values that we read from the RTC are set. We leave + * tm_wday, tm_yday and tm_isdst untouched. Note that while the + * RTC has RTC_DAY_OF_WEEK, we should usually ignore it, as it is + * only updated by the RTC when initially set to a non-zero value. 
+ */ + spin_lock_irqsave(&rtc_lock, flags); + rtc_tm->tm_sec = CMOS_READ(RTC_SECONDS); + rtc_tm->tm_min = CMOS_READ(RTC_MINUTES); + rtc_tm->tm_hour = CMOS_READ(RTC_HOURS); + rtc_tm->tm_mday = CMOS_READ(RTC_DAY_OF_MONTH); + rtc_tm->tm_mon = CMOS_READ(RTC_MONTH); + rtc_tm->tm_year = CMOS_READ(RTC_YEAR); + /* Only set from 2.6.16 onwards */ + rtc_tm->tm_wday = CMOS_READ(RTC_DAY_OF_WEEK); + +#ifdef CONFIG_MACH_DECSTATION + real_year = CMOS_READ(RTC_DEC_YEAR); +#endif + ctrl = CMOS_READ(RTC_CONTROL); + spin_unlock_irqrestore(&rtc_lock, flags); + + if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) + { + BCD_TO_BIN(rtc_tm->tm_sec); + BCD_TO_BIN(rtc_tm->tm_min); + BCD_TO_BIN(rtc_tm->tm_hour); + BCD_TO_BIN(rtc_tm->tm_mday); + BCD_TO_BIN(rtc_tm->tm_mon); + BCD_TO_BIN(rtc_tm->tm_year); + BCD_TO_BIN(rtc_tm->tm_wday); + } + +#ifdef CONFIG_MACH_DECSTATION + rtc_tm->tm_year += real_year - 72; +#endif + + /* + * Account for differences between how the RTC uses the values + * and how they are defined in a struct rtc_time; + */ + if ((rtc_tm->tm_year += (epoch - 1900)) <= 69) + rtc_tm->tm_year += 100; + + rtc_tm->tm_mon--; +} + +static void get_rtc_alm_time(struct rtc_time *alm_tm) +{ + unsigned char ctrl; + + /* + * Only the values that we read from the RTC are set. That + * means only tm_hour, tm_min, and tm_sec. + */ + spin_lock_irq(&rtc_lock); + alm_tm->tm_sec = CMOS_READ(RTC_SECONDS_ALARM); + alm_tm->tm_min = CMOS_READ(RTC_MINUTES_ALARM); + alm_tm->tm_hour = CMOS_READ(RTC_HOURS_ALARM); + ctrl = CMOS_READ(RTC_CONTROL); + spin_unlock_irq(&rtc_lock); + + if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) + { + BCD_TO_BIN(alm_tm->tm_sec); + BCD_TO_BIN(alm_tm->tm_min); + BCD_TO_BIN(alm_tm->tm_hour); + } +} + +#ifdef RTC_IRQ +/* + * Used to disable/enable interrupts for any one of UIE, AIE, PIE. + * Rumour has it that if you frob the interrupt enable/disable + * bits in RTC_CONTROL, you should read RTC_INTR_FLAGS, to + * ensure you actually start getting interrupts. Probably for + * compatibility with older/broken chipset RTC implementations. + * We also clear out any old irq data after an ioctl() that + * meddles with the interrupt enable/disable bits. 
+ */ + +static void mask_rtc_irq_bit_locked(unsigned char bit) +{ + unsigned char val; + + if (hpet_mask_rtc_irq_bit(bit)) + return; + val = CMOS_READ(RTC_CONTROL); + val &= ~bit; + CMOS_WRITE(val, RTC_CONTROL); + CMOS_READ(RTC_INTR_FLAGS); + + rtc_irq_data = 0; +} + +static void set_rtc_irq_bit_locked(unsigned char bit) +{ + unsigned char val; + + if (hpet_set_rtc_irq_bit(bit)) + return; + val = CMOS_READ(RTC_CONTROL); + val |= bit; + CMOS_WRITE(val, RTC_CONTROL); + CMOS_READ(RTC_INTR_FLAGS); + + rtc_irq_data = 0; +} +#endif + +MODULE_AUTHOR("Paul Gortmaker"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_MISCDEV(RTC_MINOR); diff -urN ./linux-2.6.18.1/drivers/char/sysrq.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/sysrq.c --- ./linux-2.6.18.1/drivers/char/sysrq.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/sysrq.c 2007-05-19 23:58:35.000000000 +0900 @@ -176,6 +176,23 @@ .enable_mask = SYSRQ_ENABLE_DUMP, }; +#if defined(__i386__) + +static void sysrq_handle_showallregs(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) +{ + nmi_show_all_regs(); +} + +static struct sysrq_key_op sysrq_showallregs_op = { + .handler = sysrq_handle_showallregs, + .help_msg = "showalLcpupc", + .action_msg = "Show Regs On All CPUs", +}; +#else +#define sysrq_showallregs_op (*(struct sysrq_key_op *)0) +#endif + static void sysrq_handle_showstate(int key, struct pt_regs *pt_regs, struct tty_struct *tty) { @@ -301,7 +318,7 @@ &sysrq_kill_op, /* i */ NULL, /* j */ &sysrq_SAK_op, /* k */ - NULL, /* l */ + &sysrq_showallregs_op, /* l */ &sysrq_showmem_op, /* m */ &sysrq_unrt_op, /* n */ /* This will often be registered as 'Off' at init time */ diff -urN ./linux-2.6.18.1/drivers/char/tty_io.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/tty_io.c --- ./linux-2.6.18.1/drivers/char/tty_io.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/char/tty_io.c 2007-05-19 23:58:35.000000000 +0900 @@ -254,6 +254,7 @@ printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) " "!= #fd's(%d) in %s\n", tty->name, tty->count, count, routine); + dump_stack(); return count; } #endif diff -urN ./linux-2.6.18.1/drivers/ide/ide-floppy.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/ide-floppy.c --- ./linux-2.6.18.1/drivers/ide/ide-floppy.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/ide-floppy.c 2007-05-19 23:58:35.000000000 +0900 @@ -1666,9 +1666,9 @@ atapi_status_t status; unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); status.all = HWIF(drive)->INB(IDE_STATUS_REG); - local_irq_restore(flags); + local_irq_restore_nort(flags); progress_indication = !status.b.dsc ? 
0 : 0x10000; } diff -urN ./linux-2.6.18.1/drivers/ide/ide-io.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/ide-io.c --- ./linux-2.6.18.1/drivers/ide/ide-io.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/ide-io.c 2007-05-19 23:58:35.000000000 +0900 @@ -1173,7 +1173,7 @@ ide_get_lock(ide_intr, hwgroup); /* caller must own ide_lock */ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); while (!hwgroup->busy) { hwgroup->busy = 1; @@ -1434,7 +1434,7 @@ #endif /* DISABLE_IRQ_NOSYNC */ /* local CPU only, * as if we were handling an interrupt */ - local_irq_disable(); + local_irq_disable_nort(); if (hwgroup->polling) { startstop = handler(drive); } else if (drive_is_ready(drive)) { diff -urN ./linux-2.6.18.1/drivers/ide/ide-iops.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/ide-iops.c --- ./linux-2.6.18.1/drivers/ide/ide-iops.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/ide-iops.c 2007-05-19 23:58:35.000000000 +0900 @@ -244,10 +244,10 @@ if (io_32bit) { if (io_32bit & 2) { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(drive, IDE_NSECTOR_REG); hwif->INSL(IDE_DATA_REG, buffer, wcount); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else hwif->INSL(IDE_DATA_REG, buffer, wcount); } else { @@ -266,10 +266,10 @@ if (io_32bit) { if (io_32bit & 2) { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(drive, IDE_NSECTOR_REG); hwif->OUTSL(IDE_DATA_REG, buffer, wcount); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else hwif->OUTSL(IDE_DATA_REG, buffer, wcount); } else { @@ -564,12 +564,12 @@ if (!(stat & BUSY_STAT)) break; - local_irq_restore(flags); + local_irq_restore_nort(flags); *startstop = ide_error(drive, "status timeout", stat); return 1; } } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* * Allow status to settle, then read it again. 
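The local_irq_*_nort() calls that the IDE hunks in this region switch to are the RT tree's "no-RT" variants: they behave exactly like the ordinary primitives on !PREEMPT_RT builds and become (near) no-ops on PREEMPT_RT, where these paths run in preemptible process context and hard interrupt disabling would only hurt latency. A sketch of the mapping, assuming the definitions this patch series conventionally ships (the exact header location and spelling may differ):

	#ifdef CONFIG_PREEMPT_RT
	# define local_irq_disable_nort()	do { } while (0)
	# define local_irq_enable_nort()	do { } while (0)
	# define local_irq_save_nort(flags)	local_save_flags(flags)
	# define local_irq_restore_nort(flags)	((void)(flags))
	#else
	# define local_irq_disable_nort()	local_irq_disable()
	# define local_irq_enable_nort()	local_irq_enable()
	# define local_irq_save_nort(flags)	local_irq_save(flags)
	# define local_irq_restore_nort(flags)	local_irq_restore(flags)
	#endif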
@@ -731,17 +731,15 @@ printk("%s: CHECK for good STATUS\n", drive->name); return 0; } - local_irq_save(flags); - SELECT_MASK(drive, 0); id = kmalloc(SECTOR_WORDS*4, GFP_ATOMIC); - if (!id) { - local_irq_restore(flags); + if (!id) return 0; - } + local_irq_save_nort(flags); + SELECT_MASK(drive, 0); ata_input_data(drive, id, SECTOR_WORDS); (void) hwif->INB(IDE_STATUS_REG); /* clear drive IRQ */ - local_irq_enable(); - local_irq_restore(flags); + local_irq_enable_nort(); + local_irq_restore_nort(flags); ide_fix_driveid(id); if (id) { drive->id->dma_ultra = id->dma_ultra; @@ -821,7 +819,7 @@ if (time_after(jiffies, timeout)) break; } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* diff -urN ./linux-2.6.18.1/drivers/ide/ide-lib.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/ide-lib.c --- ./linux-2.6.18.1/drivers/ide/ide-lib.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/ide-lib.c 2007-05-19 23:58:35.000000000 +0900 @@ -445,15 +445,16 @@ static void ide_dump_opcode(ide_drive_t *drive) { + unsigned long flags; struct request *rq; u8 opcode = 0; int found = 0; - spin_lock(&ide_lock); + spin_lock_irqsave(&ide_lock, flags); rq = NULL; if (HWGROUP(drive)) rq = HWGROUP(drive)->rq; - spin_unlock(&ide_lock); + spin_unlock_irqrestore(&ide_lock, flags); if (!rq) return; if (rq->flags & (REQ_DRIVE_CMD | REQ_DRIVE_TASK)) { @@ -481,10 +482,8 @@ static u8 ide_dump_ata_status(ide_drive_t *drive, const char *msg, u8 stat) { ide_hwif_t *hwif = HWIF(drive); - unsigned long flags; u8 err = 0; - local_irq_save(flags); printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (stat & BUSY_STAT) printk("Busy "); @@ -544,7 +543,7 @@ printk("\n"); } ide_dump_opcode(drive); - local_irq_restore(flags); + return err; } @@ -559,14 +558,11 @@ static u8 ide_dump_atapi_status(ide_drive_t *drive, const char *msg, u8 stat) { - unsigned long flags; - atapi_status_t status; atapi_error_t error; status.all = stat; error.all = 0; - local_irq_save(flags); printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (status.b.bsy) printk("Busy "); @@ -592,7 +588,7 @@ printk("}\n"); } ide_dump_opcode(drive); - local_irq_restore(flags); + return error.all; } diff -urN ./linux-2.6.18.1/drivers/ide/ide-probe.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/ide-probe.c --- ./linux-2.6.18.1/drivers/ide/ide-probe.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/ide-probe.c 2007-05-19 23:58:35.000000000 +0900 @@ -143,7 +143,7 @@ hwif->ata_input_data(drive, id, SECTOR_WORDS); drive->id_read = 1; - local_irq_enable(); + local_irq_enable_nort(); ide_fix_driveid(id); #if defined (CONFIG_SCSI_EATA_DMA) || defined (CONFIG_SCSI_EATA_PIO) || defined (CONFIG_SCSI_EATA) @@ -325,14 +325,14 @@ unsigned long flags; /* local CPU only; some systems need this */ - local_irq_save(flags); + local_irq_save_nort(flags); /* drive returned ID */ do_identify(drive, cmd); /* drive responded with ID */ rc = 0; /* clear drive IRQ */ (void) hwif->INB(IDE_STATUS_REG); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { /* drive refused ID */ rc = 2; @@ -804,7 +804,7 @@ } while ((stat & BUSY_STAT) && time_after(timeout, jiffies)); } - local_irq_restore(flags); + local_irq_restore_nort(flags); /* * Use cached IRQ number. It might be (and is...) 
changed by probe * code above diff -urN ./linux-2.6.18.1/drivers/ide/ide-taskfile.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/ide-taskfile.c --- ./linux-2.6.18.1/drivers/ide/ide-taskfile.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/ide-taskfile.c 2007-05-19 23:58:35.000000000 +0900 @@ -274,7 +274,7 @@ offset %= PAGE_SIZE; #ifdef CONFIG_HIGHMEM - local_irq_save(flags); + local_irq_save_nort(flags); #endif buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset; @@ -294,7 +294,7 @@ kunmap_atomic(buf, KM_BIO_SRC_IRQ); #ifdef CONFIG_HIGHMEM - local_irq_restore(flags); + local_irq_restore_nort(flags); #endif } @@ -460,7 +460,7 @@ } if (!drive->unmask) - local_irq_disable(); + local_irq_disable_nort(); ide_set_handler(drive, &task_out_intr, WAIT_WORSTCASE, NULL); ide_pio_datablock(drive, rq, 1); diff -urN ./linux-2.6.18.1/drivers/ide/pci/alim15x3.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/pci/alim15x3.c --- ./linux-2.6.18.1/drivers/ide/pci/alim15x3.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/pci/alim15x3.c 2007-05-19 23:58:35.000000000 +0900 @@ -322,7 +322,7 @@ if (r_clc >= 16) r_clc = 0; } - local_irq_save(flags); + local_irq_save_nort(flags); /* * PIO mode => ATA FIFO on, ATAPI FIFO off @@ -344,7 +344,7 @@ pci_write_config_byte(dev, port, s_clc); pci_write_config_byte(dev, port+drive->select.b.unit+2, (a_clc << 4) | r_clc); - local_irq_restore(flags); + local_irq_restore_nort(flags); /* * setup active rec @@ -600,7 +600,7 @@ } #endif /* defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_PROC_FS) */ - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision < 0xC2) { /* @@ -613,7 +613,7 @@ * clear bit 7 */ pci_write_config_byte(dev, 0x4b, tmpbyte & 0x7F); - local_irq_restore(flags); + local_irq_restore_nort(flags); return 0; } @@ -638,7 +638,7 @@ * 0:0.0 so if we didn't find one we know what is cooking. */ if (north && north->vendor != PCI_VENDOR_ID_AL) { - local_irq_restore(flags); + local_irq_restore_nort(flags); return 0; } @@ -661,7 +661,7 @@ pci_write_config_byte(isa_dev, 0x79, tmpbyte | 0x02); } } - local_irq_restore(flags); + local_irq_restore_nort(flags); return 0; } @@ -685,7 +685,7 @@ unsigned long flags; u8 tmpbyte; - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision >= 0xC2) { /* @@ -737,7 +737,7 @@ pci_write_config_byte(dev, 0x53, tmpbyte); - local_irq_restore(flags); + local_irq_restore_nort(flags); return(ata66); } diff -urN ./linux-2.6.18.1/drivers/ide/pci/cs5530.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/pci/cs5530.c --- ./linux-2.6.18.1/drivers/ide/pci/cs5530.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/pci/cs5530.c 2007-05-19 23:58:35.000000000 +0900 @@ -241,8 +241,8 @@ return 0; } - spin_lock_irqsave(&ide_lock, flags); - /* all CPUs (there should only be one CPU with this chipset) */ + /* Local CPU. ide_lock is acquired in do_ide_setup_pci_device. 
*/ + local_irq_save(flags); /* * Enable BusMaster and MemoryWriteAndInvalidate for the cs5530: @@ -294,7 +294,7 @@ pci_write_config_byte(master_0, 0x42, 0x00); pci_write_config_byte(master_0, 0x43, 0xc1); - spin_unlock_irqrestore(&ide_lock, flags); + local_irq_restore(flags); return 0; } diff -urN ./linux-2.6.18.1/drivers/ide/pci/hpt366.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/pci/hpt366.c --- ./linux-2.6.18.1/drivers/ide/pci/hpt366.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ide/pci/hpt366.c 2007-05-19 23:58:35.000000000 +0900 @@ -1496,7 +1496,7 @@ dma_old = hwif->INB(dmabase+2); - local_irq_save(flags); + local_irq_save_nort(flags); dma_new = dma_old; pci_read_config_byte(hwif->pci_dev, primary, &masterdma); @@ -1507,7 +1507,7 @@ if (dma_new != dma_old) hwif->OUTB(dma_new, dmabase+2); - local_irq_restore(flags); + local_irq_restore_nort(flags); ide_setup_dma(hwif, dmabase, 8); } diff -urN ./linux-2.6.18.1/drivers/ieee1394/ieee1394_types.h linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ieee1394/ieee1394_types.h --- ./linux-2.6.18.1/drivers/ieee1394/ieee1394_types.h 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ieee1394/ieee1394_types.h 2007-05-19 23:58:35.000000000 +0900 @@ -19,7 +19,7 @@ spinlock_t lock; u8 next; u32 allocations; - struct semaphore count; + struct compat_semaphore count; }; #define HPSB_TPOOL_INIT(_tp) \ diff -urN ./linux-2.6.18.1/drivers/ieee1394/nodemgr.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ieee1394/nodemgr.c --- ./linux-2.6.18.1/drivers/ieee1394/nodemgr.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ieee1394/nodemgr.c 2007-05-19 23:58:35.000000000 +0900 @@ -167,7 +167,7 @@ struct hpsb_host *host; struct list_head list; struct completion exited; - struct semaphore reset_sem; + struct compat_semaphore reset_sem; int pid; char daemon_name[15]; int kill_me; diff -urN ./linux-2.6.18.1/drivers/ieee1394/raw1394-private.h linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ieee1394/raw1394-private.h --- ./linux-2.6.18.1/drivers/ieee1394/raw1394-private.h 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/ieee1394/raw1394-private.h 2007-05-19 23:58:35.000000000 +0900 @@ -29,7 +29,7 @@ struct list_head req_pending; struct list_head req_complete; - struct semaphore complete_sem; + struct compat_semaphore complete_sem; spinlock_t reqlists_lock; wait_queue_head_t poll_wait_complete; diff -urN ./linux-2.6.18.1/drivers/input/gameport/gameport.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/input/gameport/gameport.c --- ./linux-2.6.18.1/drivers/input/gameport/gameport.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/input/gameport/gameport.c 2007-05-19 23:58:35.000000000 +0900 @@ -21,6 +21,7 @@ #include #include #include +#include #include /* HZ */ #include @@ -101,12 +102,12 @@ tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); GET_TIME(t1); for (t = 0; t < 50; t++) gameport_read(gameport); GET_TIME(t2); GET_TIME(t3); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t; } @@ -125,11 +126,11 @@ tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); rdtscl(t1); for (t = 0; t < 50; t++) gameport_read(gameport); rdtscl(t2); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if (t2 - t1 < tx) tx = t2 - t1; } diff 
-urN ./linux-2.6.18.1/drivers/input/serio/i8042.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/input/serio/i8042.c --- ./linux-2.6.18.1/drivers/input/serio/i8042.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/input/serio/i8042.c 2007-05-19 23:58:35.000000000 +0900 @@ -1084,7 +1084,7 @@ goto err_controller_cleanup; } - mod_timer(&i8042_timer, jiffies + I8042_POLL_PERIOD); + mod_timer(&i8042_timer, jiffies + 2); //I8042_POLL_PERIOD); return 0; err_unregister_ports: diff -urN ./linux-2.6.18.1/drivers/input/serio/i8042.h linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/input/serio/i8042.h --- ./linux-2.6.18.1/drivers/input/serio/i8042.h 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/input/serio/i8042.h 2007-05-19 23:58:35.000000000 +0900 @@ -43,7 +43,7 @@ * polling. */ -#define I8042_POLL_PERIOD HZ/20 +#define I8042_POLL_PERIOD (10*HZ) /* * Status register bits. diff -urN ./linux-2.6.18.1/drivers/media/dvb/dvb-core/dvb_frontend.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/media/dvb/dvb-core/dvb_frontend.c --- ./linux-2.6.18.1/drivers/media/dvb/dvb-core/dvb_frontend.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/media/dvb/dvb-core/dvb_frontend.c 2007-05-19 23:58:35.000000000 +0900 @@ -97,7 +97,7 @@ struct dvb_device *dvbdev; struct dvb_frontend_parameters parameters; struct dvb_fe_events events; - struct semaphore sem; + struct compat_semaphore sem; struct list_head list_head; wait_queue_head_t wait_queue; pid_t thread_pid; diff -urN ./linux-2.6.18.1/drivers/media/dvb/dvb-core/dvb_frontend.h linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/media/dvb/dvb-core/dvb_frontend.h --- ./linux-2.6.18.1/drivers/media/dvb/dvb-core/dvb_frontend.h 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/media/dvb/dvb-core/dvb_frontend.h 2007-05-19 23:58:35.000000000 +0900 @@ -138,7 +138,7 @@ int eventr; int overflow; wait_queue_head_t wait_queue; - struct semaphore sem; + struct compat_semaphore sem; }; struct dvb_frontend { diff -urN ./linux-2.6.18.1/drivers/net/3c527.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/3c527.c --- ./linux-2.6.18.1/drivers/net/3c527.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/3c527.c 2007-05-19 23:58:35.000000000 +0900 @@ -182,7 +182,7 @@ u16 rx_ring_tail; /* index to rx de-queue end */ - struct semaphore cmd_mutex; /* Serialises issuing of execute commands */ + struct compat_semaphore cmd_mutex; /* Serialises issuing of execute commands */ struct completion execution_cmd; /* Card has completed an execute command */ struct completion xceiver_cmd; /* Card has completed a tx or rx command */ }; diff -urN ./linux-2.6.18.1/drivers/net/3c59x.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/3c59x.c --- ./linux-2.6.18.1/drivers/net/3c59x.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/3c59x.c 2007-05-19 23:58:35.000000000 +0900 @@ -793,9 +793,9 @@ struct vortex_private *vp = netdev_priv(dev); unsigned long flags; local_save_flags(flags); - local_irq_disable(); + local_irq_disable_nort(); (vp->full_bus_master_rx ? 
boomerang_interrupt:vortex_interrupt)(dev->irq,dev,NULL); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #endif @@ -1724,6 +1724,7 @@ int next_tick = 60*HZ; int ok = 0; int media_status, old_window; + unsigned long flags; if (vortex_debug > 2) { printk(KERN_DEBUG "%s: Media selection timer tick happened, %s.\n", @@ -1731,7 +1732,7 @@ printk(KERN_DEBUG "dev->watchdog_timeo=%d\n", dev->watchdog_timeo); } - disable_irq_lockdep(dev->irq); + spin_lock_irqsave(&vp->lock, flags); old_window = ioread16(ioaddr + EL3_CMD) >> 13; EL3WINDOW(4); media_status = ioread16(ioaddr + Wn4_Media); @@ -1754,9 +1755,7 @@ case XCVR_MII: case XCVR_NWAY: { ok = 1; - spin_lock_bh(&vp->lock); vortex_check_media(dev, 0); - spin_unlock_bh(&vp->lock); } break; default: /* Other media types handled by Tx timeouts. */ @@ -1812,7 +1811,7 @@ dev->name, media_tbl[dev->if_port].name); EL3WINDOW(old_window); - enable_irq_lockdep(dev->irq); + spin_unlock_irqrestore(&vp->lock, flags); mod_timer(&vp->timer, RUN_AT(next_tick)); if (vp->deferred) iowrite16(FakeIntr, ioaddr + EL3_CMD); @@ -1845,13 +1844,17 @@ /* * Block interrupts because vortex_interrupt does a bare spin_lock() */ +#ifndef CONFIG_PREEMPT_RT unsigned long flags; local_irq_save(flags); +#endif if (vp->full_bus_master_tx) boomerang_interrupt(dev->irq, dev, NULL); else vortex_interrupt(dev->irq, dev, NULL); +#ifndef CONFIG_PREEMPT_RT local_irq_restore(flags); +#endif } } diff -urN ./linux-2.6.18.1/drivers/net/e1000/e1000_main.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/e1000/e1000_main.c --- ./linux-2.6.18.1/drivers/net/e1000/e1000_main.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/e1000/e1000_main.c 2007-05-19 23:58:35.000000000 +0900 @@ -2965,10 +2965,8 @@ (adapter->hw.mac_type == e1000_82573)) e1000_transfer_dhcp_info(adapter, skb); - local_irq_save(flags); - if (!spin_trylock(&tx_ring->tx_lock)) { + if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) { /* Collision - tell upper layer to requeue */ - local_irq_restore(flags); return NETDEV_TX_LOCKED; } diff -urN ./linux-2.6.18.1/drivers/net/hamradio/6pack.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/hamradio/6pack.c --- ./linux-2.6.18.1/drivers/net/hamradio/6pack.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/hamradio/6pack.c 2007-05-19 23:58:35.000000000 +0900 @@ -123,7 +123,7 @@ struct timer_list tx_t; struct timer_list resync_t; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; spinlock_t lock; }; diff -urN ./linux-2.6.18.1/drivers/net/hamradio/mkiss.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/hamradio/mkiss.c --- ./linux-2.6.18.1/drivers/net/hamradio/mkiss.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/hamradio/mkiss.c 2007-05-19 23:58:35.000000000 +0900 @@ -84,7 +84,7 @@ #define CRC_MODE_SMACK_TEST 4 atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; }; /*---------------------------------------------------------------------------*/ diff -urN ./linux-2.6.18.1/drivers/net/ibm_emac/ibm_emac_core.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/ibm_emac/ibm_emac_core.c --- ./linux-2.6.18.1/drivers/net/ibm_emac/ibm_emac_core.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/ibm_emac/ibm_emac_core.c 2007-05-19 23:58:35.000000000 +0900 @@ -1061,6 +1061,8 @@ ++dev->stats.tx_packets; dev->stats.tx_bytes += len; + spin_unlock(&dev->tx_lock); + return 0; } @@ 
-1074,6 +1076,7 @@
 	u16 ctrl = EMAC_TX_CTRL_GFCS | EMAC_TX_CTRL_GP | MAL_TX_CTRL_READY |
 	    MAL_TX_CTRL_LAST | emac_tx_csum(dev, skb);
+	spin_lock(&dev->tx_lock);
 	slot = dev->tx_slot++;
 	if (dev->tx_slot == NUM_TX_BUFF) {
 		dev->tx_slot = 0;
@@ -1243,6 +1246,7 @@
 	DBG2("%d: poll_tx, %d %d" NL, dev->def->index, dev->tx_cnt,
 	     dev->ack_slot);
+	spin_lock(&dev->tx_lock);
 	if (dev->tx_cnt) {
 		u16 ctrl;
 		int slot = dev->ack_slot, n = 0;
@@ -1252,6 +1256,7 @@
 			struct sk_buff *skb = dev->tx_skb[slot];
 			++n;
+			spin_unlock(&dev->tx_lock);
 			if (skb) {
 				dev_kfree_skb(skb);
 				dev->tx_skb[slot] = NULL;
@@ -1261,6 +1266,7 @@
 			if (unlikely(EMAC_IS_BAD_TX(ctrl)))
 				emac_parse_tx_error(dev, ctrl);
+			spin_lock(&dev->tx_lock);
 			if (--dev->tx_cnt)
 				goto again;
 		}
@@ -1273,6 +1279,7 @@
 			DBG2("%d: tx %d pkts" NL, dev->def->index, n);
 		}
 	}
+	spin_unlock(&dev->tx_lock);
 }
 
 static inline void emac_recycle_rx_skb(struct ocp_enet_private *dev, int slot,
@@ -1966,6 +1973,7 @@
 	dev->ldev = &ocpdev->dev;
 	dev->def = ocpdev->def;
 	SET_MODULE_OWNER(ndev);
+	spin_lock_init(&dev->tx_lock);
 
 	/* Find MAL device we are connected to */
 	maldev =
diff -urN ./linux-2.6.18.1/drivers/net/ibm_emac/ibm_emac_core.h linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/ibm_emac/ibm_emac_core.h
--- ./linux-2.6.18.1/drivers/net/ibm_emac/ibm_emac_core.h	2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/ibm_emac/ibm_emac_core.h	2007-05-19 23:58:35.000000000 +0900
@@ -193,6 +193,8 @@
 	struct ibm_emac_error_stats estats;
 	struct net_device_stats nstats;
 
+	spinlock_t tx_lock;
+
 	struct device* ldev;
 };
diff -urN ./linux-2.6.18.1/drivers/net/netconsole.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/netconsole.c
--- ./linux-2.6.18.1/drivers/net/netconsole.c	2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/netconsole.c	2007-05-19 23:58:35.000000000 +0900
@@ -74,16 +74,22 @@
 	if (!np.dev)
 		return;
 
-	local_irq_save(flags);
+	/*
+	 * A bit hairy. Netconsole uses mutexes (indirectly) and
+	 * thus must have interrupts enabled:
+	 */
+	local_irq_save_nort(flags);
 
 	for(left = len; left; ) {
 		frag = min(left, MAX_PRINT_CHUNK);
+		WARN_ON_RT(irqs_disabled());
 		netpoll_send_udp(&np, msg, frag);
+		WARN_ON_RT(irqs_disabled());
 		msg += frag;
 		left -= frag;
 	}
 
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 }
 
 static struct console netconsole = {
diff -urN ./linux-2.6.18.1/drivers/net/plip.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/plip.c
--- ./linux-2.6.18.1/drivers/net/plip.c	2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/plip.c	2007-05-19 23:58:35.000000000 +0900
@@ -227,7 +227,10 @@
 					  struct hh_cache *hh);
 	spinlock_t lock;
 	atomic_t kill_timer;
-	struct semaphore killed_timer_sem;
+	/*
+	 * PREEMPT_RT: this isn't a mutex, it should be struct completion.
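The struct semaphore to struct compat_semaphore substitution seen here recurs through the ieee1394, DVB, 3c527, hamradio, PPP and PCI-hotplug hunks. On PREEMPT_RT a plain semaphore acquires mutex semantics (single owner, priority inheritance), so a semaphore that is up()'d from a different context than the one that down()s it, i.e. completion-style signalling, must remain a true counting semaphore. A schematic of the pattern being preserved, with hypothetical names modelled on the mkiss/6pack refcount teardown earlier in this patch:

	struct foo_channel {
		atomic_t		refcnt;
		struct compat_semaphore	dead_sem; /* signalled, never "locked" */
	};

	static void foo_put(struct foo_channel *ch)
	{
		/* whichever context drops the last reference signals... */
		if (atomic_dec_and_test(&ch->refcnt))
			up(&ch->dead_sem);
	}

	static void foo_close(struct foo_channel *ch)
	{
		/* ...and the closing task, if not last, waits here */
		if (!atomic_dec_and_test(&ch->refcnt))
			down(&ch->dead_sem);
	}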
+ */ + struct compat_semaphore killed_timer_sem; }; static inline void enable_parport_interrupts (struct net_device *dev) diff -urN ./linux-2.6.18.1/drivers/net/ppp_async.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/ppp_async.c --- ./linux-2.6.18.1/drivers/net/ppp_async.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/ppp_async.c 2007-05-19 23:58:35.000000000 +0900 @@ -67,7 +67,7 @@ struct tasklet_struct tsk; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; struct ppp_channel chan; /* interface to generic ppp layer */ unsigned char obuf[OBUFSIZE]; }; diff -urN ./linux-2.6.18.1/drivers/net/ppp_synctty.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/ppp_synctty.c --- ./linux-2.6.18.1/drivers/net/ppp_synctty.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/ppp_synctty.c 2007-05-19 23:58:35.000000000 +0900 @@ -70,7 +70,7 @@ struct tasklet_struct tsk; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; struct ppp_channel chan; /* interface to generic ppp layer */ }; diff -urN ./linux-2.6.18.1/drivers/net/tulip/tulip_core.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/tulip/tulip_core.c --- ./linux-2.6.18.1/drivers/net/tulip/tulip_core.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/tulip/tulip_core.c 2007-05-19 23:58:35.000000000 +0900 @@ -1804,6 +1804,7 @@ pci_iounmap(pdev, tp->base_addr); free_netdev (dev); pci_release_regions (pdev); + pci_disable_device (pdev); pci_set_drvdata (pdev, NULL); /* pci_power_off (pdev, -1); */ diff -urN ./linux-2.6.18.1/drivers/net/wireless/ipw2100.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/wireless/ipw2100.c --- ./linux-2.6.18.1/drivers/net/wireless/ipw2100.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/net/wireless/ipw2100.c 2007-05-19 23:58:35.000000000 +0900 @@ -163,6 +163,7 @@ #include #include #include +#include #include "ipw2100.h" @@ -1697,6 +1698,11 @@ return 0; } + /* the ipw2100 hardware really doesn't want power management delays + * longer than 175usec + */ + modify_acceptable_latency("ipw2100", 175); + /* If the interrupt is enabled, turn it off... 
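The acceptable-latency calls added to ipw2100 in this and the following hunk come from the power-management latency notifier carried by this patch set (the interface that later appeared upstream as include/linux/latency.h). Taken together, the driver's use of it over its lifecycle is, with names exactly as the hunks spell them:

	/* module init: register with no constraint */
	set_acceptable_latency("ipw2100", INFINITE_LATENCY);
	/* adapter up: cap PM wakeup latency at 175 usec */
	modify_acceptable_latency("ipw2100", 175);
	/* adapter down: relax the constraint again */
	modify_acceptable_latency("ipw2100", INFINITE_LATENCY);
	/* module exit: unregister */
	remove_acceptable_latency("ipw2100");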
*/ spin_lock_irqsave(&priv->low_lock, flags); ipw2100_disable_interrupts(priv); @@ -1849,6 +1855,8 @@ ipw2100_disable_interrupts(priv); spin_unlock_irqrestore(&priv->low_lock, flags); + modify_acceptable_latency("ipw2100", INFINITE_LATENCY); + #ifdef ACPI_CSTATE_LIMIT_DEFINED if (priv->config & CFG_C3_DISABLED) { IPW_DEBUG_INFO(": Resetting C3 transitions.\n"); @@ -6533,6 +6541,7 @@ ret = pci_module_init(&ipw2100_pci_driver); + set_acceptable_latency("ipw2100", INFINITE_LATENCY); #ifdef CONFIG_IPW2100_DEBUG ipw2100_debug_level = debug; driver_create_file(&ipw2100_pci_driver.driver, @@ -6553,6 +6562,7 @@ &driver_attr_debug_level); #endif pci_unregister_driver(&ipw2100_pci_driver); + remove_acceptable_latency("ipw2100"); } module_init(ipw2100_init); diff -urN ./linux-2.6.18.1/drivers/oprofile/oprofilefs.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/oprofile/oprofilefs.c --- ./linux-2.6.18.1/drivers/oprofile/oprofilefs.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/oprofile/oprofilefs.c 2007-05-19 23:58:35.000000000 +0900 @@ -21,7 +21,7 @@ #define OPROFILEFS_MAGIC 0x6f70726f -DEFINE_SPINLOCK(oprofilefs_lock); +DEFINE_RAW_SPINLOCK(oprofilefs_lock); static struct inode * oprofilefs_get_inode(struct super_block * sb, int mode) { diff -urN ./linux-2.6.18.1/drivers/pci/Makefile linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/Makefile --- ./linux-2.6.18.1/drivers/pci/Makefile 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/Makefile 2007-05-19 23:58:35.000000000 +0900 @@ -27,7 +27,8 @@ obj-$(CONFIG_MIPS) += setup-bus.o setup-irq.o obj-$(CONFIG_X86_VISWS) += setup-irq.o -msiobj-y := msi.o msi-apic.o +msiobj-y := msi.o +msiobj-$(CONFIG_IA64) += msi-apic.o msiobj-$(CONFIG_IA64_GENERIC) += msi-altix.o msiobj-$(CONFIG_IA64_SGI_SN2) += msi-altix.o obj-$(CONFIG_PCI_MSI) += $(msiobj-y) diff -urN ./linux-2.6.18.1/drivers/pci/hotplug/cpci_hotplug_core.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/hotplug/cpci_hotplug_core.c --- ./linux-2.6.18.1/drivers/pci/hotplug/cpci_hotplug_core.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/hotplug/cpci_hotplug_core.c 2007-05-19 23:58:35.000000000 +0900 @@ -59,8 +59,8 @@ static atomic_t extracting; int cpci_debug; static struct cpci_hp_controller *controller; -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore thread_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* mutex for process loop (up if something to process) */ +static struct compat_semaphore thread_exit; /* guard ensure thread has exited before calling it quits */ static int thread_finished = 1; static int enable_slot(struct hotplug_slot *slot); diff -urN ./linux-2.6.18.1/drivers/pci/hotplug/cpqphp_ctrl.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/hotplug/cpqphp_ctrl.c --- ./linux-2.6.18.1/drivers/pci/hotplug/cpqphp_ctrl.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/hotplug/cpqphp_ctrl.c 2007-05-19 23:58:35.000000000 +0900 @@ -44,8 +44,8 @@ u8 behind_bridge, struct resource_lists *resources); static void interrupt_event_handler(struct controller *ctrl); -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore event_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* 
mutex for process loop (up if something to process) */ +static struct compat_semaphore event_exit; /* guard ensure thread has exited before calling it quits */ static int event_finished; static unsigned long pushbutton_pending; /* = 0 */ diff -urN ./linux-2.6.18.1/drivers/pci/hotplug/ibmphp_hpc.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/hotplug/ibmphp_hpc.c --- ./linux-2.6.18.1/drivers/pci/hotplug/ibmphp_hpc.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/hotplug/ibmphp_hpc.c 2007-05-19 23:58:35.000000000 +0900 @@ -106,7 +106,7 @@ static struct mutex sem_hpcaccess; // lock access to HPC static struct semaphore semOperations; // lock all operations and // access to data structures -static struct semaphore sem_exit; // make sure polling thread goes away +static struct compat_semaphore sem_exit; // make sure polling thread goes away //---------------------------------------------------------------------------- // local function prototypes //---------------------------------------------------------------------------- diff -urN ./linux-2.6.18.1/drivers/pci/hotplug/pciehp_ctrl.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/hotplug/pciehp_ctrl.c --- ./linux-2.6.18.1/drivers/pci/hotplug/pciehp_ctrl.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/hotplug/pciehp_ctrl.c 2007-05-19 23:58:35.000000000 +0900 @@ -37,8 +37,8 @@ static void interrupt_event_handler(struct controller *ctrl); -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore event_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* mutex for process loop (up if something to process) */ +static struct compat_semaphore event_exit; /* guard ensure thread has exited before calling it quits */ static int event_finished; static unsigned long pushbutton_pending; /* = 0 */ static unsigned long surprise_rm_pending; /* = 0 */ diff -urN ./linux-2.6.18.1/drivers/pci/msi-altix.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/msi-altix.c --- ./linux-2.6.18.1/drivers/pci/msi-altix.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/msi-altix.c 2007-05-19 23:58:35.000000000 +0900 @@ -26,7 +26,7 @@ static struct sn_msi_info *sn_msi_info; static void -sn_msi_teardown(unsigned int vector) +sn_msi_teardown(unsigned int irq) { nasid_t nasid; int widget; @@ -36,7 +36,7 @@ struct pcibus_bussoft *bussoft; struct sn_pcibus_provider *provider; - sn_irq_info = sn_msi_info[vector].sn_irq_info; + sn_irq_info = sn_msi_info[irq].sn_irq_info; if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0) return; @@ -45,9 +45,9 @@ provider = SN_PCIDEV_BUSPROVIDER(pdev); (*provider->dma_unmap)(pdev, - sn_msi_info[vector].pci_addr, + sn_msi_info[irq].pci_addr, PCI_DMA_FROMDEVICE); - sn_msi_info[vector].pci_addr = 0; + sn_msi_info[irq].pci_addr = 0; bussoft = SN_PCIDEV_BUSSOFT(pdev); nasid = NASID_GET(bussoft->bs_base); @@ -56,14 +56,13 @@ SWIN_WIDGETNUM(bussoft->bs_base); sn_intr_free(nasid, widget, sn_irq_info); - sn_msi_info[vector].sn_irq_info = NULL; + sn_msi_info[irq].sn_irq_info = NULL; return; } int -sn_msi_setup(struct pci_dev *pdev, unsigned int vector, - u32 *addr_hi, u32 *addr_lo, u32 *data) +sn_msi_setup(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) { int widget; int status; @@ -93,7 +92,7 @@ if (! 
sn_irq_info) return -ENOMEM; - status = sn_intr_alloc(nasid, widget, sn_irq_info, vector, -1, -1); + status = sn_intr_alloc(nasid, widget, sn_irq_info, irq, -1, -1); if (status) { kfree(sn_irq_info); return -ENOMEM; @@ -119,28 +118,27 @@ return -ENOMEM; } - sn_msi_info[vector].sn_irq_info = sn_irq_info; - sn_msi_info[vector].pci_addr = bus_addr; + sn_msi_info[irq].sn_irq_info = sn_irq_info; + sn_msi_info[irq].pci_addr = bus_addr; - *addr_hi = (u32)(bus_addr >> 32); - *addr_lo = (u32)(bus_addr & 0x00000000ffffffff); + msg->address_hi = (u32)(bus_addr >> 32); + msg->address_lo = (u32)(bus_addr & 0x00000000ffffffff); /* * In the SN platform, bit 16 is a "send vector" bit which * must be present in order to move the vector through the system. */ - *data = 0x100 + (unsigned int)vector; + msg->data = 0x100 + irq; #ifdef CONFIG_SMP - set_irq_affinity_info((vector & 0xff), sn_irq_info->irq_cpuid, 0); + set_irq_affinity_info(irq, sn_irq_info->irq_cpuid, 0); #endif return 0; } static void -sn_msi_target(unsigned int vector, unsigned int cpu, - u32 *addr_hi, u32 *addr_lo) +sn_msi_target(unsigned int irq, cpumask_t cpu_mask, struct msi_msg *msg) { int slice; nasid_t nasid; @@ -150,8 +148,10 @@ struct sn_irq_info *sn_irq_info; struct sn_irq_info *new_irq_info; struct sn_pcibus_provider *provider; + unsigned int cpu; - sn_irq_info = sn_msi_info[vector].sn_irq_info; + cpu = first_cpu(cpu_mask); + sn_irq_info = sn_msi_info[irq].sn_irq_info; if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0) return; @@ -163,15 +163,15 @@ pdev = sn_pdev->pdi_linux_pcidev; provider = SN_PCIDEV_BUSPROVIDER(pdev); - bus_addr = (u64)(*addr_hi) << 32 | (u64)(*addr_lo); + bus_addr = (u64)(msg->address_hi) << 32 | (u64)(msg->address_lo); (*provider->dma_unmap)(pdev, bus_addr, PCI_DMA_FROMDEVICE); - sn_msi_info[vector].pci_addr = 0; + sn_msi_info[irq].pci_addr = 0; nasid = cpuid_to_nasid(cpu); slice = cpuid_to_slice(cpu); new_irq_info = sn_retarget_vector(sn_irq_info, nasid, slice); - sn_msi_info[vector].sn_irq_info = new_irq_info; + sn_msi_info[irq].sn_irq_info = new_irq_info; if (new_irq_info == NULL) return; @@ -184,12 +184,13 @@ sizeof(new_irq_info->irq_xtalkaddr), SN_DMA_MSI|SN_DMA_ADDR_XIO); - sn_msi_info[vector].pci_addr = bus_addr; - *addr_hi = (u32)(bus_addr >> 32); - *addr_lo = (u32)(bus_addr & 0x00000000ffffffff); + sn_msi_info[irq].pci_addr = bus_addr; + msg->address_hi = (u32)(bus_addr >> 32); + msg->address_lo = (u32)(bus_addr & 0x00000000ffffffff); } struct msi_ops sn_msi_ops = { + .needs_64bit_address = 1, .setup = sn_msi_setup, .teardown = sn_msi_teardown, #ifdef CONFIG_SMP @@ -201,7 +202,7 @@ sn_msi_init(void) { sn_msi_info = - kzalloc(sizeof(struct sn_msi_info) * NR_VECTORS, GFP_KERNEL); + kzalloc(sizeof(struct sn_msi_info) * NR_IRQS, GFP_KERNEL); if (! 
sn_msi_info) return -ENOMEM; diff -urN ./linux-2.6.18.1/drivers/pci/msi-apic.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/msi-apic.c --- ./linux-2.6.18.1/drivers/pci/msi-apic.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/msi-apic.c 2007-05-19 23:58:35.000000000 +0900 @@ -46,37 +46,36 @@ static void -msi_target_apic(unsigned int vector, - unsigned int dest_cpu, - u32 *address_hi, /* in/out */ - u32 *address_lo) /* in/out */ +msi_target_apic(unsigned int irq, cpumask_t cpu_mask, struct msi_msg *msg) { - u32 addr = *address_lo; + u32 addr = msg->address_lo; addr &= MSI_ADDR_DESTID_MASK; - addr |= MSI_ADDR_DESTID_CPU(cpu_physical_id(dest_cpu)); + addr |= MSI_ADDR_DESTID_CPU(cpu_physical_id(first_cpu(cpu_mask))); - *address_lo = addr; + msg->address_lo = addr; } static int msi_setup_apic(struct pci_dev *pdev, /* unused in generic */ - unsigned int vector, - u32 *address_hi, - u32 *address_lo, - u32 *data) + unsigned int irq, + struct msi_msg *msg) { unsigned long dest_phys_id; + unsigned int vector; dest_phys_id = cpu_physical_id(first_cpu(cpu_online_map)); + vector = irq; - *address_hi = 0; - *address_lo = MSI_ADDR_HEADER | - MSI_ADDR_DESTMODE_PHYS | - MSI_ADDR_REDIRECTION_CPU | - MSI_ADDR_DESTID_CPU(dest_phys_id); + msg->address_hi = 0; + msg->address_lo = + MSI_ADDR_HEADER | + MSI_ADDR_DESTMODE_PHYS | + MSI_ADDR_REDIRECTION_CPU | + MSI_ADDR_DESTID_CPU(dest_phys_id); - *data = MSI_DATA_TRIGGER_EDGE | + msg->data = + MSI_DATA_TRIGGER_EDGE | MSI_DATA_LEVEL_ASSERT | MSI_DATA_DELIVERY_FIXED | MSI_DATA_VECTOR(vector); @@ -85,7 +84,7 @@ } static void -msi_teardown_apic(unsigned int vector) +msi_teardown_apic(unsigned int irq) { return; /* no-op */ } @@ -95,6 +94,7 @@ */ struct msi_ops msi_apic_ops = { + .needs_64bit_address = 0, .setup = msi_setup_apic, .teardown = msi_teardown_apic, .target = msi_target_apic, diff -urN ./linux-2.6.18.1/drivers/pci/msi.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/msi.c --- ./linux-2.6.18.1/drivers/pci/msi.c 2006-10-14 12:34:03.000000000 +0900 +++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/msi.c 2007-05-19 23:58:35.000000000 +0900 @@ -6,6 +6,7 @@ * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com) */ +#include #include #include #include @@ -22,19 +23,11 @@ #include "pci.h" #include "msi.h" -static DEFINE_SPINLOCK(msi_lock); +static DEFINE_RAW_SPINLOCK(msi_lock); static struct msi_desc* msi_desc[NR_IRQS] = { [0 ... NR_IRQS-1] = NULL }; static kmem_cache_t* msi_cachep; static int pci_msi_enable = 1; -static int last_alloc_vector; -static int nr_released_vectors; -static int nr_reserved_vectors = NR_HP_RESERVED_VECTORS; -static int nr_msix_devices; - -#ifndef CONFIG_X86_IO_APIC -int vector_irq[NR_VECTORS] = { [0 ... 
NR_VECTORS - 1] = -1}; -#endif static struct msi_ops *msi_ops; @@ -61,11 +54,11 @@ return 0; } -static void msi_set_mask_bit(unsigned int vector, int flag) +static void msi_set_mask_bit(unsigned int irq, int flag) { struct msi_desc *entry; - entry = (struct msi_desc *)msi_desc[vector]; + entry = msi_desc[irq]; if (!entry || !entry->dev || !entry->mask_base) return; switch (entry->msi_attrib.type) { @@ -93,84 +86,119 @@ } } -#ifdef CONFIG_SMP -static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask) +static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) { - struct msi_desc *entry; - u32 address_hi, address_lo; - unsigned int irq = vector; - unsigned int dest_cpu = first_cpu(cpu_mask); + switch(entry->msi_attrib.type) { + case PCI_CAP_ID_MSI: + { + struct pci_dev *dev = entry->dev; + int pos = entry->msi_attrib.pos; + u16 data; + + pci_read_config_dword(dev, msi_lower_address_reg(pos), + &msg->address_lo); + if (entry->msi_attrib.is_64) { + pci_read_config_dword(dev, msi_upper_address_reg(pos), + &msg->address_hi); + pci_read_config_word(dev, msi_data_reg(pos, 1), &data); + } else { + msg->address_hi = 0; + pci_read_config_word(dev, msi_data_reg(pos, 1), &data); + } + msg->data = data; + break; + } + case PCI_CAP_ID_MSIX: + { + void __iomem *base; + base = entry->mask_base + + entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; - entry = (struct msi_desc *)msi_desc[vector]; - if (!entry || !entry->dev) - return; + msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); + msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET); + break; + } + default: + BUG(); + } +} +static void write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) +{ switch (entry->msi_attrib.type) { case PCI_CAP_ID_MSI: { - int pos = pci_find_capability(entry->dev, PCI_CAP_ID_MSI); + struct pci_dev *dev = entry->dev; + int pos = entry->msi_attrib.pos; - if (!pos) - return; - - pci_read_config_dword(entry->dev, msi_upper_address_reg(pos), - &address_hi); - pci_read_config_dword(entry->dev, msi_lower_address_reg(pos), - &address_lo); - - msi_ops->target(vector, dest_cpu, &address_hi, &address_lo); - - pci_write_config_dword(entry->dev, msi_upper_address_reg(pos), - address_hi); - pci_write_config_dword(entry->dev, msi_lower_address_reg(pos), - address_lo); - set_native_irq_info(irq, cpu_mask); + pci_write_config_dword(dev, msi_lower_address_reg(pos), + msg->address_lo); + if (entry->msi_attrib.is_64) { + pci_write_config_dword(dev, msi_upper_address_reg(pos), + msg->address_hi); + pci_write_config_word(dev, msi_data_reg(pos, 1), + msg->data); + } else { + pci_write_config_word(dev, msi_data_reg(pos, 0), + msg->data); + } break; } case PCI_CAP_ID_MSIX: { - int offset_hi = - entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET; - int offset_lo = - entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET; - - address_hi = readl(entry->mask_base + offset_hi); - address_lo = readl(entry->mask_base + offset_lo); - - msi_ops->target(vector, dest_cpu, &address_hi, &address_lo); - - writel(address_hi, entry->mask_base + offset_hi); - writel(address_lo, entry->mask_base + offset_lo); - set_native_irq_info(irq, cpu_mask); + void __iomem *base; + base = entry->mask_base + + entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + + writel(msg->address_lo, + base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + writel(msg->address_hi, + base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); + 
writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET); break; } default: - break; + BUG(); } } + +#ifdef CONFIG_SMP +static void set_msi_affinity(unsigned int irq, cpumask_t cpu_mask) +{ + struct msi_desc *entry; + struct msi_msg msg; + + entry = msi_desc[irq]; + if (!entry || !entry->dev) + return; + + read_msi_msg(entry, &msg); + msi_ops->target(irq, cpu_mask, &msg); + write_msi_msg(entry, &msg); + set_native_irq_info(irq, cpu_mask); +} #else #define set_msi_affinity NULL #endif /* CONFIG_SMP */ -static void mask_MSI_irq(unsigned int vector) +static void mask_MSI_irq(unsigned int irq) { - msi_set_mask_bit(vector, 1); + msi_set_mask_bit(irq, 1); } -static void unmask_MSI_irq(unsigned int vector) +static void unmask_MSI_irq(unsigned int irq) { - msi_set_mask_bit(vector, 0); + msi_set_mask_bit(irq, 0); } -static unsigned int startup_msi_irq_wo_maskbit(unsigned int vector) +static unsigned int startup_msi_irq_wo_maskbit(unsigned int irq) { struct msi_desc *entry; unsigned long flags; spin_lock_irqsave(&msi_lock, flags); - entry = msi_desc[vector]; + entry = msi_desc[irq]; if (!entry || !entry->dev) { spin_unlock_irqrestore(&msi_lock, flags); return 0; @@ -181,39 +209,39 @@ return 0; /* never anything pending */ } -static unsigned int startup_msi_irq_w_maskbit(unsigned int vector) +static unsigned int startup_msi_irq_w_maskbit(unsigned int irq) { - startup_msi_irq_wo_maskbit(vector); - unmask_MSI_irq(vector); + startup_msi_irq_wo_maskbit(irq); + unmask_MSI_irq(irq); return 0; /* never anything pending */ } -static void shutdown_msi_irq(unsigned int vector) +static void shutdown_msi_irq(unsigned int irq) { struct msi_desc *entry; unsigned long flags; spin_lock_irqsave(&msi_lock, flags); - entry = msi_desc[vector]; + entry = msi_desc[irq]; if (entry && entry->dev) entry->msi_attrib.state = 0; /* Mark it not active */ spin_unlock_irqrestore(&msi_lock, flags); } -static void end_msi_irq_wo_maskbit(unsigned int vector) +static void end_msi_irq_wo_maskbit(unsigned int irq) { - move_native_irq(vector); + move_native_irq(irq); ack_APIC_irq(); } -static void end_msi_irq_w_maskbit(unsigned int vector) +static void end_msi_irq_w_maskbit(unsigned int irq) { - move_native_irq(vector); - unmask_MSI_irq(vector); + move_native_irq(irq); + unmask_MSI_irq(irq); ack_APIC_irq(); } -static void do_nothing(unsigned int vector) +static void do_nothing(unsigned int irq) { } @@ -264,86 +292,7 @@ .set_affinity = set_msi_affinity }; -static int msi_free_vector(struct pci_dev* dev, int vector, int reassign); -static int assign_msi_vector(void) -{ - static int new_vector_avail = 1; - int vector; - unsigned long flags; - - /* - * msi_lock is provided to ensure that successful allocation of MSI - * vector is assigned unique among drivers. - */ - spin_lock_irqsave(&msi_lock, flags); - - if (!new_vector_avail) { - int free_vector = 0; - - /* - * vector_irq[] = -1 indicates that this specific vector is: - * - assigned for MSI (since MSI have no associated IRQ) or - * - assigned for legacy if less than 16, or - * - having no corresponding 1:1 vector-to-IOxAPIC IRQ mapping - * vector_irq[] = 0 indicates that this vector, previously - * assigned for MSI, is freed by hotplug removed operations. - * This vector will be reused for any subsequent hotplug added - * operations. - * vector_irq[] > 0 indicates that this vector is assigned for - * IOxAPIC IRQs. This vector and its value provides a 1-to-1 - * vector-to-IOxAPIC IRQ mapping. 

-static void mask_MSI_irq(unsigned int vector)
+static void mask_MSI_irq(unsigned int irq)
 {
-	msi_set_mask_bit(vector, 1);
+	msi_set_mask_bit(irq, 1);
 }

-static void unmask_MSI_irq(unsigned int vector)
+static void unmask_MSI_irq(unsigned int irq)
 {
-	msi_set_mask_bit(vector, 0);
+	msi_set_mask_bit(irq, 0);
 }

-static unsigned int startup_msi_irq_wo_maskbit(unsigned int vector)
+static unsigned int startup_msi_irq_wo_maskbit(unsigned int irq)
 {
 	struct msi_desc *entry;
 	unsigned long flags;

 	spin_lock_irqsave(&msi_lock, flags);
-	entry = msi_desc[vector];
+	entry = msi_desc[irq];
 	if (!entry || !entry->dev) {
 		spin_unlock_irqrestore(&msi_lock, flags);
 		return 0;
@@ -181,39 +209,39 @@
 	return 0;	/* never anything pending */
 }

-static unsigned int startup_msi_irq_w_maskbit(unsigned int vector)
+static unsigned int startup_msi_irq_w_maskbit(unsigned int irq)
 {
-	startup_msi_irq_wo_maskbit(vector);
-	unmask_MSI_irq(vector);
+	startup_msi_irq_wo_maskbit(irq);
+	unmask_MSI_irq(irq);
 	return 0;	/* never anything pending */
 }

-static void shutdown_msi_irq(unsigned int vector)
+static void shutdown_msi_irq(unsigned int irq)
 {
 	struct msi_desc *entry;
 	unsigned long flags;

 	spin_lock_irqsave(&msi_lock, flags);
-	entry = msi_desc[vector];
+	entry = msi_desc[irq];
 	if (entry && entry->dev)
 		entry->msi_attrib.state = 0;	/* Mark it not active */
 	spin_unlock_irqrestore(&msi_lock, flags);
 }

-static void end_msi_irq_wo_maskbit(unsigned int vector)
+static void end_msi_irq_wo_maskbit(unsigned int irq)
 {
-	move_native_irq(vector);
+	move_native_irq(irq);
 	ack_APIC_irq();
 }

-static void end_msi_irq_w_maskbit(unsigned int vector)
+static void end_msi_irq_w_maskbit(unsigned int irq)
 {
-	move_native_irq(vector);
-	unmask_MSI_irq(vector);
+	move_native_irq(irq);
+	unmask_MSI_irq(irq);
 	ack_APIC_irq();
 }

-static void do_nothing(unsigned int vector)
+static void do_nothing(unsigned int irq)
 {
 }
@@ -264,86 +292,7 @@
 	.set_affinity	= set_msi_affinity
 };

-static int msi_free_vector(struct pci_dev* dev, int vector, int reassign);
-static int assign_msi_vector(void)
-{
-	static int new_vector_avail = 1;
-	int vector;
-	unsigned long flags;
-
-	/*
-	 * msi_lock is provided to ensure that successful allocation of MSI
-	 * vector is assigned unique among drivers.
-	 */
-	spin_lock_irqsave(&msi_lock, flags);
-
-	if (!new_vector_avail) {
-		int free_vector = 0;
-
-		/*
-		 * vector_irq[] = -1 indicates that this specific vector is:
-		 * - assigned for MSI (since MSI have no associated IRQ) or
-		 * - assigned for legacy if less than 16, or
-		 * - having no corresponding 1:1 vector-to-IOxAPIC IRQ mapping
-		 * vector_irq[] = 0 indicates that this vector, previously
-		 * assigned for MSI, is freed by hotplug removed operations.
-		 * This vector will be reused for any subsequent hotplug added
-		 * operations.
-		 * vector_irq[] > 0 indicates that this vector is assigned for
-		 * IOxAPIC IRQs. This vector and its value provides a 1-to-1
-		 * vector-to-IOxAPIC IRQ mapping.
-		 */
-		for (vector = FIRST_DEVICE_VECTOR; vector < NR_IRQS; vector++) {
-			if (vector_irq[vector] != 0)
-				continue;
-			free_vector = vector;
-			if (!msi_desc[vector])
-				break;
-			else
-				continue;
-		}
-		if (!free_vector) {
-			spin_unlock_irqrestore(&msi_lock, flags);
-			return -EBUSY;
-		}
-		vector_irq[free_vector] = -1;
-		nr_released_vectors--;
-		spin_unlock_irqrestore(&msi_lock, flags);
-		if (msi_desc[free_vector] != NULL) {
-			struct pci_dev *dev;
-			int tail;
-
-			/* free all linked vectors before re-assign */
-			do {
-				spin_lock_irqsave(&msi_lock, flags);
-				dev = msi_desc[free_vector]->dev;
-				tail = msi_desc[free_vector]->link.tail;
-				spin_unlock_irqrestore(&msi_lock, flags);
-				msi_free_vector(dev, tail, 1);
-			} while (free_vector != tail);
-		}
-
-		return free_vector;
-	}
-	vector = assign_irq_vector(AUTO_ASSIGN);
-	last_alloc_vector = vector;
-	if (vector == LAST_DEVICE_VECTOR)
-		new_vector_avail = 0;
-
-	spin_unlock_irqrestore(&msi_lock, flags);
-	return vector;
-}
-
-static int get_new_vector(void)
-{
-	int vector = assign_msi_vector();
-
-	if (vector > 0)
-		set_intr_gate(vector, interrupt[vector]);
-
-	return vector;
-}
-
+static int msi_free_irq(struct pci_dev* dev, int irq);

 static int msi_init(void)
 {
 	static int status = -ENOMEM;
@@ -367,13 +316,13 @@
 	}

 	if (! msi_ops) {
+		pci_msi_enable = 0;
 		printk(KERN_WARNING "PCI: MSI ops not registered. MSI disabled.\n");
 		status = -EINVAL;
 		return status;
 	}

-	last_alloc_vector = assign_irq_vector(AUTO_ASSIGN);
 	status = msi_cache_init();
 	if (status < 0) {
 		pci_msi_enable = 0;
@@ -381,23 +330,9 @@
 		return status;
 	}

-	if (last_alloc_vector < 0) {
-		pci_msi_enable = 0;
-		printk(KERN_WARNING "PCI: No interrupt vectors available for MSI\n");
-		status = -EBUSY;
-		return status;
-	}
-	vector_irq[last_alloc_vector] = 0;
-	nr_released_vectors++;
-
 	return status;
 }

-static int get_msi_vector(struct pci_dev *dev)
-{
-	return get_new_vector();
-}
-
 static struct msi_desc* alloc_msi_entry(void)
 {
 	struct msi_desc *entry;
@@ -413,29 +348,45 @@
 	return entry;
 }

-static void attach_msi_entry(struct msi_desc *entry, int vector)
+static void attach_msi_entry(struct msi_desc *entry, int irq)
 {
 	unsigned long flags;

 	spin_lock_irqsave(&msi_lock, flags);
-	msi_desc[vector] = entry;
+	msi_desc[irq] = entry;
 	spin_unlock_irqrestore(&msi_lock, flags);
 }

-static void irq_handler_init(int cap_id, int pos, int mask)
+static int create_msi_irq(struct hw_interrupt_type *handler)
 {
-	unsigned long flags;
+	struct msi_desc *entry;
+	int irq;
+
+	entry = alloc_msi_entry();
+	if (!entry)
+		return -ENOMEM;

-	spin_lock_irqsave(&irq_desc[pos].lock, flags);
-	if (cap_id == PCI_CAP_ID_MSIX)
-		irq_desc[pos].chip = &msix_irq_type;
-	else {
-		if (!mask)
-			irq_desc[pos].chip = &msi_irq_wo_maskbit_type;
-		else
-			irq_desc[pos].chip = &msi_irq_w_maskbit_type;
+	irq = create_irq();
+	if (irq < 0) {
+		kmem_cache_free(msi_cachep, entry);
+		return -EBUSY;
 	}
-	spin_unlock_irqrestore(&irq_desc[pos].lock, flags);
+
+	set_irq_chip(irq, handler);
+	set_irq_data(irq, entry);
+
+	return irq;
+}
+
+static void destroy_msi_irq(unsigned int irq)
+{
+	struct msi_desc *entry;
+
+	entry = get_irq_data(irq);
+	set_irq_chip(irq, NULL);
+	set_irq_data(irq, NULL);
+	destroy_irq(irq);
+	kmem_cache_free(msi_cachep, entry);
 }

 static void enable_msi_mode(struct pci_dev *dev, int pos, int type)
@@ -480,21 +431,21 @@
 	}
 }

-static int msi_lookup_vector(struct pci_dev *dev, int type)
+static int msi_lookup_irq(struct pci_dev *dev, int type)
 {
-	int vector;
+	int irq;
 	unsigned long flags;

 	spin_lock_irqsave(&msi_lock, flags);
-	for (vector = FIRST_DEVICE_VECTOR; vector < NR_IRQS; vector++) {
-		if (!msi_desc[vector] || msi_desc[vector]->dev != dev ||
-			msi_desc[vector]->msi_attrib.type != type ||
-			msi_desc[vector]->msi_attrib.default_vector != dev->irq)
+	for (irq = 0; irq < NR_IRQS; irq++) {
+		if (!msi_desc[irq] || msi_desc[irq]->dev != dev ||
+			msi_desc[irq]->msi_attrib.type != type ||
+			msi_desc[irq]->msi_attrib.default_irq != dev->irq)
 			continue;
 		spin_unlock_irqrestore(&msi_lock, flags);
-		/* This pre-assigned MSI vector for this device
-		   already exits. Override dev->irq with this vector */
-		dev->irq = vector;
+		/* This pre-assigned MSI irq for this device
+		   already exists. Override dev->irq with this irq */
+		dev->irq = irq;
 		return 0;
 	}
 	spin_unlock_irqrestore(&msi_lock, flags);
@@ -506,11 +457,6 @@
 {
 	if (!dev)
 		return;
-
-	if (pci_find_capability(dev, PCI_CAP_ID_MSIX) > 0)
-		nr_msix_devices++;
-	else if (pci_find_capability(dev, PCI_CAP_ID_MSI) > 0)
-		nr_reserved_vectors++;
 }

 #ifdef CONFIG_PM
@@ -584,7 +530,7 @@
 {
 	int pos;
 	int temp;
-	int vector, head, tail = 0;
+	int irq, head, tail = 0;
 	u16 control;
 	struct pci_cap_saved_state *save_state;
@@ -606,33 +552,20 @@
 	/* save the table */
 	temp = dev->irq;
-	if (msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) {
+	if (msi_lookup_irq(dev, PCI_CAP_ID_MSIX)) {
 		kfree(save_state);
 		return -EINVAL;
 	}

-	vector = head = dev->irq;
+	irq = head = dev->irq;
 	while (head != tail) {
-		int j;
-		void __iomem *base;
 		struct msi_desc *entry;

-		entry = msi_desc[vector];
-		base = entry->mask_base;
-		j = entry->msi_attrib.entry_nr;
-
-		entry->address_lo_save =
-			readl(base + j * PCI_MSIX_ENTRY_SIZE +
-				PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
-		entry->address_hi_save =
-			readl(base + j * PCI_MSIX_ENTRY_SIZE +
-				PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
-		entry->data_save =
-			readl(base + j * PCI_MSIX_ENTRY_SIZE +
-				PCI_MSIX_ENTRY_DATA_OFFSET);
+		entry = msi_desc[irq];
+		read_msi_msg(entry, &entry->msg_save);

-		tail = msi_desc[vector]->link.tail;
-		vector = tail;
+		tail = msi_desc[irq]->link.tail;
+		irq = tail;
 	}
 	dev->irq = temp;
@@ -645,9 +578,7 @@
 {
 	u16 save;
 	int pos;
-	int vector, head, tail = 0;
-	void __iomem *base;
-	int j;
+	int irq, head, tail = 0;
 	struct msi_desc *entry;
 	int temp;
 	struct pci_cap_saved_state *save_state;
@@ -665,26 +596,15 @@
 	/* route the table */
 	temp = dev->irq;
-	if (msi_lookup_vector(dev, PCI_CAP_ID_MSIX))
+	if (msi_lookup_irq(dev, PCI_CAP_ID_MSIX))
 		return;
-	vector = head = dev->irq;
+	irq = head = dev->irq;
 	while (head != tail) {
-		entry = msi_desc[vector];
-		base = entry->mask_base;
-		j = entry->msi_attrib.entry_nr;
-
-		writel(entry->address_lo_save,
-			base + j * PCI_MSIX_ENTRY_SIZE +
-			PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
-		writel(entry->address_hi_save,
-			base + j * PCI_MSIX_ENTRY_SIZE +
-			PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
-		writel(entry->data_save,
-			base + j * PCI_MSIX_ENTRY_SIZE +
-			PCI_MSIX_ENTRY_DATA_OFFSET);
+		entry = msi_desc[irq];
+		write_msi_msg(entry, &entry->msg_save);

-		tail = msi_desc[vector]->link.tail;
-		vector = tail;
+		tail = msi_desc[irq]->link.tail;
+		irq = tail;
 	}
 	dev->irq = temp;
@@ -696,29 +616,19 @@

 static int msi_register_init(struct pci_dev *dev, struct msi_desc *entry)
 {
 	int status;
-	u32 address_hi;
-	u32 address_lo;
-	u32 data;
-	int pos, vector = dev->irq;
+	struct msi_msg msg;
+	int pos;
 	u16 control;

-	pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
+	pos = entry->msi_attrib.pos;
 	pci_read_config_word(dev, msi_control_reg(pos), &control);

 	/* Configure MSI capability structure */
-	status = msi_ops->setup(dev, vector, &address_hi, &address_lo, &data);
+	status = msi_ops->setup(dev, dev->irq, &msg);
 	if (status < 0)
 		return status;

-	pci_write_config_dword(dev, msi_lower_address_reg(pos), address_lo);
-	if (is_64bit_address(control)) {
-		pci_write_config_dword(dev,
-			msi_upper_address_reg(pos), address_hi);
-		pci_write_config_word(dev,
-			msi_data_reg(pos, 1), data);
-	} else
-		pci_write_config_word(dev,
-			msi_data_reg(pos, 0), data);
+	write_msi_msg(entry, &msg);
 	if (entry->msi_attrib.maskbit) {
 		unsigned int maskbits, temp;
 		/* All MSIs are unmasked by default, Mask them all */
@@ -741,53 +651,54 @@
  * @dev: pointer to the pci_dev data structure of MSI device function
  *
  * Setup the MSI capability structure of device function with a single
- * MSI vector, regardless of device function is capable of handling
+ * MSI irq, regardless of whether the device function is capable of handling
  * multiple messages. A return of zero indicates the successful setup
- * of an entry zero with the new MSI vector or non-zero for otherwise.
+ * of an entry zero with the new MSI irq, or non-zero otherwise.
 **/
 static int msi_capability_init(struct pci_dev *dev)
 {
 	int status;
 	struct msi_desc *entry;
-	int pos, vector;
+	int pos, irq;
 	u16 control;
+	struct hw_interrupt_type *handler;

 	pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
 	pci_read_config_word(dev, msi_control_reg(pos), &control);
 	/* MSI Entry Initialization */
-	entry = alloc_msi_entry();
-	if (!entry)
-		return -ENOMEM;
-
-	vector = get_msi_vector(dev);
-	if (vector < 0) {
-		kmem_cache_free(msi_cachep, entry);
-		return -EBUSY;
-	}
-	entry->link.head = vector;
-	entry->link.tail = vector;
+	handler = &msi_irq_wo_maskbit_type;
+	if (is_mask_bit_support(control))
+		handler = &msi_irq_w_maskbit_type;
+
+	irq = create_msi_irq(handler);
+	if (irq < 0)
+		return irq;
+
+	entry = get_irq_data(irq);
+	entry->link.head = irq;
+	entry->link.tail = irq;
 	entry->msi_attrib.type = PCI_CAP_ID_MSI;
 	entry->msi_attrib.state = 0;	/* Mark it not active */
+	entry->msi_attrib.is_64 = is_64bit_address(control);
 	entry->msi_attrib.entry_nr = 0;
 	entry->msi_attrib.maskbit = is_mask_bit_support(control);
-	entry->msi_attrib.default_vector = dev->irq;	/* Save IOAPIC IRQ */
-	dev->irq = vector;
+	entry->msi_attrib.default_irq = dev->irq;	/* Save IOAPIC IRQ */
+	entry->msi_attrib.pos = pos;
+	dev->irq = irq;
 	entry->dev = dev;
 	if (is_mask_bit_support(control)) {
 		entry->mask_base = (void __iomem *)(long)msi_mask_bits_reg(pos,
 				is_64bit_address(control));
 	}
-	/* Replace with MSI handler */
-	irq_handler_init(PCI_CAP_ID_MSI, vector, entry->msi_attrib.maskbit);
 	/* Configure MSI capability structure */
 	status = msi_register_init(dev, entry);
 	if (status != 0) {
-		dev->irq = entry->msi_attrib.default_vector;
-		kmem_cache_free(msi_cachep, entry);
+		dev->irq = entry->msi_attrib.default_irq;
+		destroy_msi_irq(irq);
 		return status;
 	}
-	attach_msi_entry(entry, vector);
+	attach_msi_entry(entry, irq);
 	/* Set MSI enabled bits	 */
 	enable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
@@ -801,18 +712,16 @@
  * @nvec: number of @entries
  *
  * Setup the MSI-X capability structure of device function with a
- * single MSI-X vector. A return of zero indicates the successful setup of
- * requested MSI-X entries with allocated vectors or non-zero for otherwise.
+ * single MSI-X irq. A return of zero indicates the successful setup of
+ * requested MSI-X entries with allocated irqs, or non-zero otherwise.
 **/
 static int msix_capability_init(struct pci_dev *dev,
 				struct msix_entry *entries, int nvec)
 {
 	struct msi_desc *head = NULL, *tail = NULL, *entry = NULL;
-	u32 address_hi;
-	u32 address_lo;
-	u32 data;
+	struct msi_msg msg;
 	int status;
-	int vector, pos, i, j, nr_entries, temp = 0;
+	int irq, pos, i, j, nr_entries, temp = 0;
 	unsigned long phys_addr;
 	u32 table_offset;
 	u16 control;
@@ -834,65 +743,58 @@
 	/* MSI-X Table Initialization */
 	for (i = 0; i < nvec; i++) {
-		entry = alloc_msi_entry();
-		if (!entry)
-			break;
-		vector = get_msi_vector(dev);
-		if (vector < 0) {
-			kmem_cache_free(msi_cachep, entry);
+		irq = create_msi_irq(&msix_irq_type);
+		if (irq < 0)
 			break;
-		}
+		entry = get_irq_data(irq);
 		j = entries[i].entry;
-		entries[i].vector = vector;
+		entries[i].vector = irq;
 		entry->msi_attrib.type = PCI_CAP_ID_MSIX;
 		entry->msi_attrib.state = 0;	/* Mark it not active */
+		entry->msi_attrib.is_64 = 1;
 		entry->msi_attrib.entry_nr = j;
 		entry->msi_attrib.maskbit = 1;
-		entry->msi_attrib.default_vector = dev->irq;
+		entry->msi_attrib.default_irq = dev->irq;
+		entry->msi_attrib.pos = pos;
 		entry->dev = dev;
 		entry->mask_base = base;
 		if (!head) {
-			entry->link.head = vector;
-			entry->link.tail = vector;
+			entry->link.head = irq;
+			entry->link.tail = irq;
 			head = entry;
 		} else {
 			entry->link.head = temp;
 			entry->link.tail = tail->link.tail;
-			tail->link.tail = vector;
-			head->link.head = vector;
+			tail->link.tail = irq;
+			head->link.head = irq;
 		}
-		temp = vector;
+		temp = irq;
 		tail = entry;
-		/* Replace with MSI-X handler */
-		irq_handler_init(PCI_CAP_ID_MSIX, vector, 1);
 		/* Configure MSI-X capability structure */
-		status = msi_ops->setup(dev, vector,
-			&address_hi,
-			&address_lo,
-			&data);
-		if (status < 0)
+		status = msi_ops->setup(dev, irq, &msg);
+		if (status < 0) {
+			destroy_msi_irq(irq);
 			break;
+		}

-		writel(address_lo,
-			base + j * PCI_MSIX_ENTRY_SIZE +
-			PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
-		writel(address_hi,
-			base + j * PCI_MSIX_ENTRY_SIZE +
-			PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
-		writel(data,
-			base + j * PCI_MSIX_ENTRY_SIZE +
-			PCI_MSIX_ENTRY_DATA_OFFSET);
-		attach_msi_entry(entry, vector);
+		write_msi_msg(entry, &msg);
+		attach_msi_entry(entry, irq);
 	}
 	if (i != nvec) {
+		int avail = i - 1;
 		i--;
 		for (; i >= 0; i--) {
-			vector = (entries + i)->vector;
-			msi_free_vector(dev, vector, 0);
+			irq = (entries + i)->vector;
+			msi_free_irq(dev, irq);
 			(entries + i)->vector = 0;
 		}
-		return -EBUSY;
+		/* If we had some success, report the number of irqs
+		 * we succeeded in setting up.
		 */
+		if (avail <= 0)
+			avail = -EBUSY;
+		return avail;
 	}
 	/* Set MSI-X enabled bits */
 	enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
@@ -905,10 +807,10 @@
  * @dev: pointer to the pci_dev data structure of MSI device function
  *
  * Setup the MSI capability structure of device function with
- * a single MSI vector upon its software driver call to request for
+ * a single MSI irq upon its software driver call to request for
  * MSI mode enabled on its hardware device function. A return of zero
  * indicates the successful setup of an entry zero with the new MSI
- * vector or non-zero for otherwise.
+ * irq, or non-zero otherwise.
 **/
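From a driver's point of view the contract documented above is unchanged by the rework: enable MSI, then request_irq() on the rewritten dev->irq, and tear down in the reverse order. A minimal sketch under that contract follows; the example_* names are hypothetical, and note that the warning paths this patch leaves in pci_disable_msi()/pci_disable_msix() make the free_irq()-before-disable ordering mandatory.

static irqreturn_t example_isr(int irq, void *dev_id, struct pt_regs *regs)
{
	return IRQ_HANDLED;
}

static int example_open(struct pci_dev *pdev)
{
	/* On success pci_enable_msi() rewrites pdev->irq to the MSI irq;
	 * on failure pdev->irq still holds the pin-assertion irq. */
	if (pci_enable_msi(pdev))
		dev_info(&pdev->dev, "MSI unavailable, staying in INTx mode\n");

	return request_irq(pdev->irq, example_isr, 0, "example", pdev);
}

static void example_close(struct pci_dev *pdev)
{
	free_irq(pdev->irq, pdev);	/* must precede pci_disable_msi() */
	pci_disable_msi(pdev);		/* restores the pin-assertion irq */
}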
 int pci_enable_msi(struct pci_dev* dev)
 {
@@ -936,52 +838,29 @@
 	if (!pos)
 		return -EINVAL;

-	if (!msi_lookup_vector(dev, PCI_CAP_ID_MSI)) {
-		/* Lookup Sucess */
-		unsigned long flags;
+	pci_read_config_word(dev, msi_control_reg(pos), &control);
+	if (!is_64bit_address(control) && msi_ops->needs_64bit_address)
+		return -EINVAL;

-		pci_read_config_word(dev, msi_control_reg(pos), &control);
-		if (control & PCI_MSI_FLAGS_ENABLE)
-			return 0;	/* Already in MSI mode */
-		spin_lock_irqsave(&msi_lock, flags);
-		if (!vector_irq[dev->irq]) {
-			msi_desc[dev->irq]->msi_attrib.state = 0;
-			vector_irq[dev->irq] = -1;
-			nr_released_vectors--;
-			spin_unlock_irqrestore(&msi_lock, flags);
-			status = msi_register_init(dev, msi_desc[dev->irq]);
-			if (status == 0)
-				enable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
-			return status;
-		}
-		spin_unlock_irqrestore(&msi_lock, flags);
-		dev->irq = temp;
-	}
-	/* Check whether driver already requested for MSI-X vectors */
+	WARN_ON(!msi_lookup_irq(dev, PCI_CAP_ID_MSI));
+
+	/* Check whether driver already requested for MSI-X irqs */
 	pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
-	if (pos > 0 && !msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) {
+	if (pos > 0 && !msi_lookup_irq(dev, PCI_CAP_ID_MSIX)) {
 		printk(KERN_INFO "PCI: %s: Can't enable MSI. "
-			"Device already has MSI-X vectors assigned\n",
+			"Device already has MSI-X irqs assigned\n",
 			pci_name(dev));
 		dev->irq = temp;
 		return -EINVAL;
 	}
 	status = msi_capability_init(dev);
-	if (!status) {
-		if (!pos)
-			nr_reserved_vectors--;	/* Only MSI capable */
-		else if (nr_msix_devices > 0)
-			nr_msix_devices--;	/* Both MSI and MSI-X capable,
-						   but choose enabling MSI */
-	}
-
 	return status;
 }

 void pci_disable_msi(struct pci_dev* dev)
 {
 	struct msi_desc *entry;
-	int pos, default_vector;
+	int pos, default_irq;
 	u16 control;
 	unsigned long flags;
@@ -998,6 +877,8 @@
 	if (!(control & PCI_MSI_FLAGS_ENABLE))
 		return;

+	disable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
+
 	spin_lock_irqsave(&msi_lock, flags);
 	entry = msi_desc[dev->irq];
 	if (!entry || !entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI) {
@@ -1007,32 +888,30 @@
 	if (entry->msi_attrib.state) {
 		spin_unlock_irqrestore(&msi_lock, flags);
 		printk(KERN_WARNING "PCI: %s: pci_disable_msi() called without "
-		       "free_irq() on MSI vector %d\n",
+		       "free_irq() on MSI irq %d\n",
 		       pci_name(dev), dev->irq);
 		BUG_ON(entry->msi_attrib.state > 0);
 	} else {
-		vector_irq[dev->irq] = 0;	/* free it */
-		nr_released_vectors++;
-		default_vector = entry->msi_attrib.default_vector;
+		default_irq = entry->msi_attrib.default_irq;
 		spin_unlock_irqrestore(&msi_lock, flags);
-		/* Restore dev->irq to its default pin-assertion vector */
-		dev->irq = default_vector;
-		disable_msi_mode(dev, pci_find_capability(dev, PCI_CAP_ID_MSI),
-					PCI_CAP_ID_MSI);
+		msi_free_irq(dev, dev->irq);
+
+		/* Restore dev->irq to its default pin-assertion irq */
+		dev->irq = default_irq;
 	}
 }

-static int msi_free_vector(struct pci_dev* dev, int vector, int reassign)
+static int msi_free_irq(struct pci_dev* dev, int irq)
 {
 	struct msi_desc *entry;
 	int head, entry_nr, type;
 	void __iomem *base;
 	unsigned long flags;

-	msi_ops->teardown(vector);
+	msi_ops->teardown(irq);

 	spin_lock_irqsave(&msi_lock, flags);
-	entry = msi_desc[vector];
+	entry = msi_desc[irq];
 	if (!entry || entry->dev != dev) {
 		spin_unlock_irqrestore(&msi_lock, flags);
 		return -EINVAL;
 	}
@@ -1044,101 +923,43 @@
 	msi_desc[entry->link.head]->link.tail = entry->link.tail;
 	msi_desc[entry->link.tail]->link.head = entry->link.head;
 	entry->dev = NULL;
-	if (!reassign) {
-		vector_irq[vector] = 0;
-		nr_released_vectors++;
-	}
-	msi_desc[vector] = NULL;
+	msi_desc[irq] = NULL;
 	spin_unlock_irqrestore(&msi_lock, flags);

-	kmem_cache_free(msi_cachep, entry);
+	destroy_msi_irq(irq);

 	if (type == PCI_CAP_ID_MSIX) {
-		if (!reassign)
-			writel(1, base +
-				entry_nr * PCI_MSIX_ENTRY_SIZE +
-				PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+		writel(1, base + entry_nr * PCI_MSIX_ENTRY_SIZE +
+			PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);

-		if (head == vector)
+		if (head == irq)
 			iounmap(base);
 	}

 	return 0;
 }

-static int reroute_msix_table(int head, struct msix_entry *entries, int *nvec)
-{
-	int vector = head, tail = 0;
-	int i, j = 0, nr_entries = 0;
-	void __iomem *base;
-	unsigned long flags;
-
-	spin_lock_irqsave(&msi_lock, flags);
-	while (head != tail) {
-		nr_entries++;
-		tail = msi_desc[vector]->link.tail;
-		if (entries[0].entry == msi_desc[vector]->msi_attrib.entry_nr)
-			j = vector;
-		vector = tail;
-	}
-	if (*nvec > nr_entries) {
-		spin_unlock_irqrestore(&msi_lock, flags);
-		*nvec = nr_entries;
-		return -EINVAL;
-	}
-	vector = ((j > 0) ? j : head);
-	for (i = 0; i < *nvec; i++) {
-		j = msi_desc[vector]->msi_attrib.entry_nr;
-		msi_desc[vector]->msi_attrib.state = 0;	/* Mark it not active */
-		vector_irq[vector] = -1;		/* Mark it busy */
-		nr_released_vectors--;
-		entries[i].vector = vector;
-		if (j != (entries + i)->entry) {
-			base = msi_desc[vector]->mask_base;
-			msi_desc[vector]->msi_attrib.entry_nr =
-				(entries + i)->entry;
-			writel( readl(base + j * PCI_MSIX_ENTRY_SIZE +
-				PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET), base +
-				(entries + i)->entry * PCI_MSIX_ENTRY_SIZE +
-				PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
-			writel( readl(base + j * PCI_MSIX_ENTRY_SIZE +
-				PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET), base +
-				(entries + i)->entry * PCI_MSIX_ENTRY_SIZE +
-				PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
-			writel( (readl(base + j * PCI_MSIX_ENTRY_SIZE +
-				PCI_MSIX_ENTRY_DATA_OFFSET) & 0xff00) | vector,
-				base + (entries+i)->entry*PCI_MSIX_ENTRY_SIZE +
-				PCI_MSIX_ENTRY_DATA_OFFSET);
-		}
-		vector = msi_desc[vector]->link.tail;
-	}
-	spin_unlock_irqrestore(&msi_lock, flags);
-
-	return 0;
-}
-
 /**
  * pci_enable_msix - configure device's MSI-X capability structure
  * @dev: pointer to the pci_dev data structure of MSI-X device function
  * @entries: pointer to an array of MSI-X entries
- * @nvec: number of MSI-X vectors requested for allocation by device driver
+ * @nvec: number of MSI-X irqs requested for allocation by device driver
  *
  * Setup the MSI-X capability structure of device function with the number
- * of requested vectors upon its software driver call to request for
+ * of requested irqs upon its software driver call to request for
  * MSI-X mode enabled on its hardware device function. A return of zero
  * indicates the successful configuration of MSI-X capability structure
- * with new allocated MSI-X vectors. A return of < 0 indicates a failure.
+ * with newly allocated MSI-X irqs. A return of < 0 indicates a failure.
  * Or a return of > 0 indicates that driver request is exceeding the number
- * of vectors available. Driver should use the returned value to re-send
+ * of irqs available. Driver should use the returned value to re-send
  * its request.
 **/
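In other words a positive return value is a quota, not an error: the driver is expected to shrink its request to the returned count and retry. Roughly, as a hypothetical helper (the caller fills in entries[i].entry beforehand):

static int example_enable_msix(struct pci_dev *pdev,
			       struct msix_entry *entries, int nvec)
{
	int rc;

	while (nvec > 0) {
		rc = pci_enable_msix(pdev, entries, nvec);
		if (rc == 0)
			return nvec;	/* entries[i].vector now holds irqs */
		if (rc < 0)
			return rc;	/* hard failure */
		nvec = rc;		/* only rc irqs available; retry smaller */
	}
	return -ENOSPC;
}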
 int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec)
 {
 	struct pci_bus *bus;
-	int status, pos, nr_entries, free_vectors;
+	int status, pos, nr_entries;
 	int i, j, temp;
 	u16 control;
-	unsigned long flags;

 	if (!pci_msi_enable || !dev || !entries)
 		return -EINVAL;
@@ -1159,9 +980,6 @@
 		return -EINVAL;

 	pci_read_config_word(dev, msi_control_reg(pos), &control);
-	if (control & PCI_MSIX_FLAGS_ENABLE)
-		return -EINVAL;		/* Already in MSI-X mode */
-
 	nr_entries = multi_msix_capable(control);
 	if (nvec > nr_entries)
 		return -EINVAL;
@@ -1176,56 +994,18 @@
 		}
 	}
 	temp = dev->irq;
-	if (!msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) {
-		/* Lookup Sucess */
-		nr_entries = nvec;
-		/* Reroute MSI-X table */
-		if (reroute_msix_table(dev->irq, entries, &nr_entries)) {
-			/* #requested > #previous-assigned */
-			dev->irq = temp;
-			return nr_entries;
-		}
-		dev->irq = temp;
-		enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
-		return 0;
-	}
-	/* Check whether driver already requested for MSI vector */
+	WARN_ON(!msi_lookup_irq(dev, PCI_CAP_ID_MSIX));
+
+	/* Check whether driver already requested for MSI irq */
 	if (pci_find_capability(dev, PCI_CAP_ID_MSI) > 0 &&
-		!msi_lookup_vector(dev, PCI_CAP_ID_MSI)) {
+		!msi_lookup_irq(dev, PCI_CAP_ID_MSI)) {
 		printk(KERN_INFO "PCI: %s: Can't enable MSI-X. "
-			"Device already has an MSI vector assigned\n",
+			"Device already has an MSI irq assigned\n",
 			pci_name(dev));
 		dev->irq = temp;
 		return -EINVAL;
 	}
-
-	spin_lock_irqsave(&msi_lock, flags);
-	/*
-	 * msi_lock is provided to ensure that enough vectors resources are
-	 * available before granting.
-	 */
-	free_vectors = pci_vector_resources(last_alloc_vector,
-				nr_released_vectors);
-	/* Ensure that each MSI/MSI-X device has one vector reserved by
-	   default to avoid any MSI-X driver to take all available
-	   resources */
-	free_vectors -= nr_reserved_vectors;
-	/* Find the average of free vectors among MSI-X devices */
-	if (nr_msix_devices > 0)
-		free_vectors /= nr_msix_devices;
-	spin_unlock_irqrestore(&msi_lock, flags);
-
-	if (nvec > free_vectors) {
-		if (free_vectors > 0)
-			return free_vectors;
-		else
-			return -EBUSY;
-	}
-
 	status = msix_capability_init(dev, entries, nvec);
-	if (!status && nr_msix_devices > 0)
-		nr_msix_devices--;
-
 	return status;
 }
@@ -1247,47 +1027,42 @@
 	if (!(control & PCI_MSIX_FLAGS_ENABLE))
 		return;

+	disable_msi_mode(dev, pos, PCI_CAP_ID_MSIX);
+
 	temp = dev->irq;
-	if (!msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) {
-		int state, vector, head, tail = 0, warning = 0;
+	if (!msi_lookup_irq(dev, PCI_CAP_ID_MSIX)) {
+		int state, irq, head, tail = 0, warning = 0;
 		unsigned long flags;

-		vector = head = dev->irq;
-		spin_lock_irqsave(&msi_lock, flags);
+		irq = head = dev->irq;
+		dev->irq = temp;	/* Restore pin IRQ */
 		while (head != tail) {
-			state = msi_desc[vector]->msi_attrib.state;
+			spin_lock_irqsave(&msi_lock, flags);
+			state = msi_desc[irq]->msi_attrib.state;
+			tail = msi_desc[irq]->link.tail;
+			spin_unlock_irqrestore(&msi_lock, flags);
 			if (state)
 				warning = 1;
-			else {
-				vector_irq[vector] = 0;	/* free it */
-				nr_released_vectors++;
-			}
-			tail = msi_desc[vector]->link.tail;
-			vector = tail;
+			else if (irq != head)	/* Release MSI-X irq */
+				msi_free_irq(dev, irq);
+			irq = tail;
 		}
-		spin_unlock_irqrestore(&msi_lock, flags);
+		msi_free_irq(dev, irq);
 		if (warning) {
-			dev->irq = temp;
 			printk(KERN_WARNING
 				"PCI: %s: pci_disable_msix() called without "
-				"free_irq() on all MSI-X vectors\n",
+				"free_irq() on all MSI-X irqs\n",
 				pci_name(dev));
 			BUG_ON(warning > 0);
-		} else {
-			dev->irq = temp;
-			disable_msi_mode(dev,
-				pci_find_capability(dev, PCI_CAP_ID_MSIX),
-				PCI_CAP_ID_MSIX);
-		}
 	}
 }

 /**
- * msi_remove_pci_irq_vectors - reclaim MSI(X) vectors to unused state
+ * msi_remove_pci_irq_vectors - reclaim MSI(X) irqs to unused state
  * @dev: pointer to the pci_dev data structure of MSI(X) device function
  *
  * Being called during hotplug remove, from which the device function
- * is hot-removed. All previous assigned MSI/MSI-X vectors, if
+ * is hot-removed. All previously assigned MSI/MSI-X irqs, if
  * allocated for this device function, are reclaimed to unused state,
  * which may be used later on.
 **/
@@ -1301,42 +1076,42 @@
 	temp = dev->irq;	/* Save IOAPIC IRQ */
 	pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
-	if (pos > 0 && !msi_lookup_vector(dev, PCI_CAP_ID_MSI)) {
+	if (pos > 0 && !msi_lookup_irq(dev, PCI_CAP_ID_MSI)) {
 		spin_lock_irqsave(&msi_lock, flags);
 		state = msi_desc[dev->irq]->msi_attrib.state;
 		spin_unlock_irqrestore(&msi_lock, flags);
 		if (state) {
 			printk(KERN_WARNING
 				"PCI: %s: msi_remove_pci_irq_vectors() "
-				"called without free_irq() on MSI vector %d\n",
+				"called without free_irq() on MSI irq %d\n",
 				pci_name(dev), dev->irq);
 			BUG_ON(state > 0);
-		} else /* Release MSI vector assigned to this device */
-			msi_free_vector(dev, dev->irq, 0);
+		} else /* Release MSI irq assigned to this device */
+			msi_free_irq(dev, dev->irq);
 		dev->irq = temp;	/* Restore IOAPIC IRQ */
 	}

 	pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
-	if (pos > 0 && !msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) {
-		int vector, head, tail = 0, warning = 0;
+	if (pos > 0 && !msi_lookup_irq(dev, PCI_CAP_ID_MSIX)) {
+		int irq, head, tail = 0, warning = 0;
 		void __iomem *base = NULL;

-		vector = head = dev->irq;
+		irq = head = dev->irq;
 		while (head != tail) {
 			spin_lock_irqsave(&msi_lock, flags);
-			state = msi_desc[vector]->msi_attrib.state;
-			tail = msi_desc[vector]->link.tail;
-			base = msi_desc[vector]->mask_base;
+			state = msi_desc[irq]->msi_attrib.state;
+			tail = msi_desc[irq]->link.tail;
+			base = msi_desc[irq]->mask_base;
 			spin_unlock_irqrestore(&msi_lock, flags);
 			if (state)
 				warning = 1;
-			else if (vector != head) /* Release MSI-X vector */
-				msi_free_vector(dev, vector, 0);
-			vector = tail;
+			else if (irq != head) /* Release MSI-X irq */
+				msi_free_irq(dev, irq);
+			irq = tail;
 		}
-		msi_free_vector(dev, vector, 0);
+		msi_free_irq(dev, irq);
 		if (warning) {
 			iounmap(base);
 			printk(KERN_WARNING
 				"PCI: %s: msi_remove_pci_irq_vectors() "
-				"called without free_irq() on all MSI-X vectors\n",
+				"called without free_irq() on all MSI-X irqs\n",
 				pci_name(dev));
 			BUG_ON(warning > 0);
 		}
diff -urN ./linux-2.6.18.1/drivers/pci/msi.h linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/msi.h
--- ./linux-2.6.18.1/drivers/pci/msi.h 2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/pci/msi.h 2007-05-19 23:58:35.000000000 +0900
@@ -6,85 +6,9 @@
 #ifndef MSI_H
 #define MSI_H

-/*
- * MSI operation vector.  Used by the msi core code (drivers/pci/msi.c)
- * to abstract platform-specific tasks relating to MSI address generation
- * and resource management.
- */
-struct msi_ops {
-	/**
-	 * setup - generate an MSI bus address and data for a given vector
-	 * @pdev: PCI device context (in)
-	 * @vector: vector allocated by the msi core (in)
-	 * @addr_hi: upper 32 bits of PCI bus MSI address (out)
-	 * @addr_lo: lower 32 bits of PCI bus MSI address (out)
-	 * @data: MSI data payload (out)
-	 *
-	 * Description: The setup op is used to generate a PCI bus addres and
-	 * data which the msi core will program into the card MSI capability
-	 * registers.  The setup routine is responsible for picking an initial
-	 * cpu to target the MSI at.  The setup routine is responsible for
-	 * examining pdev to determine the MSI capabilities of the card and
-	 * generating a suitable address/data.  The setup routine is
-	 * responsible for allocating and tracking any system resources it
-	 * needs to route the MSI to the cpu it picks, and for associating
-	 * those resources with the passed in vector.
-	 *
-	 * Returns 0 if the MSI address/data was successfully setup.
-	 **/
-
-	int (*setup) (struct pci_dev *pdev, unsigned int vector,
-		      u32 *addr_hi, u32 *addr_lo, u32 *data);
-
-	/**
-	 * teardown - release resources allocated by setup
-	 * @vector: vector context for resources (in)
-	 *
-	 * Description: The teardown op is used to release any resources
-	 * that were allocated in the setup routine associated with the passed
-	 * in vector.
-	 **/
-
-	void (*teardown) (unsigned int vector);
-
-	/**
-	 * target - retarget an MSI at a different cpu
-	 * @vector: vector context for resources (in)
-	 * @cpu: new cpu to direct vector at (in)
-	 * @addr_hi: new value of PCI bus upper 32 bits (in/out)
-	 * @addr_lo: new value of PCI bus lower 32 bits (in/out)
-	 *
-	 * Description: The target op is used to redirect an MSI vector
-	 * at a different cpu.  addr_hi/addr_lo coming in are the existing
-	 * values that the MSI core has programmed into the card.  The
-	 * target code is responsible for freeing any resources (if any)
-	 * associated with the old address, and generating a new PCI bus
-	 * addr_hi/addr_lo that will redirect the vector at the indicated cpu.
-	 **/
-
-	void (*target) (unsigned int vector, unsigned int cpu,
-			u32 *addr_hi, u32 *addr_lo);
-};
-
-extern int msi_register(struct msi_ops *ops);
-
 #include

 /*
- * Assume the maximum number of hot plug slots supported by the system is about
- * ten. The worstcase is that each of these slots is hot-added with a device,
- * which has two MSI/MSI-X capable functions. To avoid any MSI-X driver, which
- * attempts to request all available vectors, NR_HP_RESERVED_VECTORS is defined
- * as below to ensure at least one message is assigned to each detected MSI/
- * MSI-X device function.
- */
-#define NR_HP_RESERVED_VECTORS 	20
-
-extern int vector_irq[NR_VECTORS];
-extern void (*interrupt[NR_IRQS])(void);
-extern int pci_vector_resources(int last, int nr_released);
-
-/*
  * MSI-X Address Register
  */
 #define PCI_MSIX_FLAGS_QSIZE		0x7FF
@@ -110,8 +34,8 @@
 	(1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
 #define multi_msi_enable(control, num) \
 	control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE);
-#define is_64bit_address(control)	(control & PCI_MSI_FLAGS_64BIT)
-#define is_mask_bit_support(control)	(control & PCI_MSI_FLAGS_MASKBIT)
+#define is_64bit_address(control)	(!!(control & PCI_MSI_FLAGS_64BIT))
+#define is_mask_bit_support(control)	(!!(control & PCI_MSI_FLAGS_MASKBIT))
 #define msi_enable(control, num) multi_msi_enable(control, num); \
 	control |= PCI_MSI_FLAGS_ENABLE
@@ -130,10 +54,10 @@
 		__u8	type	: 5;	/* {0: unused, 5h:MSI, 11h:MSI-X} */
 		__u8	maskbit	: 1;	/* mask-pending bit supported ?   */
 		__u8	state	: 1;	/* {0: free, 1: busy}		  */
-		__u8	reserved: 1;	/* reserved			  */
-		__u8	entry_nr;	/* specific enabled entry	  */
-		__u8	default_vector;	/* default pre-assigned vector	  */
-		__u8	unused;		/* formerly unused destination cpu*/
+		__u8	is_64	: 1;	/* Address size: 0=32bit 1=64bit  */
+		__u8	pos;		/* Location of the msi capability */
+		__u16	entry_nr;	/* specific enabled entry	  */
+		unsigned default_irq;	/* default pre-assigned irq	  */
 	}msi_attrib;

 	struct {
@@ -146,10 +70,7 @@

 #ifdef CONFIG_PM
 	/* PM save area for MSIX address/data */
-
-	u32	address_hi_save;
-	u32	address_lo_save;
-	u32	data_save;
+	struct msi_msg msg_save;
 #endif
 };
diff -urN ./linux-2.6.18.1/drivers/scsi/aacraid/aacraid.h linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/scsi/aacraid/aacraid.h
--- ./linux-2.6.18.1/drivers/scsi/aacraid/aacraid.h 2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/scsi/aacraid/aacraid.h 2007-05-19 23:58:35.000000000 +0900
@@ -744,7 +744,7 @@
 	u32			unique;		// unique value representing this context
 	ulong			jiffies;	// used for cleanup - dmb changed to ulong
 	struct list_head	next;		// used to link context's into a linked list
-	struct semaphore	wait_sem;	// this is used to wait for the next fib to arrive.
+	struct compat_semaphore	wait_sem;	// this is used to wait for the next fib to arrive.
 	int			wait;		// Set to true when thread is in WaitForSingleObject
 	unsigned long		count;		// total number of FIBs on FibList
 	struct list_head	fib_list;	// this holds fibs and their attachd hw_fibs
@@ -814,7 +814,7 @@
 	 *	This is the event the sendfib routine will wait on if the
 	 *	caller did not pass one and this is synch io.
 	 */
-	struct semaphore	event_wait;
+	struct compat_semaphore	event_wait;
 	spinlock_t		event_lock;

 	u32			done;	/* gets set to 1 when fib is complete */
diff -urN ./linux-2.6.18.1/drivers/scsi/qla2xxx/qla_def.h linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/scsi/qla2xxx/qla_def.h
--- ./linux-2.6.18.1/drivers/scsi/qla2xxx/qla_def.h 2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/scsi/qla2xxx/qla_def.h 2007-05-19 23:58:35.000000000 +0900
@@ -2284,7 +2284,7 @@
 	spinlock_t	mbx_reg_lock;	/* Mbx Cmd Register Lock */

 	struct semaphore mbx_cmd_sem;	/* Serialialize mbx access */
-	struct semaphore mbx_intr_sem;	/* Used for completion notification */
+	struct compat_semaphore mbx_intr_sem;	/* Used for completion notification */

 	uint32_t	mbx_flags;
 #define MBX_IN_PROGRESS		BIT_0
diff -urN ./linux-2.6.18.1/drivers/serial/8250.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/serial/8250.c
--- ./linux-2.6.18.1/drivers/serial/8250.c 2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/serial/8250.c 2007-05-19 23:58:35.000000000 +0900
@@ -2252,14 +2252,10 @@

 	touch_nmi_watchdog();

-	local_irq_save(flags);
-	if (up->port.sysrq) {
-		/* serial8250_handle_port() already took the lock */
-		locked = 0;
-	} else if (oops_in_progress) {
-		locked = spin_trylock(&up->port.lock);
-	} else
-		spin_lock(&up->port.lock);
+	if (up->port.sysrq || oops_in_progress)
+		locked = spin_trylock_irqsave(&up->port.lock, flags);
+	else
+		spin_lock_irqsave(&up->port.lock, flags);

 	/*
 	 *	First save the IER then disable the interrupts
@@ -2281,8 +2277,7 @@
 	serial_out(up, UART_IER, ier);

 	if (locked)
-		spin_unlock(&up->port.lock);
-	local_irq_restore(flags);
+		spin_unlock_irqrestore(&up->port.lock, flags);
 }

 static int serial8250_console_setup(struct console *co, char *options)
diff -urN ./linux-2.6.18.1/drivers/usb/core/devio.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/usb/core/devio.c
--- ./linux-2.6.18.1/drivers/usb/core/devio.c 2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/usb/core/devio.c 2007-05-19 23:58:35.000000000 +0900
@@ -304,10 +304,11 @@
 	struct async *as = (struct async *)urb->context;
 	struct dev_state *ps = as->ps;
 	struct siginfo sinfo;
+	unsigned long flags;

-	spin_lock(&ps->lock);
-	list_move_tail(&as->asynclist, &ps->async_completed);
-	spin_unlock(&ps->lock);
+	spin_lock_irqsave(&ps->lock, flags);
+	list_move_tail(&as->asynclist, &ps->async_completed);
+	spin_unlock_irqrestore(&ps->lock, flags);
 	if (as->signr) {
 		sinfo.si_signo = as->signr;
 		sinfo.si_errno = as->urb->status;
diff -urN ./linux-2.6.18.1/drivers/usb/core/hcd.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/usb/core/hcd.c
--- ./linux-2.6.18.1/drivers/usb/core/hcd.c 2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/usb/core/hcd.c 2007-05-19 23:58:35.000000000 +0900
@@ -515,13 +515,11 @@
 	}

 	/* any errors get returned through the urb completion */
-	local_irq_save (flags);
-	spin_lock (&urb->lock);
+	spin_lock_irqsave(&urb->lock, flags);
 	if (urb->status == -EINPROGRESS)
 		urb->status = status;
-	spin_unlock (&urb->lock);
+	spin_unlock_irqrestore(&urb->lock, flags);
 	usb_hcd_giveback_urb (hcd, urb, NULL);
-	local_irq_restore (flags);
 	return 0;
 }
@@ -549,8 +547,7 @@

 	if (length > 0) {

 		/* try to complete the status urb */
-		local_irq_save (flags);
-		spin_lock(&hcd_root_hub_lock);
+		spin_lock_irqsave(&hcd_root_hub_lock, flags);
 		urb = hcd->status_urb;
 		if (urb) {
 			spin_lock(&urb->lock);
@@ -566,14 +563,13 @@
 			spin_unlock(&urb->lock);
 		} else
 			length = 0;
-		spin_unlock(&hcd_root_hub_lock);
+		spin_unlock_irqrestore(&hcd_root_hub_lock, flags);

 		/* local irqs are always blocked in completions */
 		if (length > 0)
 			usb_hcd_giveback_urb (hcd, urb, NULL);
 		else
 			hcd->poll_pending = 1;
-		local_irq_restore (flags);
 	}

 	/* The USB 2.0 spec says 256 ms.  This is close enough and won't
@@ -656,17 +652,15 @@
 	} else {				/* Status URB */
 		if (!hcd->uses_new_polling)
 			del_timer_sync (&hcd->rh_timer);
-		local_irq_disable ();
-		spin_lock (&hcd_root_hub_lock);
+		spin_lock_irq(&hcd_root_hub_lock);
 		if (urb == hcd->status_urb) {
 			hcd->status_urb = NULL;
 			urb->hcpriv = NULL;
 		} else
 			urb = NULL;	/* wasn't fully queued */
-		spin_unlock (&hcd_root_hub_lock);
+		spin_unlock_irq(&hcd_root_hub_lock);
 		if (urb)
 			usb_hcd_giveback_urb (hcd, urb, NULL);
-		local_irq_enable ();
 	}

 	return 0;
@@ -1371,15 +1365,13 @@
 	WARN_ON (!HC_IS_RUNNING (hcd->state) && hcd->state != HC_STATE_HALT &&
 			udev->state != USB_STATE_NOTATTACHED);

-	local_irq_disable ();
-
 	/* FIXME move most of this into message.c as part of its
 	 * endpoint disable logic
 	 */

 	/* ep is already gone from udev->ep_{in,out}[]; no more submits */
rescan:
-	spin_lock (&hcd_data_lock);
+	spin_lock_irq(&hcd_data_lock);
 	list_for_each_entry (urb, &ep->urb_list, urb_list) {
 		int	tmp;

@@ -1392,13 +1384,13 @@
 		if (urb->status != -EINPROGRESS)
 			continue;
 		usb_get_urb (urb);
-		spin_unlock (&hcd_data_lock);
+		spin_unlock_irq(&hcd_data_lock);

-		spin_lock (&urb->lock);
+		spin_lock_irq(&urb->lock);
 		tmp = urb->status;
 		if (tmp == -EINPROGRESS)
 			urb->status = -ESHUTDOWN;
-		spin_unlock (&urb->lock);
+		spin_unlock_irq(&urb->lock);

 		/* kick hcd unless it's already returning this */
 		if (tmp == -EINPROGRESS) {
@@ -1421,8 +1413,7 @@
 		/* list contents may have changed */
 		goto rescan;
 	}
-	spin_unlock (&hcd_data_lock);
-	local_irq_enable ();
+	spin_unlock_irq(&hcd_data_lock);

 	/* synchronize with the hardware, so old configuration state
 	 * clears out immediately (and will be freed).
diff -urN ./linux-2.6.18.1/drivers/usb/core/message.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/usb/core/message.c
--- ./linux-2.6.18.1/drivers/usb/core/message.c 2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/usb/core/message.c 2007-05-19 23:58:35.000000000 +0900
@@ -264,8 +264,9 @@
 static void sg_complete (struct urb *urb, struct pt_regs *regs)
 {
 	struct usb_sg_request	*io = (struct usb_sg_request *) urb->context;
+	unsigned long flags;

-	spin_lock (&io->lock);
+	spin_lock_irqsave (&io->lock, flags);

 	/* In 2.5 we require hcds' endpoint queues not to progress after fault
 	 * reports, until the completion callback (this!) returns.  That lets
@@ -299,7 +300,7 @@
 		 * unlink pending urbs so they won't rx/tx bad data.
 		 * careful: unlink can sometimes be synchronous...
 		 */
-		spin_unlock (&io->lock);
+		spin_unlock_irqrestore (&io->lock, flags);
 		for (i = 0, found = 0; i < io->entries; i++) {
 			if (!io->urbs [i] || !io->urbs [i]->dev)
 				continue;
@@ -314,7 +315,7 @@
 			} else if (urb == io->urbs [i])
 				found = 1;
 		}
-		spin_lock (&io->lock);
+		spin_lock_irqsave (&io->lock, flags);
 	}
 	urb->dev = NULL;
@@ -324,7 +325,7 @@
 	if (!io->count)
 		complete (&io->complete);

-	spin_unlock (&io->lock);
+	spin_unlock_irqrestore (&io->lock, flags);
 }
@@ -586,7 +587,7 @@
 				dev_warn (&io->dev->dev, "%s, unlink --> %d\n",
 					__FUNCTION__, retval);
 			}
-			spin_lock (&io->lock);
+			spin_lock_irqsave (&io->lock, flags);
 		}
 		spin_unlock_irqrestore (&io->lock, flags);
 	}
diff -urN ./linux-2.6.18.1/drivers/usb/net/usbnet.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/usb/net/usbnet.c
--- ./linux-2.6.18.1/drivers/usb/net/usbnet.c 2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/usb/net/usbnet.c 2007-05-19 23:58:35.000000000 +0900
@@ -818,6 +818,8 @@

 	urb->dev = NULL;
 	entry->state = tx_done;
+	spin_lock_rt(&dev->txq.lock);
+	spin_unlock_rt(&dev->txq.lock);
 	defer_bh(dev, skb, &dev->txq);
 }
diff -urN ./linux-2.6.18.1/drivers/usb/storage/usb.h linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/usb/storage/usb.h
--- ./linux-2.6.18.1/drivers/usb/storage/usb.h 2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/usb/storage/usb.h 2007-05-19 23:58:35.000000000 +0900
@@ -147,7 +147,7 @@
 	dma_addr_t		iobuf_dma;

 	/* mutual exclusion and synchronization structures */
-	struct semaphore	sema;		 /* to sleep thread on	      */
+	struct compat_semaphore	sema;		 /* to sleep thread on	      */
 	struct completion	notify;		 /* thread begin/end	      */
 	wait_queue_head_t	delay_wait;	 /* wait during scan, reset   */
diff -urN ./linux-2.6.18.1/drivers/video/console/fbcon.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/video/console/fbcon.c
--- ./linux-2.6.18.1/drivers/video/console/fbcon.c 2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/video/console/fbcon.c 2007-05-19 23:58:35.000000000 +0900
@@ -1247,7 +1246,6 @@
 {
 	struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]];
 	struct fbcon_ops *ops = info->fbcon_par;
-	struct display *p = &fb_display[vc->vc_num];
 	u_int y_break;

@@ -1276,10 +1275,11 @@
 	struct display *p = &fb_display[vc->vc_num];
 	struct fbcon_ops *ops = info->fbcon_par;

-	if (!fbcon_is_inactive(vc, info))
+	if (!fbcon_is_inactive(vc, info)) {
 		ops->putcs(vc, info, s, count, real_y(p, ypos), xpos,
 			   get_color(vc, info, scr_readw(s), 1),
 			   get_color(vc, info, scr_readw(s), 0));
+	}
 }

 static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos)
@@ -3079,6 +3079,7 @@
 	.con_screen_pos 	= fbcon_screen_pos,
 	.con_getxy 		= fbcon_getxy,
 	.con_resize             = fbcon_resize,
+	.con_preemptible	= 1,
 };

 static struct notifier_block fbcon_event_notifier = {
diff -urN ./linux-2.6.18.1/drivers/video/console/vgacon.c linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/video/console/vgacon.c
--- ./linux-2.6.18.1/drivers/video/console/vgacon.c 2006-10-14 12:34:03.000000000 +0900
+++ linux-2.6.18.1-cabi-20070529-RT_HRT/drivers/video/console/vgacon.c 2007-05-19 23:58:35.000000000 +0900
@@ -52,7 +52,7 @@
 #include
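The driver hunks above (8250, USB core, usbnet, usb-storage, aacraid, qla2xxx) repeat two PREEMPT_RT conversions: completion paths stop assuming they run with interrupts already disabled and take the _irqsave lock variants themselves, and semaphores that are posted from hard interrupt context become compat_semaphore. The locking half of the pattern, as a generic sketch with hypothetical example_* names:

struct example_state {
	spinlock_t	lock;
	/* ... driver state protected by the lock ... */
};

/* A completion callback can no longer assume hard irqs are disabled
 * (under PREEMPT_RT it may run in task context), so it must save and
 * restore the irq state around its own locking. */
static void example_complete(struct urb *urb, struct pt_regs *regs)
{
	struct example_state *st = urb->context;	/* hypothetical */
	unsigned long flags;

	spin_lock_irqsave(&st->lock, flags);
	/* ... update state, move the urb onto a done list ... */
	spin_unlock_irqrestore(&st->lock, flags);
}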