Index: linux-2.6.25.8-rt7/security/selinux/avc.c =================================================================== --- linux-2.6.25.8-rt7.orig/security/selinux/avc.c 2008-06-23 19:13:01.000000000 -0400 +++ linux-2.6.25.8-rt7/security/selinux/avc.c 2008-06-23 19:13:01.000000000 -0400 @@ -312,6 +312,7 @@ static inline int avc_reclaim_node(void) if (!spin_trylock_irqsave(&avc_cache.slots_lock[hvalue], flags)) continue; + rcu_read_lock(); list_for_each_entry(node, &avc_cache.slots[hvalue], list) { if (atomic_dec_and_test(&node->ae.used)) { /* Recently Unused */ @@ -319,11 +320,13 @@ static inline int avc_reclaim_node(void) avc_cache_stats_incr(reclaims); ecx++; if (ecx >= AVC_CACHE_RECLAIM) { + rcu_read_unlock(); spin_unlock_irqrestore(&avc_cache.slots_lock[hvalue], flags); goto out; } } } + rcu_read_unlock(); spin_unlock_irqrestore(&avc_cache.slots_lock[hvalue], flags); } out: @@ -821,8 +824,14 @@ int avc_ss_reset(u32 seqno) for (i = 0; i < AVC_CACHE_SLOTS; i++) { spin_lock_irqsave(&avc_cache.slots_lock[i], flag); + /* + * On -rt the outer spinlock does not prevent RCU + * from being performed: + */ + rcu_read_lock(); list_for_each_entry(node, &avc_cache.slots[i], list) avc_node_delete(node); + rcu_read_unlock(); spin_unlock_irqrestore(&avc_cache.slots_lock[i], flag); } Index: linux-2.6.25.8-rt7/security/selinux/netif.c =================================================================== --- linux-2.6.25.8-rt7.orig/security/selinux/netif.c 2008-06-23 19:13:01.000000000 -0400 +++ linux-2.6.25.8-rt7/security/selinux/netif.c 2008-06-23 19:13:01.000000000 -0400 @@ -259,11 +259,13 @@ static void sel_netif_flush(void) int idx; struct sel_netif *netif; + rcu_read_lock(); spin_lock_bh(&sel_netif_lock); for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++) list_for_each_entry(netif, &sel_netif_hash[idx], list) sel_netif_destroy(netif); spin_unlock_bh(&sel_netif_lock); + rcu_read_unlock(); } static int sel_netif_avc_callback(u32 event, u32 ssid, u32 tsid, Index: linux-2.6.25.8-rt7/kernel/rcupreempt_trace.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/rcupreempt_trace.c 2008-06-23 19:13:01.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/rcupreempt_trace.c 2008-06-23 19:13:02.000000000 -0400 @@ -296,8 +296,14 @@ static int rcupreempt_debugfs_init(void) NULL, &rcuctrs_fops); if (!ctrsdir) goto free_out; + + if (!rcu_trace_boost_create(rcudir)) + goto free_out; + return 0; free_out: + if (ctrsdir) + debugfs_remove(ctrsdir); if (statdir) debugfs_remove(statdir); if (gpdir) @@ -309,15 +315,21 @@ out: static int __init rcupreempt_trace_init(void) { + int ret; + mutex_init(&rcupreempt_trace_mutex); rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); if (!rcupreempt_trace_buf) return 1; - return rcupreempt_debugfs_init(); + ret = rcupreempt_debugfs_init(); + if (ret) + kfree(rcupreempt_trace_buf); + return ret; } static void __exit rcupreempt_trace_cleanup(void) { + rcu_trace_boost_destroy(); debugfs_remove(statdir); debugfs_remove(gpdir); debugfs_remove(ctrsdir); Index: linux-2.6.25.8-rt7/include/linux/rcupreempt.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/rcupreempt.h 2008-06-23 19:13:01.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/rcupreempt.h 2008-06-23 19:13:04.000000000 -0400 @@ -44,7 +44,39 @@ #define rcu_qsctr_inc(cpu) #define rcu_bh_qsctr_inc(cpu) -#define call_rcu_bh(head, rcu) call_rcu(head, rcu) + +/* + * Someone might want to pass call_rcu_bh as a 
function pointer. + * So this needs to just be a rename and not a macro function. + * (no parentheses) + */ +#define call_rcu_bh call_rcu + +#ifdef CONFIG_PREEMPT_RCU_BOOST +/* + * Task state with respect to being RCU-boosted. This state is changed + * by the task itself in response to the following three events: + * 1. Preemption (or block on lock) while in RCU read-side critical section. + * 2. Outermost rcu_read_unlock() for blocked RCU read-side critical section. + * + * The RCU-boost task also updates the state when boosting priority. + */ +enum rcu_boost_state { + RCU_BOOST_IDLE = 0, /* Not yet blocked if in RCU read-side. */ + RCU_BOOST_BLOCKED = 1, /* Blocked from RCU read-side. */ + RCU_BOOSTED = 2, /* Boosting complete. */ + RCU_BOOST_INVALID = 3, /* For bogus state sightings. */ +}; + +#define N_RCU_BOOST_STATE (RCU_BOOST_INVALID + 1) + +int __init rcu_preempt_boost_init(void); + +#else /* CONFIG_PREEPMT_RCU_BOOST */ + +#define rcu_preempt_boost_init() do { } while (0) + +#endif /* CONFIG_PREEMPT_RCU_BOOST */ extern void __rcu_read_lock(void) __acquires(RCU); extern void __rcu_read_unlock(void) __releases(RCU); @@ -89,14 +121,26 @@ static inline void rcu_enter_nohz(void) { smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ __get_cpu_var(dynticks_progress_counter)++; - WARN_ON(__get_cpu_var(dynticks_progress_counter) & 0x1); + if (unlikely(__get_cpu_var(dynticks_progress_counter) & 0x1)) { + printk("BUG: bad accounting of dynamic ticks\n"); + printk(" will try to fix, but it is best to reboot\n"); + WARN_ON(1); + /* try to fix it */ + __get_cpu_var(dynticks_progress_counter)++; + } } static inline void rcu_exit_nohz(void) { __get_cpu_var(dynticks_progress_counter)++; smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ - WARN_ON(!(__get_cpu_var(dynticks_progress_counter) & 0x1)); + if (unlikely(!(__get_cpu_var(dynticks_progress_counter) & 0x1))) { + printk("BUG: bad accounting of dynamic ticks\n"); + printk(" will try to fix, but it is best to reboot\n"); + WARN_ON(1); + /* try to fix it */ + __get_cpu_var(dynticks_progress_counter)++; + } } #else /* CONFIG_NO_HZ */ @@ -104,5 +148,7 @@ static inline void rcu_exit_nohz(void) #define rcu_exit_nohz() do { } while (0) #endif /* CONFIG_NO_HZ */ +extern void rcu_process_callbacks(struct softirq_action *unused); + #endif /* __KERNEL__ */ #endif /* __LINUX_RCUPREEMPT_H */ Index: linux-2.6.25.8-rt7/include/linux/init_task.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/init_task.h 2008-06-23 19:13:01.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/init_task.h 2008-06-23 19:14:27.000000000 -0400 @@ -10,6 +10,7 @@ #include #include #include +#include #define INIT_FDTABLE \ { \ @@ -87,6 +88,24 @@ extern struct nsproxy init_nsproxy; .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(sighand.signalfd_wqh), \ } +#ifdef CONFIG_PREEMPT_RCU_BOOST +#define INIT_RCU_BOOST_PRIO .rcu_prio = MAX_PRIO, +#define INIT_PREEMPT_RCU_BOOST(tsk) \ + .rcub_rbdp = NULL, \ + .rcub_state = RCU_BOOST_IDLE, \ + .rcub_entry = LIST_HEAD_INIT(tsk.rcub_entry), +#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */ +#define INIT_RCU_BOOST_PRIO +#define INIT_PREEMPT_RCU_BOOST(tsk) +#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */ + +#ifdef CONFIG_PREEMPT_RT +# define INIT_RW_OWNERS(tsk) .owned_read_locks = { \ + [0 ... 
(MAX_RWLOCK_DEPTH - 1) ] = { .task = &tsk } }, +#else +# define INIT_RW_OWNERS(tsk) +#endif + extern struct group_info init_groups; #define INIT_STRUCT_PID { \ @@ -148,6 +167,7 @@ extern struct group_info init_groups; .static_prio = MAX_PRIO-20, \ .normal_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ + INIT_RCU_BOOST_PRIO \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ @@ -186,7 +206,8 @@ extern struct group_info init_groups; .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ - .pi_lock = __SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ + .posix_timer_list = NULL, \ + .pi_lock = RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .pids = { \ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ @@ -196,6 +217,8 @@ extern struct group_info init_groups; INIT_IDS \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ + INIT_PREEMPT_RCU_BOOST(tsk) \ + INIT_RW_OWNERS(tsk) \ } Index: linux-2.6.25.8-rt7/include/linux/rcupdate.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/rcupdate.h 2008-06-23 19:13:01.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/rcupdate.h 2008-06-23 19:13:02.000000000 -0400 @@ -245,5 +245,49 @@ extern long rcu_batches_completed_bh(voi extern void rcu_init(void); extern int rcu_needs_cpu(int cpu); +struct dentry; + +#ifdef CONFIG_PREEMPT_RCU_BOOST +extern void init_rcu_boost_late(void); +extern void rcu_boost_readers(void); +extern void rcu_unboost_readers(void); +extern void __rcu_preempt_boost(void); +#ifdef CONFIG_RCU_TRACE +extern int rcu_trace_boost_create(struct dentry *rcudir); +extern void rcu_trace_boost_destroy(void); +#endif /* CONFIG_RCU_TRACE */ +#define rcu_preempt_boost() /* cpp to avoid #include hell. */ \ + do { \ + if (unlikely(current->rcu_read_lock_nesting > 0)) \ + __rcu_preempt_boost(); \ + } while (0) +extern void __rcu_preempt_unboost(void); +#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */ +static inline void init_rcu_boost_late(void) +{ +} +static inline void rcu_preempt_boost(void) +{ +} +static inline void __rcu_preempt_unboost(void) +{ +} +static inline void rcu_boost_readers(void) +{ +} +static inline void rcu_unboost_readers(void) +{ +} +#ifdef CONFIG_RCU_TRACE +static inline int rcu_trace_boost_create(struct dentry *rcudir) +{ + return 0; +} +static inline void rcu_trace_boost_destroy(void) +{ +} +#endif /* CONFIG_RCU_TRACE */ +#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */ + #endif /* __KERNEL__ */ #endif /* __LINUX_RCUPDATE_H */ Index: linux-2.6.25.8-rt7/include/linux/sched.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/sched.h 2008-06-23 19:13:01.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/sched.h 2008-06-23 19:14:36.000000000 -0400 @@ -92,6 +92,28 @@ struct sched_param { #include +#ifdef CONFIG_PREEMPT +extern int kernel_preemption; +#else +# define kernel_preemption 0 +#endif +#ifdef CONFIG_PREEMPT_VOLUNTARY +extern int voluntary_preemption; +#else +# define voluntary_preemption 0 +#endif +#ifdef CONFIG_PREEMPT_SOFTIRQS +extern int softirq_preemption; +#else +# define softirq_preemption 0 +#endif + +#ifdef CONFIG_PREEMPT_HARDIRQS +extern int hardirq_preemption; +#else +# define hardirq_preemption 0 +#endif + struct mem_cgroup; struct exec_domain; struct futex_pi_state; @@ -160,6 +182,8 @@ print_cfs_rq(struct seq_file *m, int cpu } #endif +extern struct semaphore kernel_sem; + /* * Task state bitmask. NOTE! 
These bits are also * encoded in fs/proc/array.c: get_task_state(). @@ -171,16 +195,18 @@ print_cfs_rq(struct seq_file *m, int cpu * mistake. */ #define TASK_RUNNING 0 -#define TASK_INTERRUPTIBLE 1 -#define TASK_UNINTERRUPTIBLE 2 -#define __TASK_STOPPED 4 -#define __TASK_TRACED 8 +#define TASK_RUNNING_MUTEX 1 +#define TASK_INTERRUPTIBLE 2 +#define TASK_UNINTERRUPTIBLE 4 +#define __TASK_STOPPED 8 +#define __TASK_TRACED 16 /* in tsk->exit_state */ -#define EXIT_ZOMBIE 16 -#define EXIT_DEAD 32 +#define EXIT_ZOMBIE 32 +#define EXIT_DEAD 64 /* in tsk->state again */ -#define TASK_DEAD 64 -#define TASK_WAKEKILL 128 +#define TASK_NONINTERACTIVE 128 +#define TASK_DEAD 256 +#define TASK_WAKEKILL 512 /* Convenience macros for the sake of set_task_state */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) @@ -189,10 +215,12 @@ print_cfs_rq(struct seq_file *m, int cpu /* Convenience macros for the sake of wake_up */ #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) -#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) +#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED | \ + TASK_RUNNING_MUTEX) /* get_task_state() */ -#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ +#define TASK_REPORT (TASK_RUNNING | TASK_RUNNING_MUTEX | \ + TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ __TASK_TRACED) @@ -208,6 +236,28 @@ print_cfs_rq(struct seq_file *m, int cpu #define set_task_state(tsk, state_value) \ set_mb((tsk)->state, (state_value)) +// #define PREEMPT_DIRECT + +#ifdef CONFIG_X86_LOCAL_APIC +extern void nmi_show_all_regs(void); +#else +# define nmi_show_all_regs() do { } while (0) +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct exec_domain; + /* * set_current_state() includes a barrier so that the write of current->state * is correctly serialised wrt the caller's subsequent test of whether to @@ -246,6 +296,8 @@ extern asmlinkage void schedule_tail(str extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); +extern int runqueue_is_locked(void); + extern cpumask_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); @@ -269,6 +321,7 @@ static inline void show_state(void) } extern void show_regs(struct pt_regs *); +extern int irq_show_regs_callback(int cpu, struct pt_regs *regs); /* * TASK is a pointer to the task whose backtrace we want to see (or NULL for current @@ -289,6 +342,12 @@ extern void hrtick_resched(void); extern void sched_show_task(struct task_struct *p); +#ifdef CONFIG_GENERIC_HARDIRQS +extern int debug_direct_keyboard; +#else +# define debug_direct_keyboard 0 +#endif + #ifdef CONFIG_DETECT_SOFTLOCKUP extern void softlockup_tick(void); extern void spawn_softlockup_task(void); @@ -329,6 +388,11 @@ extern signed long schedule_timeout_inte extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); +/* + * This one can be called with interrupts disabled, only + * to be used by lowlevel arch code! 
+ */ +asmlinkage void __sched __schedule(void); struct nsproxy; struct user_namespace; @@ -557,6 +621,19 @@ struct signal_struct { #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ +#ifdef CONFIG_PREEMPT_RCU_BOOST +#define set_rcu_prio(p, prio) /* cpp to avoid #include hell */ \ + do { \ + (p)->rcu_prio = (prio); \ + } while (0) +#define get_rcu_prio(p) (p)->rcu_prio /* cpp to avoid #include hell */ +#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */ +static inline void set_rcu_prio(struct task_struct *p, int prio) +{ +} +#define get_rcu_prio(p) (MAX_PRIO) /* cpp to use MAX_PRIO before it's defined */ +#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */ + /* If true, all threads except ->group_exit_task have pending SIGKILL */ static inline int signal_group_exit(const struct signal_struct *sig) { @@ -891,8 +968,8 @@ struct sched_class { void (*task_new) (struct rq *rq, struct task_struct *p); void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask); - void (*join_domain)(struct rq *rq); - void (*leave_domain)(struct rq *rq); + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); void (*switched_from) (struct rq *this_rq, struct task_struct *task, int running); @@ -991,6 +1068,16 @@ struct sched_rt_entity { #endif }; +#ifdef CONFIG_PREEMPT_RT +struct rw_mutex; +struct reader_lock_struct { + struct rw_mutex *lock; + struct list_head list; + struct task_struct *task; + int count; +}; + +#endif struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ void *stack; @@ -1001,12 +1088,13 @@ struct task_struct { int lock_depth; /* BKL lock depth */ #ifdef CONFIG_SMP -#ifdef __ARCH_WANT_UNLOCKED_CTXSW int oncpu; #endif -#endif int prio, static_prio, normal_prio; +#ifdef CONFIG_PREEMPT_RCU_BOOST + int rcu_prio; +#endif const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; @@ -1041,6 +1129,12 @@ struct task_struct { #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; #endif +#ifdef CONFIG_PREEMPT_RCU_BOOST + struct rcu_boost_dat *rcub_rbdp; + enum rcu_boost_state rcub_state; + struct list_head rcub_entry; + unsigned long rcu_preempt_counter; +#endif struct list_head tasks; /* @@ -1104,6 +1198,8 @@ struct task_struct { unsigned long long it_sched_expires; struct list_head cpu_timers[3]; + struct task_struct* posix_timer_list; + /* process credentials */ uid_t uid,euid,suid,fsuid; gid_t gid,egid,sgid,fsgid; @@ -1169,7 +1265,7 @@ struct task_struct { spinlock_t alloc_lock; /* Protection of the PI data structures: */ - spinlock_t pi_lock; + raw_spinlock_t pi_lock; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task */ @@ -1182,6 +1278,7 @@ struct task_struct { /* mutex deadlock detection */ struct mutex_waiter *blocked_on; #endif + int pagefault_disabled; #ifdef CONFIG_TRACE_IRQFLAGS unsigned int irq_events; int hardirqs_enabled; @@ -1205,6 +1302,38 @@ struct task_struct { unsigned int lockdep_recursion; #endif +#define MAX_PREEMPT_TRACE 25 +#define MAX_RWLOCK_DEPTH 5 + +#ifdef CONFIG_PREEMPT_RT + int reader_lock_count; + struct reader_lock_struct owned_read_locks[MAX_RWLOCK_DEPTH]; +#endif + +#ifdef CONFIG_PREEMPT_TRACE + unsigned long preempt_trace_eip[MAX_PREEMPT_TRACE]; + unsigned long preempt_trace_parent_eip[MAX_PREEMPT_TRACE]; +#endif + +#define MAX_LOCK_STACK MAX_PREEMPT_TRACE +#ifdef CONFIG_DEBUG_PREEMPT + int lock_count; +# ifdef CONFIG_PREEMPT_RT + 
struct rt_mutex *owned_lock[MAX_LOCK_STACK]; +# endif +#endif +#ifdef CONFIG_DETECT_SOFTLOCKUP + unsigned long softlockup_count; /* Count to keep track how long the + * thread is in the kernel without + * sleeping. + */ +#endif + /* realtime bits */ + +#ifdef CONFIG_DEBUG_RT_MUTEXES + void *last_kernel_lock; +#endif + /* journalling filesystem info */ void *journal_info; @@ -1271,26 +1400,22 @@ struct task_struct { int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif +#ifdef CONFIG_PREEMPT_RT + /* + * Temporary hack, until we find a solution to + * handle printk in atomic operations. + */ + int in_printk; +#endif }; -/* - * Priority of a process goes from 0..MAX_PRIO-1, valid RT - * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH - * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority - * values are inverted: lower p->prio value means higher priority. - * - * The MAX_USER_RT_PRIO value allows the actual maximum - * RT priority to be separate from the value exported to - * user-space. This allows kernel threads to set their - * priority to a value higher than any user task. Note: - * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. - */ - -#define MAX_USER_RT_PRIO 100 -#define MAX_RT_PRIO MAX_USER_RT_PRIO +#ifdef CONFIG_PREEMPT_RT +# define set_printk_might_sleep(x) do { current->in_printk = x; } while(0) +#else +# define set_printk_might_sleep(x) do { } while(0) +#endif -#define MAX_PRIO (MAX_RT_PRIO + 40) -#define DEFAULT_PRIO (MAX_RT_PRIO + 20) +#include static inline int rt_prio(int prio) { @@ -1437,6 +1562,15 @@ extern struct pid *cad_pid; extern void free_task(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) +#ifdef CONFIG_PREEMPT_RT +extern void __put_task_struct_cb(struct rcu_head *rhp); + +static inline void put_task_struct(struct task_struct *t) +{ + if (atomic_dec_and_test(&t->usage)) + call_rcu(&t->rcu, __put_task_struct_cb); +} +#else extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) @@ -1444,6 +1578,7 @@ static inline void put_task_struct(struc if (atomic_dec_and_test(&t->usage)) __put_task_struct(t); } +#endif /* * Per process flags @@ -1454,6 +1589,7 @@ static inline void put_task_struct(struc #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#define PF_NOSCHED 0x00000020 /* Userspace does not expect scheduling */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ @@ -1461,6 +1597,7 @@ static inline void put_task_struct(struc #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ +#define PF_KMAP 0x00004000 /* this context has a kmap */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ @@ -1472,6 +1609,8 @@ static inline void put_task_struct(struc #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ +#define PF_SOFTIRQ 
0x04000000 /* softirq context */ +#define PF_HARDIRQ 0x08000000 /* hardirq context */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ @@ -1514,6 +1653,35 @@ static inline int set_cpus_allowed(struc extern unsigned long long sched_clock(void); +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline void sched_clock_init(void) +{ +} + +static inline u64 sched_clock_cpu(int cpu) +{ + return sched_clock(); +} + +static inline void sched_clock_tick(void) +{ +} + +static inline void sched_clock_idle_sleep_event(void) +{ +} + +static inline void sched_clock_idle_wakeup_event(u64 delta_ns) +{ +} +#else +extern void sched_clock_init(void); +extern u64 sched_clock_cpu(int cpu); +extern void sched_clock_tick(void); +extern void sched_clock_idle_sleep_event(void); +extern void sched_clock_idle_wakeup_event(u64 delta_ns); +#endif + /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): @@ -1566,9 +1734,14 @@ extern int sysctl_sched_rt_runtime; extern unsigned int sysctl_sched_compat_yield; +extern void task_setprio(struct task_struct *p, int prio); + #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); -extern void rt_mutex_setprio(struct task_struct *p, int prio); +static inline void rt_mutex_setprio(struct task_struct *p, int prio) +{ + task_setprio(p, prio); +} extern void rt_mutex_adjust_pi(struct task_struct *p); #else static inline int rt_mutex_getprio(struct task_struct *p) @@ -1590,6 +1763,7 @@ extern struct task_struct *curr_task(int extern void set_curr_task(int cpu, struct task_struct *p); void yield(void); +void __yield(void); /* * The default (Linux) execution domain. 
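With task_setprio() exported and rt_mutex_setprio() reduced to an inline wrapper around it (see the hunk above), both the rtmutex PI code and the RCU-boost code added later in this series change a task's effective priority through the same helper. The following is a minimal sketch of that call pattern, not part of the patch: reapply_pi_prio() is an invented name and the locking context is an assumption modelled on __rcu_preempt_unboost() in kernel/rcupreempt-boost.c below; rt_mutex_getprio(), task_setprio() and p->pi_lock come from the patch itself.

/*
 * Sketch only (not part of the patch): re-evaluate and apply the
 * priority a task is owed by priority inheritance.  This is the same
 * rt_mutex_getprio()/task_setprio() sequence that rt_mutex_setprio()
 * now wraps and that __rcu_preempt_unboost() performs when a boosted
 * RCU reader drops back to its normal priority.
 */
static void reapply_pi_prio(struct task_struct *p)
{
	unsigned long flags;
	int prio;

	spin_lock_irqsave(&p->pi_lock, flags);
	prio = rt_mutex_getprio(p);	/* priority owed to PI boosting */
	task_setprio(p, prio);		/* the path rt_mutex_setprio() now takes */
	spin_unlock_irqrestore(&p->pi_lock, flags);
}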
@@ -1661,6 +1835,9 @@ extern void do_timer(unsigned long ticks extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); +extern int wake_up_process_mutex(struct task_struct * tsk); +extern int wake_up_process_sync(struct task_struct * tsk); +extern int wake_up_process_mutex_sync(struct task_struct * tsk); extern void wake_up_new_task(struct task_struct *tsk, unsigned long clone_flags); #ifdef CONFIG_SMP @@ -1753,12 +1930,20 @@ extern struct mm_struct * mm_alloc(void) /* mmdrop drops the mm and the page tables */ extern void __mmdrop(struct mm_struct *); +extern void __mmdrop_delayed(struct mm_struct *); + static inline void mmdrop(struct mm_struct * mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } +static inline void mmdrop_delayed(struct mm_struct * mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + __mmdrop_delayed(mm); +} + /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); /* Grab a reference to a task's mm, if it is not already going away */ @@ -1931,6 +2116,11 @@ static inline void clear_tsk_need_resche clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } +static inline int test_tsk_need_resched(struct task_struct *tsk) +{ + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); +} + static inline int signal_pending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); @@ -1943,9 +2133,29 @@ static inline int fatal_signal_pending(s return signal_pending(p) && __fatal_signal_pending(p); } +static inline int _need_resched(void) +{ + return unlikely(test_tsk_need_resched(current)); +} + static inline int need_resched(void) { - return unlikely(test_thread_flag(TIF_NEED_RESCHED)); + return _need_resched(); +} + +static inline void set_tsk_need_resched_delayed(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_DELAYED); +} + +static inline void clear_tsk_need_resched_delayed(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_DELAYED); +} + +static inline int need_resched_delayed(void) +{ + return unlikely(test_thread_flag(TIF_NEED_RESCHED_DELAYED)); } /* @@ -1967,15 +2177,23 @@ static inline int cond_resched(void) return _cond_resched(); } #endif -extern int cond_resched_lock(spinlock_t * lock); +extern int __cond_resched_raw_spinlock(raw_spinlock_t *lock); +extern int __cond_resched_spinlock(spinlock_t *spinlock); + +#define cond_resched_lock(lock) \ + PICK_SPIN_OP_RET(__cond_resched_raw_spinlock, __cond_resched_spinlock,\ + lock) + extern int cond_resched_softirq(void); +extern int cond_resched_softirq_context(void); +extern int cond_resched_hardirq_context(void); /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPT, * but a general need for low latency) */ -static inline int spin_needbreak(spinlock_t *lock) +static inline int __raw_spin_needbreak(raw_spinlock_t *lock) { #ifdef CONFIG_PREEMPT return spin_is_contended(lock); @@ -1984,6 +2202,37 @@ static inline int spin_needbreak(spinloc #endif } +#ifdef CONFIG_PREEMPT_RT +static inline int __spin_needbreak(spinlock_t *lock) +{ + return lock->break_lock; +} +#else +static inline int __spin_needbreak(spinlock_t *lock) +{ + /* should never be call outside of RT */ + BUG(); + return 0; +} +#endif + +#define spin_needbreak(lock) \ + PICK_SPIN_OP_RET(__raw_spin_needbreak, __spin_needbreak, lock) + +static inline int softirq_need_resched(void) +{ + if (softirq_preemption 
&& (current->flags & PF_SOFTIRQ)) + return need_resched(); + return 0; +} + +static inline int hardirq_need_resched(void) +{ + if (hardirq_preemption && (current->flags & PF_HARDIRQ)) + return need_resched(); + return 0; +} + /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. @@ -2031,6 +2280,18 @@ static inline void arch_pick_mmap_layout } #endif +#ifdef CONFIG_TRACING +extern void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3); +#else +static inline void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ +} +#endif + extern long sched_setaffinity(pid_t pid, cpumask_t new_mask); extern long sched_getaffinity(pid_t pid, cpumask_t *mask); @@ -2094,6 +2355,12 @@ static inline void inc_syscw(struct task } #endif +#ifdef CONFIG_PREEMPT_TRACE +void print_preempt_trace(struct task_struct *tsk); +#else +# define print_preempt_trace(tsk) do { } while (0) +#endif + #ifdef CONFIG_SMP void migration_init(void); #else @@ -2106,6 +2373,15 @@ static inline void migration_init(void) #define TASK_SIZE_OF(tsk) TASK_SIZE #endif +#ifdef CONFIG_SMP +static inline int task_is_current(struct task_struct *task) +{ + return task->oncpu; +} +#endif + +#define TASK_STATE_TO_CHAR_STR "RMSDTtZX" + #endif /* __KERNEL__ */ #endif Index: linux-2.6.25.8-rt7/kernel/Kconfig.preempt =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/Kconfig.preempt 2008-06-23 19:13:01.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/Kconfig.preempt 2008-06-23 19:13:44.000000000 -0400 @@ -1,14 +1,13 @@ - choice - prompt "Preemption Model" - default PREEMPT_NONE + prompt "Preemption Mode" + default PREEMPT_RT config PREEMPT_NONE bool "No Forced Preemption (Server)" help - This is the traditional Linux preemption model, geared towards + This is the traditional Linux preemption model geared towards throughput. It will still provide good latencies most of the - time, but there are no guarantees and occasional longer delays + time but there are no guarantees and occasional long delays are possible. Select this option if you are building a kernel for a server or @@ -21,7 +20,7 @@ config PREEMPT_VOLUNTARY help This option reduces the latency of the kernel by adding more "explicit preemption points" to the kernel code. These new - preemption points have been selected to reduce the maximum + preemption points have been selected to minimize the maximum latency of rescheduling, providing faster application reactions, at the cost of slightly lower throughput. @@ -33,25 +32,95 @@ config PREEMPT_VOLUNTARY Select this if you are building a kernel for a desktop system. -config PREEMPT +config PREEMPT_DESKTOP bool "Preemptible Kernel (Low-Latency Desktop)" help This option reduces the latency of the kernel by making - all kernel code (that is not executing in a critical section) + all kernel code that is not executing in a critical section preemptible. This allows reaction to interactive events by permitting a low priority process to be preempted involuntarily even if it is in kernel mode executing a system call and would - otherwise not be about to reach a natural preemption point. - This allows applications to run more 'smoothly' even when the - system is under load, at the cost of slightly lower throughput - and a slight runtime overhead to kernel code. + otherwise not about to reach a preemption point. 
This allows + applications to run more 'smoothly' even when the system is + under load, at the cost of slightly lower throughput and a + slight runtime overhead to kernel code. + + (According to profiles, when this mode is selected then even + during kernel-intense workloads the system is in an immediately + preemptible state more than 50% of the time.) Select this if you are building a kernel for a desktop or embedded system with latency requirements in the milliseconds range. +config PREEMPT_RT + bool "Complete Preemption (Real-Time)" + select PREEMPT_SOFTIRQS + select PREEMPT_HARDIRQS + select PREEMPT_RCU + select RT_MUTEXES + help + This option further reduces the scheduling latency of the + kernel by replacing almost every spinlock used by the kernel + with preemptible mutexes and thus making all but the most + critical kernel code involuntarily preemptible. The remaining + handful of lowlevel non-preemptible codepaths are short and + have a deterministic latency of a couple of tens of + microseconds (depending on the hardware). This also allows + applications to run more 'smoothly' even when the system is + under load, at the cost of lower throughput and runtime + overhead to kernel code. + + (According to profiles, when this mode is selected then even + during kernel-intense workloads the system is in an immediately + preemptible state more than 95% of the time.) + + Select this if you are building a kernel for a desktop, + embedded or real-time system with guaranteed latency + requirements of 100 usecs or lower. + endchoice +config PREEMPT + bool + default y + depends on PREEMPT_DESKTOP || PREEMPT_RT + +config PREEMPT_SOFTIRQS + bool "Thread Softirqs" + default n +# depends on PREEMPT + help + This option reduces the latency of the kernel by 'threading' + soft interrupts. This means that all softirqs will execute + in softirqd's context. While this helps latency, it can also + reduce performance. + + The threading of softirqs can also be controlled via + /proc/sys/kernel/softirq_preemption runtime flag and the + softirq-preempt=0/1 boot-time option. + + Say N if you are unsure. + +config PREEMPT_HARDIRQS + bool "Thread Hardirqs" + default n + depends on !GENERIC_HARDIRQS_NO__DO_IRQ + select PREEMPT_SOFTIRQS + help + This option reduces the latency of the kernel by 'threading' + hardirqs. This means that all (or selected) hardirqs will run + in their own kernel thread context. While this helps latency, + this feature can also reduce performance. + + The threading of hardirqs can also be controlled via the + /proc/sys/kernel/hardirq_preemption runtime flag and the + hardirq-preempt=0/1 boot-time option. Per-irq threading can + be enabled/disabled via the /proc/irq///threaded + runtime flags. + + Say N if you are unsure. + config PREEMPT_RCU bool "Preemptible RCU" depends on PREEMPT @@ -66,6 +135,20 @@ config PREEMPT_RCU Say N if you are unsure. +config PREEMPT_RCU_BOOST + bool "Enable priority boosting of RCU read-side critical sections" + depends on PREEMPT_RCU + default y if PREEMPT_RT + help + This option permits priority boosting of RCU read-side critical + sections that have been preempted while an RT process is waiting + on a synchronize_rcu. + + An RCU thread is also created that periodically wakes up and + performs a synchronize_rcu to make sure that all readers eventually + complete, to prevent an indefinite delay of grace periods and + possible OOM problems.
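As background for the PREEMPT_RCU_BOOST help text above, a hedged illustration of the situation it addresses: a low-priority reader preempted inside rcu_read_lock()/rcu_read_unlock() stalls every grace period, so an RT caller of synchronize_rcu() can end up waiting behind it until the boost machinery added by this patch (__rcu_preempt_boost()/__rcu_preempt_unboost()) lends the reader the waiter's priority. The struct demo_node type and the counting function below are invented for illustration; the RCU and list APIs are the standard kernel ones.

#include <linux/list.h>
#include <linux/rcupdate.h>

/* Invented example type, for illustration only. */
struct demo_node {
	struct list_head list;
	int data;
};

static int demo_count_nodes(struct list_head *head)
{
	struct demo_node *n;
	int count = 0;

	rcu_read_lock();
	/*
	 * If an RT task preempts this low-priority reader here, no grace
	 * period can complete until the reader runs again.  With
	 * CONFIG_PREEMPT_RCU_BOOST the preempted reader registers itself
	 * (__rcu_preempt_boost()) and may be boosted to the priority of a
	 * waiting synchronize_rcu() caller.
	 */
	list_for_each_entry_rcu(n, head, list)
		count++;
	rcu_read_unlock();	/* outermost unlock calls __rcu_preempt_unboost() */

	return count;
}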
+ config RCU_TRACE bool "Enable tracing for RCU - currently stats in debugfs" depends on PREEMPT_RCU Index: linux-2.6.25.8-rt7/kernel/Makefile =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/Makefile 2008-06-23 19:13:01.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/Makefile 2008-06-23 19:14:35.000000000 -0400 @@ -7,14 +7,30 @@ obj-y = sched.o fork.o exec_domain.o sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o \ hrtimer.o rwsem.o nsproxy.o srcu.o \ - notifier.o ksysfs.o pm_qos_params.o + notifier.o ksysfs.o pm_qos_params.o sched_clock.o + +CFLAGS_REMOVE_sched.o = -mno-spe + +ifdef CONFIG_FTRACE +# Do not trace debug files and internal ftrace files +CFLAGS_REMOVE_lockdep.o = -pg +CFLAGS_REMOVE_lockdep_proc.o = -pg +CFLAGS_REMOVE_mutex-debug.o = -pg +CFLAGS_REMOVE_rtmutex-debug.o = -pg +CFLAGS_REMOVE_cgroup-debug.o = -pg +CFLAGS_REMOVE_sched_clock.o = -pg +CFLAGS_REMOVE_marker.o = -pg +endif obj-$(CONFIG_SYSCTL) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ +ifneq ($(CONFIG_PREEMPT_RT),y) +obj-y += mutex.o obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +endif obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o @@ -26,6 +42,7 @@ endif obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o +obj-$(CONFIG_PREEMPT_RT) += rt.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o @@ -56,9 +73,11 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o +obj-$(CONFIG_RWLOCK_TORTURE_TEST) += rwlock_torture.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o +obj-$(CONFIG_PREEMPT_RCU_BOOST) += rcupreempt-boost.o ifeq ($(CONFIG_PREEMPT_RCU),y) obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o endif @@ -68,6 +87,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayac obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_FTRACE) += trace/ +obj-$(CONFIG_TRACING) += trace/ +obj-$(CONFIG_SMP) += sched_cpupri.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is Index: linux-2.6.25.8-rt7/kernel/fork.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/fork.c 2008-06-23 19:13:01.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/fork.c 2008-06-23 19:14:28.000000000 -0400 @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +43,8 @@ #include #include #include +#include +#include #include #include #include @@ -73,6 +76,15 @@ DEFINE_PER_CPU(unsigned long, process_co __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +/* + * Delayed mmdrop. In the PREEMPT_RT case we + * dont want to do this from the scheduling + * context. 
+ */ +static DEFINE_PER_CPU(struct task_struct *, desched_task); + +static DEFINE_PER_CPU(struct list_head, delayed_drop_list); + int nr_processes(void) { int cpu; @@ -117,10 +129,13 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); -void __put_task_struct(struct task_struct *tsk) +#ifdef CONFIG_PREEMPT_RT +void __put_task_struct_cb(struct rcu_head *rhp) { + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + BUG_ON(atomic_read(&tsk->usage)); WARN_ON(!tsk->exit_state); - WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); security_task_free(tsk); @@ -132,8 +147,27 @@ void __put_task_struct(struct task_struc free_task(tsk); } +#else + +void __put_task_struct(struct task_struct *tsk) +{ + WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); + BUG_ON(atomic_read(&tsk->usage)); + WARN_ON(tsk == current); + + security_task_free(tsk); + free_uid(tsk->user); + put_group_info(tsk->group_info); + + if (!profile_handoff_task(tsk)) + free_task(tsk); +} +#endif + void __init fork_init(unsigned long mempages) { + int i; + #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES @@ -161,6 +195,9 @@ void __init fork_init(unsigned long memp init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; + + for (i = 0; i < NR_CPUS; i++) + INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i)); } static struct task_struct *dup_task_struct(struct task_struct *orig) @@ -226,6 +263,7 @@ static int dup_mmap(struct mm_struct *mm mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; + INIT_LIST_HEAD(&mm->delayed_drop); mm->free_area_cache = oldmm->mmap_base; mm->cached_hole_size = ~0UL; mm->map_count = 0; @@ -990,6 +1028,9 @@ static void rt_mutex_init_task(struct ta #ifdef CONFIG_RT_MUTEXES plist_head_init(&p->pi_waiters, &p->pi_lock); p->pi_blocked_on = NULL; +# ifdef CONFIG_DEBUG_RT_MUTEXES + p->last_kernel_lock = NULL; +# endif #endif } @@ -1041,7 +1082,7 @@ static struct task_struct *copy_process( rt_mutex_init_task(p); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif @@ -1079,7 +1120,13 @@ static struct task_struct *copy_process( #ifdef CONFIG_PREEMPT_RCU p->rcu_read_lock_nesting = 0; p->rcu_flipctr_idx = 0; -#endif /* #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_PREEMPT_RCU_BOOST + p->rcu_prio = MAX_PRIO; + p->rcub_rbdp = NULL; + p->rcub_state = RCU_BOOST_IDLE; + INIT_LIST_HEAD(&p->rcub_entry); +#endif +#endif /* CONFIG_PREEMPT_RCU */ p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); @@ -1114,7 +1161,7 @@ static struct task_struct *copy_process( INIT_LIST_HEAD(&p->cpu_timers[0]); INIT_LIST_HEAD(&p->cpu_timers[1]); INIT_LIST_HEAD(&p->cpu_timers[2]); - + p->posix_timer_list = NULL; p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; @@ -1154,6 +1201,7 @@ static struct task_struct *copy_process( p->hardirq_context = 0; p->softirq_context = 0; #endif + p->pagefault_disabled = 0; #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; @@ -1193,6 +1241,22 @@ static struct task_struct *copy_process( retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_io; +#ifdef CONFIG_DEBUG_PREEMPT + p->lock_count = 0; +#endif + +#ifdef 
CONFIG_PREEMPT_RT + p->reader_lock_count = 0; + { + int i; + for (i = 0; i < MAX_RWLOCK_DEPTH; i++) { + INIT_LIST_HEAD(&p->owned_read_locks[i].list); + p->owned_read_locks[i].count = 0; + p->owned_read_locks[i].lock = NULL; + p->owned_read_locks[i].task = p; + } + } +#endif if (pid != &init_struct_pid) { retval = -ENOMEM; @@ -1277,11 +1341,13 @@ static struct task_struct *copy_process( * to ensure it is on a valid CPU (and if not, just force it back to * parent's CPU). This avoids alot of nasty races. */ + preempt_disable(); p->cpus_allowed = current->cpus_allowed; p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || !cpu_online(task_cpu(p)))) set_task_cpu(p, smp_processor_id()); + preempt_enable(); /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) @@ -1344,7 +1410,9 @@ static struct task_struct *copy_process( attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail_rcu(&p->tasks, &init_task.tasks); + preempt_disable(); __get_cpu_var(process_counts)++; + preempt_enable(); } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; @@ -1788,3 +1856,124 @@ bad_unshare_cleanup_thread: bad_unshare_out: return err; } + +static int mmdrop_complete(void) +{ + struct list_head *head; + int ret = 0; + + head = &get_cpu_var(delayed_drop_list); + while (!list_empty(head)) { + struct mm_struct *mm = list_entry(head->next, + struct mm_struct, delayed_drop); + list_del(&mm->delayed_drop); + put_cpu_var(delayed_drop_list); + + __mmdrop(mm); + ret = 1; + + head = &get_cpu_var(delayed_drop_list); + } + put_cpu_var(delayed_drop_list); + + return ret; +} + +/* + * We dont want to do complex work from the scheduler, thus + * we delay the work to a per-CPU worker thread: + */ +void __mmdrop_delayed(struct mm_struct *mm) +{ + struct task_struct *desched_task; + struct list_head *head; + + head = &get_cpu_var(delayed_drop_list); + list_add_tail(&mm->delayed_drop, head); + desched_task = __get_cpu_var(desched_task); + if (desched_task) + wake_up_process(desched_task); + put_cpu_var(delayed_drop_list); +} + +static int desched_thread(void * __bind_cpu) +{ + set_user_nice(current, -10); + current->flags |= PF_NOFREEZE | PF_SOFTIRQ; + + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + + if (mmdrop_complete()) + continue; + schedule(); + + /* + * This must be called from time to time on ia64, and is a + * no-op on other archs. Used to be in cpu_idle(), but with + * the new -rt semantics it can't stay there. + */ + check_pgt_cache(); + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int __devinit cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct task_struct *p; + + switch (action) { + case CPU_UP_PREPARE: + + BUG_ON(per_cpu(desched_task, hotcpu)); + INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu)); + p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu); + if (IS_ERR(p)) { + printk("desched_thread for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + per_cpu(desched_task, hotcpu) = p; + kthread_bind(p, hotcpu); + break; + case CPU_ONLINE: + + wake_up_process(per_cpu(desched_task, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + + /* Unbind so it can run. Fall thru. 
*/ + kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id()); + case CPU_DEAD: + + p = per_cpu(desched_task, hotcpu); + per_cpu(desched_task, hotcpu) = NULL; + kthread_stop(p); + takeover_tasklets(hotcpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +__init int spawn_desched_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + return 0; +} + Index: linux-2.6.25.8-rt7/kernel/rcupreempt-boost.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/rcupreempt-boost.c 2008-06-23 19:13:48.000000000 -0400 @@ -0,0 +1,588 @@ +/* + * Read-Copy Update preempt priority boosting + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright Red Hat Inc, 2007 + * + * Authors: Steven Rostedt + * + * Based on the original work by Paul McKenney . 
+ * + */ +#include +#include +#include +#include +#include +#include +#include + +DEFINE_RAW_SPINLOCK(rcu_boost_wake_lock); +static int rcu_boost_prio = MAX_PRIO; /* Prio to set preempted RCU readers */ +static long rcu_boost_counter; /* used to keep track of who boosted */ +static int rcu_preempt_thread_secs = 3; /* Seconds between waking rcupreemptd thread */ + +struct rcu_boost_dat { + raw_spinlock_t rbs_lock; /* Sync changes to this struct */ + int rbs_prio; /* CPU copy of rcu_boost_prio */ + struct list_head rbs_toboost; /* Preempted RCU readers */ + struct list_head rbs_boosted; /* RCU readers that have been boosted */ +#ifdef CONFIG_RCU_TRACE + /* The rest are for statistics */ + unsigned long rbs_stat_task_boost_called; + unsigned long rbs_stat_task_boosted; + unsigned long rbs_stat_boost_called; + unsigned long rbs_stat_try_boost; + unsigned long rbs_stat_boosted; + unsigned long rbs_stat_unboost_called; + unsigned long rbs_stat_unboosted; + unsigned long rbs_stat_try_boost_readers; + unsigned long rbs_stat_boost_readers; + unsigned long rbs_stat_try_unboost_readers; + unsigned long rbs_stat_unboost_readers; + unsigned long rbs_stat_over_taken; +#endif /* CONFIG_RCU_TRACE */ +}; + +static DEFINE_PER_CPU(struct rcu_boost_dat, rcu_boost_data); +#define RCU_BOOST_ME &__get_cpu_var(rcu_boost_data) + +#ifdef CONFIG_RCU_TRACE + +#define RCUPREEMPT_BOOST_TRACE_BUF_SIZE 4096 +static char rcupreempt_boost_trace_buf[RCUPREEMPT_BOOST_TRACE_BUF_SIZE]; + +static ssize_t rcuboost_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + static DEFINE_MUTEX(mutex); + int cnt = 0; + int cpu; + struct rcu_boost_dat *rbd; + ssize_t bcount; + unsigned long task_boost_called = 0; + unsigned long task_boosted = 0; + unsigned long boost_called = 0; + unsigned long try_boost = 0; + unsigned long boosted = 0; + unsigned long unboost_called = 0; + unsigned long unboosted = 0; + unsigned long try_boost_readers = 0; + unsigned long boost_readers = 0; + unsigned long try_unboost_readers = 0; + unsigned long unboost_readers = 0; + unsigned long over_taken = 0; + + mutex_lock(&mutex); + + for_each_online_cpu(cpu) { + rbd = &per_cpu(rcu_boost_data, cpu); + + task_boost_called += rbd->rbs_stat_task_boost_called; + task_boosted += rbd->rbs_stat_task_boosted; + boost_called += rbd->rbs_stat_boost_called; + try_boost += rbd->rbs_stat_try_boost; + boosted += rbd->rbs_stat_boosted; + unboost_called += rbd->rbs_stat_unboost_called; + unboosted += rbd->rbs_stat_unboosted; + try_boost_readers += rbd->rbs_stat_try_boost_readers; + boost_readers += rbd->rbs_stat_boost_readers; + try_unboost_readers += rbd->rbs_stat_try_boost_readers; + unboost_readers += rbd->rbs_stat_boost_readers; + over_taken += rbd->rbs_stat_over_taken; + } + + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "task_boost_called = %ld\n", + task_boost_called); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "task_boosted = %ld\n", + task_boosted); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "boost_called = %ld\n", + boost_called); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "try_boost = %ld\n", + try_boost); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "boosted = %ld\n", + boosted); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "unboost_called 
= %ld\n", + unboost_called); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "unboosted = %ld\n", + unboosted); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "try_boost_readers = %ld\n", + try_boost_readers); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "boost_readers = %ld\n", + boost_readers); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "try_unboost_readers = %ld\n", + try_unboost_readers); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "unboost_readers = %ld\n", + unboost_readers); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "over_taken = %ld\n", + over_taken); + cnt += snprintf(&rcupreempt_boost_trace_buf[cnt], + RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt, + "rcu_boost_prio = %d\n", + rcu_boost_prio); + bcount = simple_read_from_buffer(buffer, count, ppos, + rcupreempt_boost_trace_buf, strlen(rcupreempt_boost_trace_buf)); + mutex_unlock(&mutex); + + return bcount; +} + +static struct file_operations rcuboost_fops = { + .read = rcuboost_read, +}; + +static struct dentry *rcuboostdir; +int rcu_trace_boost_create(struct dentry *rcudir) +{ + rcuboostdir = debugfs_create_file("rcuboost", 0444, rcudir, + NULL, &rcuboost_fops); + if (!rcuboostdir) + return 0; + + return 1; +} +EXPORT_SYMBOL_GPL(rcu_trace_boost_create); + +void rcu_trace_boost_destroy(void) +{ + if (rcuboostdir) + debugfs_remove(rcuboostdir); + rcuboostdir = NULL; +} +EXPORT_SYMBOL_GPL(rcu_trace_boost_destroy); + +#define RCU_BOOST_TRACE_FUNC_DECL(type) \ + static void rcu_trace_boost_##type(struct rcu_boost_dat *rbd) \ + { \ + rbd->rbs_stat_##type++; \ + } +RCU_BOOST_TRACE_FUNC_DECL(task_boost_called) +RCU_BOOST_TRACE_FUNC_DECL(task_boosted) +RCU_BOOST_TRACE_FUNC_DECL(boost_called) +RCU_BOOST_TRACE_FUNC_DECL(try_boost) +RCU_BOOST_TRACE_FUNC_DECL(boosted) +RCU_BOOST_TRACE_FUNC_DECL(unboost_called) +RCU_BOOST_TRACE_FUNC_DECL(unboosted) +RCU_BOOST_TRACE_FUNC_DECL(try_boost_readers) +RCU_BOOST_TRACE_FUNC_DECL(boost_readers) +RCU_BOOST_TRACE_FUNC_DECL(try_unboost_readers) +RCU_BOOST_TRACE_FUNC_DECL(unboost_readers) +RCU_BOOST_TRACE_FUNC_DECL(over_taken) +#else /* CONFIG_RCU_TRACE */ +/* These were created by the above macro "RCU_BOOST_TRACE_FUNC_DECL" */ +# define rcu_trace_boost_task_boost_called(rbd) do { } while (0) +# define rcu_trace_boost_task_boosted(rbd) do { } while (0) +# define rcu_trace_boost_boost_called(rbd) do { } while (0) +# define rcu_trace_boost_try_boost(rbd) do { } while (0) +# define rcu_trace_boost_boosted(rbd) do { } while (0) +# define rcu_trace_boost_unboost_called(rbd) do { } while (0) +# define rcu_trace_boost_unboosted(rbd) do { } while (0) +# define rcu_trace_boost_try_boost_readers(rbd) do { } while (0) +# define rcu_trace_boost_boost_readers(rbd) do { } while (0) +# define rcu_trace_boost_try_unboost_readers(rbd) do { } while (0) +# define rcu_trace_boost_unboost_readers(rbd) do { } while (0) +# define rcu_trace_boost_over_taken(rbd) do { } while (0) +#endif /* CONFIG_RCU_TRACE */ + +static inline int rcu_is_boosted(struct task_struct *task) +{ + return !list_empty(&task->rcub_entry); +} + +/* + * Helper function to boost a task's prio. 
+ */ +static void rcu_boost_task(struct task_struct *task) +{ + WARN_ON(!irqs_disabled()); + WARN_ON_SMP(!spin_is_locked(&task->pi_lock)); + + rcu_trace_boost_task_boost_called(RCU_BOOST_ME); + + if (task->rcu_prio < task->prio) { + rcu_trace_boost_task_boosted(RCU_BOOST_ME); + task_setprio(task, task->rcu_prio); + } +} + +/** + * __rcu_preepmt_boost - Called by sleeping RCU readers. + * + * When the RCU read-side critical section is preempted + * (or schedules out due to RT mutex) + * it places itself onto a list to notify that it is sleeping + * while holding a RCU read lock. If there is already a + * synchronize_rcu happening, then it will increase its + * priority (if necessary). + */ +void __rcu_preempt_boost(void) +{ + struct task_struct *curr = current; + struct rcu_boost_dat *rbd; + int prio; + unsigned long flags; + + WARN_ON(!current->rcu_read_lock_nesting); + + rcu_trace_boost_boost_called(RCU_BOOST_ME); + + /* check to see if we are already boosted */ + if (unlikely(rcu_is_boosted(curr))) + return; + + /* + * To keep us from preempting between grabing + * the rbd and locking it, we use local_irq_save + */ + local_irq_save(flags); + rbd = &__get_cpu_var(rcu_boost_data); + spin_lock(&rbd->rbs_lock); + + spin_lock(&curr->pi_lock); + + curr->rcub_rbdp = rbd; + + rcu_trace_boost_try_boost(rbd); + + prio = rt_mutex_getprio(curr); + + if (list_empty(&curr->rcub_entry)) + list_add_tail(&curr->rcub_entry, &rbd->rbs_toboost); + if (prio <= rbd->rbs_prio) + goto out; + + rcu_trace_boost_boosted(curr->rcub_rbdp); + + set_rcu_prio(curr, rbd->rbs_prio); + rcu_boost_task(curr); + + out: + spin_unlock(&curr->pi_lock); + spin_unlock_irqrestore(&rbd->rbs_lock, flags); +} + +/** + * __rcu_preempt_unboost - called when releasing the RCU read lock + * + * When releasing the RCU read lock, a check is made to see if + * the task was preempted. If it was, it removes itself from the + * RCU data lists and if necessary, sets its priority back to + * normal. + */ +void __rcu_preempt_unboost(void) +{ + struct task_struct *curr = current; + struct rcu_boost_dat *rbd; + int prio; + unsigned long flags; + + rcu_trace_boost_unboost_called(RCU_BOOST_ME); + + /* if not boosted, then ignore */ + if (likely(!rcu_is_boosted(curr))) + return; + + /* + * Need to be very careful with NMIs. + * If we take the lock and an NMI comes in + * and it may try to unboost us if curr->rcub_rbdp + * is still set. So we zero it before grabbing the lock. + * But this also means that we might be boosted again + * so the boosting code needs to be aware of this. + */ + rbd = curr->rcub_rbdp; + curr->rcub_rbdp = NULL; + + /* + * Now an NMI might have came in after we grab + * the below lock. This check makes sure that + * the NMI doesn't try grabbing the lock + * while we already have it. + */ + if (unlikely(!rbd)) + return; + + spin_lock_irqsave(&rbd->rbs_lock, flags); + /* + * It is still possible that an NMI came in + * between the "is_boosted" check and setting + * the rcu_rbdp to NULL. This would mean that + * the NMI already dequeued us. 
+ */ + if (unlikely(!rcu_is_boosted(curr))) + goto out; + + list_del_init(&curr->rcub_entry); + + rcu_trace_boost_unboosted(rbd); + + set_rcu_prio(curr, MAX_PRIO); + + spin_lock(&curr->pi_lock); + prio = rt_mutex_getprio(curr); + task_setprio(curr, prio); + + curr->rcub_rbdp = NULL; + + spin_unlock(&curr->pi_lock); + out: + spin_unlock_irqrestore(&rbd->rbs_lock, flags); +} + +/* + * For each rcu_boost_dat structure, update all the tasks that + * are on the lists to the priority of the caller of + * synchronize_rcu. + */ +static int __rcu_boost_readers(struct rcu_boost_dat *rbd, int prio, unsigned long flags) +{ + struct task_struct *curr = current; + struct task_struct *p; + + spin_lock(&rbd->rbs_lock); + + rbd->rbs_prio = prio; + + /* + * Move the already boosted readers onto the list and reboost + * them. + */ + list_splice_init(&rbd->rbs_boosted, + &rbd->rbs_toboost); + + while (!list_empty(&rbd->rbs_toboost)) { + p = list_entry(rbd->rbs_toboost.next, + struct task_struct, rcub_entry); + list_move_tail(&p->rcub_entry, + &rbd->rbs_boosted); + set_rcu_prio(p, prio); + spin_lock(&p->pi_lock); + rcu_boost_task(p); + spin_unlock(&p->pi_lock); + + /* + * Now we release the lock to allow for a higher + * priority task to come in and boost the readers + * even higher. Or simply to let a higher priority + * task to run now. + */ + spin_unlock(&rbd->rbs_lock); + spin_unlock_irqrestore(&rcu_boost_wake_lock, flags); + + cpu_relax(); + spin_lock_irqsave(&rcu_boost_wake_lock, flags); + /* + * Another task may have taken over. + */ + if (curr->rcu_preempt_counter != rcu_boost_counter) { + rcu_trace_boost_over_taken(rbd); + return 1; + } + + spin_lock(&rbd->rbs_lock); + } + + spin_unlock(&rbd->rbs_lock); + + return 0; +} + +/** + * rcu_boost_readers - called by synchronize_rcu to boost sleeping RCU readers. + * + * This function iterates over all the per_cpu rcu_boost_data descriptors + * and boosts any sleeping (or slept) RCU readers. + */ +void rcu_boost_readers(void) +{ + struct task_struct *curr = current; + struct rcu_boost_dat *rbd; + unsigned long flags; + int prio; + int cpu; + int ret; + + spin_lock_irqsave(&rcu_boost_wake_lock, flags); + + prio = rt_mutex_getprio(curr); + + rcu_trace_boost_try_boost_readers(RCU_BOOST_ME); + + if (prio >= rcu_boost_prio) { + /* already boosted */ + spin_unlock_irqrestore(&rcu_boost_wake_lock, flags); + return; + } + + rcu_boost_prio = prio; + + rcu_trace_boost_boost_readers(RCU_BOOST_ME); + + /* Flag that we are the one to unboost */ + curr->rcu_preempt_counter = ++rcu_boost_counter; + + for_each_online_cpu(cpu) { + rbd = &per_cpu(rcu_boost_data, cpu); + ret = __rcu_boost_readers(rbd, prio, flags); + if (ret) + break; + } + + spin_unlock_irqrestore(&rcu_boost_wake_lock, flags); + +} + +/** + * rcu_unboost_readers - set the boost level back to normal. + * + * This function DOES NOT change the priority of any RCU reader + * that was boosted. The RCU readers do that when they release + * the RCU lock. This function only sets the global + * rcu_boost_prio to MAX_PRIO so that new RCU readers that sleep + * do not increase their priority. 
+ */ +void rcu_unboost_readers(void) +{ + struct rcu_boost_dat *rbd; + unsigned long flags; + int cpu; + + spin_lock_irqsave(&rcu_boost_wake_lock, flags); + + rcu_trace_boost_try_unboost_readers(RCU_BOOST_ME); + + if (current->rcu_preempt_counter != rcu_boost_counter) + goto out; + + rcu_trace_boost_unboost_readers(RCU_BOOST_ME); + + /* + * We could also put in something that + * would allow other synchronize_rcu callers + * of lower priority that are still waiting + * to boost the prio. + */ + rcu_boost_prio = MAX_PRIO; + + for_each_online_cpu(cpu) { + rbd = &per_cpu(rcu_boost_data, cpu); + + spin_lock(&rbd->rbs_lock); + rbd->rbs_prio = rcu_boost_prio; + spin_unlock(&rbd->rbs_lock); + } + + out: + spin_unlock_irqrestore(&rcu_boost_wake_lock, flags); +} + +/* + * The krcupreemptd wakes up every "rcu_preempt_thread_secs" + * seconds at the minimum priority of 1 to do a + * synchronize_rcu. This ensures that grace periods finish + * and that we do not starve the system. If there are RT + * tasks above priority 1 that are hogging the system and + * preventing release of memory, then its the fault of the + * system designer running RT tasks too aggressively and the + * system is flawed regardless. + */ +static int krcupreemptd(void *data) +{ + struct sched_param param = { .sched_priority = 1 }; + int ret; + int prio; + + ret = sched_setscheduler(current, SCHED_FIFO, ¶m); + printk("krcupreemptd setsched %d\n", ret); + prio = current->prio; + printk(" prio = %d\n", prio); + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + schedule_timeout(rcu_preempt_thread_secs * HZ); + + __set_current_state(TASK_RUNNING); + if (prio != current->prio) { + prio = current->prio; + printk("krcupreemptd new prio is %d??\n",prio); + } + + synchronize_rcu(); + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +int __init rcu_preempt_boost_init(void) +{ + struct rcu_boost_dat *rbd; + int cpu; + + for_each_possible_cpu(cpu) { + rbd = &per_cpu(rcu_boost_data, cpu); + + spin_lock_init(&rbd->rbs_lock); + rbd->rbs_prio = MAX_PRIO; + INIT_LIST_HEAD(&rbd->rbs_toboost); + INIT_LIST_HEAD(&rbd->rbs_boosted); + } + + return 0; +} + +static int __init rcu_preempt_start_krcupreemptd(void) +{ + struct task_struct *p; + + p = kthread_create(krcupreemptd, NULL, + "krcupreemptd"); + + if (IS_ERR(p)) { + printk("krcupreemptd failed\n"); + return NOTIFY_BAD; + } + wake_up_process(p); + + return 0; +} + +__initcall(rcu_preempt_start_krcupreemptd); Index: linux-2.6.25.8-rt7/kernel/rcupreempt.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/rcupreempt.c 2008-06-23 19:13:01.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/rcupreempt.c 2008-06-23 19:13:48.000000000 -0400 @@ -79,7 +79,7 @@ */ #define GP_STAGES 2 struct rcu_data { - spinlock_t lock; /* Protect rcu_data fields. */ + raw_spinlock_t lock; /* Protect rcu_data fields. */ long completed; /* Number of last completed batch. */ int waitlistcount; struct tasklet_struct rcu_tasklet; @@ -132,7 +132,7 @@ enum rcu_try_flip_states { }; struct rcu_ctrlblk { - spinlock_t fliplock; /* Protect state-machine transitions. */ + raw_spinlock_t fliplock; /* Protect state-machine transitions. */ long completed; /* Number of last completed batch. 
*/ enum rcu_try_flip_states rcu_try_flip_state; /* The current state of the rcu state machine */ @@ -140,7 +140,7 @@ struct rcu_ctrlblk { static DEFINE_PER_CPU(struct rcu_data, rcu_data); static struct rcu_ctrlblk rcu_ctrlblk = { - .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), + .fliplock = RAW_SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), .completed = 0, .rcu_try_flip_state = rcu_try_flip_idle_state, }; @@ -351,6 +351,8 @@ void __rcu_read_unlock(void) ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--; local_irq_restore(flags); + + __rcu_preempt_unboost(); } } EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -948,7 +950,7 @@ void __devinit rcu_online_cpu(int cpu) #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_process_callbacks(struct softirq_action *unused) +void rcu_process_callbacks(struct softirq_action *unused) { unsigned long flags; struct rcu_head *next, *list; @@ -1126,6 +1128,8 @@ void __init __rcu_init(void) rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); + + rcu_preempt_boost_init(); } /* Index: linux-2.6.25.8-rt7/kernel/rtmutex.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/rtmutex.c 2008-06-23 19:13:01.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/rtmutex.c 2008-06-23 19:14:38.000000000 -0400 @@ -8,12 +8,19 @@ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen * + * Adaptive Spinlocks: + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich, + * and Peter Morreale, + * Adaptive Spinlocks simplification: + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt + * * See Documentation/rt-mutex-design.txt for details. */ #include #include #include #include +#include #include "rtmutex_common.h" @@ -80,6 +87,7 @@ static void fixup_rt_mutex_waiters(struc */ #if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) # define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) +# define rt_rwlock_cmpxchg(rwm,c,n) (cmpxchg(&(rwm)->owner, c, n) == c) static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) { unsigned long owner, *p = (unsigned long *) &lock->owner; @@ -88,14 +96,59 @@ static inline void mark_rt_mutex_waiters owner = *p; } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); } +#ifdef CONFIG_PREEMPT_RT +static inline void mark_rt_rwlock_check(struct rw_mutex *rwm) +{ + unsigned long owner, *p = (unsigned long *) &rwm->owner; + + do { + owner = *p; + } while (cmpxchg(p, owner, owner | RT_RWLOCK_CHECK) != owner); +} +#endif /* CONFIG_PREEMPT_RT */ #else # define rt_mutex_cmpxchg(l,c,n) (0) +# define rt_rwlock_cmpxchg(l,c,n) ({ (void)c; (void)n; 0; }) static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); } +#ifdef CONFIG_PREEMPT_RT +static inline void mark_rt_rwlock_check(struct rw_mutex *rwm) +{ + rwm->owner = (struct task_struct *) + ((unsigned long)rwm->owner | RT_RWLOCK_CHECK); +} +#endif /* CONFIG_PREEMPT_RT */ +#endif + +#ifdef CONFIG_PREEMPT_RT +static inline int task_is_reader(struct task_struct *task) +{ + return task == RT_RW_READER; +} +#else +static inline int task_is_reader(struct task_struct *task) { return 0; } +#endif + +int pi_initialized; + +/* + * we initialize the wait_list runtime. (Could be done build-time and/or + * boot-time.) 
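+ *
+ * The "!lock->wait_list.prio_list.prev" test below treats a NULL prev
+ * pointer as "never initialized", which is typically what a static
+ * initializer leaves zeroed, so statically defined locks get their
+ * plist set up on their first trip through a slow path.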
+ */ +static inline void init_lists(struct rt_mutex *lock) +{ + if (unlikely(!lock->wait_list.prio_list.prev)) { + plist_head_init(&lock->wait_list, &lock->wait_lock); +#ifdef CONFIG_DEBUG_RT_MUTEXES + pi_initialized++; #endif + } +} + +static int rt_mutex_get_readers_prio(struct task_struct *task, int prio); /* * Calculate task priority from the waiter list priority @@ -105,11 +158,14 @@ static inline void mark_rt_mutex_waiters */ int rt_mutex_getprio(struct task_struct *task) { + int prio = min(task->normal_prio, get_rcu_prio(task)); + + prio = rt_mutex_get_readers_prio(task, prio); + if (likely(!task_has_pi_waiters(task))) - return task->normal_prio; + return prio; - return min(task_top_pi_waiter(task)->pi_list_entry.prio, - task->normal_prio); + return min(task_top_pi_waiter(task)->pi_list_entry.prio, prio); } /* @@ -148,6 +204,11 @@ static void rt_mutex_adjust_prio(struct */ int max_lock_depth = 1024; +static int rt_mutex_adjust_readers(struct rt_mutex *orig_lock, + struct rt_mutex_waiter *orig_waiter, + struct task_struct *top_task, + struct rt_mutex *lock, + int recursion_depth); /* * Adjust the priority chain. Also used for deadlock detection. * Decreases task's usage by one - may thus free the task. @@ -157,7 +218,8 @@ static int rt_mutex_adjust_prio_chain(st int deadlock_detect, struct rt_mutex *orig_lock, struct rt_mutex_waiter *orig_waiter, - struct task_struct *top_task) + struct task_struct *top_task, + int recursion_depth) { struct rt_mutex *lock; struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; @@ -191,7 +253,7 @@ static int rt_mutex_adjust_prio_chain(st return deadlock_detect ? -EDEADLK : 0; } - retry: + retry: /* * Task can not go away as we did a get_task() before ! */ @@ -253,13 +315,25 @@ static int rt_mutex_adjust_prio_chain(st plist_add(&waiter->list_entry, &lock->wait_list); /* Release the task */ - spin_unlock_irqrestore(&task->pi_lock, flags); + spin_unlock(&task->pi_lock); put_task_struct(task); /* Grab the next task */ task = rt_mutex_owner(lock); + + /* + * Readers are special. We may need to boost more than one owner. 
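+ *
+ * Worked example (lower number == higher priority): a prio-20 writer
+ * blocks on an rwlock held for read by two prio-120 tasks. The lock
+ * owner is the RT_RW_READER marker rather than a real task, so instead
+ * of boosting a single owner we branch into rt_mutex_adjust_readers(),
+ * which walks the rwlock's reader list and boosts each holder; both
+ * readers then run at prio 20 until they drop their read locks.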
+ */ + if (task_is_reader(task)) { + ret = rt_mutex_adjust_readers(orig_lock, orig_waiter, + top_task, lock, + recursion_depth); + spin_unlock_irqrestore(&lock->wait_lock, flags); + goto out; + } + get_task_struct(task); - spin_lock_irqsave(&task->pi_lock, flags); + spin_lock(&task->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { /* Boost the owner */ @@ -277,10 +351,10 @@ static int rt_mutex_adjust_prio_chain(st __rt_mutex_adjust_prio(task); } - spin_unlock_irqrestore(&task->pi_lock, flags); + spin_unlock(&task->pi_lock); top_waiter = rt_mutex_top_waiter(lock); - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); if (!detect_deadlock && waiter != top_waiter) goto out_put_task; @@ -291,7 +365,7 @@ static int rt_mutex_adjust_prio_chain(st spin_unlock_irqrestore(&task->pi_lock, flags); out_put_task: put_task_struct(task); - + out: return ret; } @@ -300,11 +374,10 @@ static int rt_mutex_adjust_prio_chain(st * assigned pending owner [which might not have taken the * lock yet]: */ -static inline int try_to_steal_lock(struct rt_mutex *lock) +static inline int try_to_steal_lock(struct rt_mutex *lock, int mode) { struct task_struct *pendowner = rt_mutex_owner(lock); struct rt_mutex_waiter *next; - unsigned long flags; if (!rt_mutex_owner_pending(lock)) return 0; @@ -312,9 +385,11 @@ static inline int try_to_steal_lock(stru if (pendowner == current) return 1; - spin_lock_irqsave(&pendowner->pi_lock, flags); - if (current->prio >= pendowner->prio) { - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + WARN_ON(task_is_reader(rt_mutex_owner(lock))); + + spin_lock(&pendowner->pi_lock); + if (!lock_is_stealable(pendowner, mode)) { + spin_unlock(&pendowner->pi_lock); return 0; } @@ -324,7 +399,7 @@ static inline int try_to_steal_lock(stru * priority. */ if (likely(!rt_mutex_has_waiters(lock))) { - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + spin_unlock(&pendowner->pi_lock); return 1; } @@ -332,7 +407,7 @@ static inline int try_to_steal_lock(stru next = rt_mutex_top_waiter(lock); plist_del(&next->pi_list_entry, &pendowner->pi_waiters); __rt_mutex_adjust_prio(pendowner); - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + spin_unlock(&pendowner->pi_lock); /* * We are going to steal the lock and a waiter was @@ -349,10 +424,10 @@ static inline int try_to_steal_lock(stru * might be current: */ if (likely(next->task != current)) { - spin_lock_irqsave(¤t->pi_lock, flags); + spin_lock(¤t->pi_lock); plist_add(&next->pi_list_entry, ¤t->pi_waiters); __rt_mutex_adjust_prio(current); - spin_unlock_irqrestore(¤t->pi_lock, flags); + spin_unlock(¤t->pi_lock); } return 1; } @@ -366,7 +441,7 @@ static inline int try_to_steal_lock(stru * * Must be called with lock->wait_lock held. */ -static int try_to_take_rt_mutex(struct rt_mutex *lock) +static int do_try_to_take_rt_mutex(struct rt_mutex *lock, int mode) { /* * We have to be careful here if the atomic speedups are @@ -389,7 +464,7 @@ static int try_to_take_rt_mutex(struct r */ mark_rt_mutex_waiters(lock); - if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) + if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, mode)) return 0; /* We got the lock. */ @@ -402,6 +477,11 @@ static int try_to_take_rt_mutex(struct r return 1; } +static inline int try_to_take_rt_mutex(struct rt_mutex *lock) +{ + return do_try_to_take_rt_mutex(lock, STEAL_NORMAL); +} + /* * Task blocks on lock. 
* @@ -411,14 +491,13 @@ static int try_to_take_rt_mutex(struct r */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, - int detect_deadlock) + int detect_deadlock, unsigned long flags) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; - unsigned long flags; int chain_walk = 0, res; - spin_lock_irqsave(¤t->pi_lock, flags); + spin_lock(¤t->pi_lock); __rt_mutex_adjust_prio(current); waiter->task = current; waiter->lock = lock; @@ -432,22 +511,29 @@ static int task_blocks_on_rt_mutex(struc current->pi_blocked_on = waiter; - spin_unlock_irqrestore(¤t->pi_lock, flags); + spin_unlock(¤t->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { - spin_lock_irqsave(&owner->pi_lock, flags); + /* readers are handled differently */ + if (task_is_reader(owner)) { + res = rt_mutex_adjust_readers(lock, waiter, + current, lock, 0); + return res; + } + + spin_lock(&owner->pi_lock); plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); plist_add(&waiter->pi_list_entry, &owner->pi_waiters); __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; - spin_unlock_irqrestore(&owner->pi_lock, flags); + spin_unlock(&owner->pi_lock); } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) chain_walk = 1; - if (!chain_walk) + if (!chain_walk || task_is_reader(owner)) return 0; /* @@ -457,12 +543,12 @@ static int task_blocks_on_rt_mutex(struc */ get_task_struct(owner); - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, - current); + current, 0); - spin_lock(&lock->wait_lock); + spin_lock_irq(&lock->wait_lock); return res; } @@ -475,13 +561,13 @@ static int task_blocks_on_rt_mutex(struc * * Called with lock->wait_lock held. */ -static void wakeup_next_waiter(struct rt_mutex *lock) +static void wakeup_next_waiter(struct rt_mutex *lock, int savestate) { struct rt_mutex_waiter *waiter; struct task_struct *pendowner; - unsigned long flags; + struct rt_mutex_waiter *next; - spin_lock_irqsave(¤t->pi_lock, flags); + spin_lock(¤t->pi_lock); waiter = rt_mutex_top_waiter(lock); plist_del(&waiter->list_entry, &lock->wait_list); @@ -496,9 +582,44 @@ static void wakeup_next_waiter(struct rt pendowner = waiter->task; waiter->task = NULL; + /* + * Do the wakeup before the ownership change to give any spinning + * waiter grantees a headstart over the other threads that will + * trigger once owner changes. + */ + if (!savestate) + wake_up_process(pendowner); + else { + /* + * We can skip the actual (expensive) wakeup if the + * waiter is already running, but we have to be careful + * of race conditions because they may be about to sleep. + * + * The waiter-side protocol has the following pattern: + * 1: Set state != RUNNING + * 2: Conditionally sleep if waiter->task != NULL; + * + * And the owner-side has the following: + * A: Set waiter->task = NULL + * B: Conditionally wake if the state != RUNNING + * + * As long as we ensure 1->2 order, and A->B order, we + * will never miss a wakeup. + * + * Therefore, this barrier ensures that waiter->task = NULL + * is visible before we test the pendowner->state. The + * corresponding barrier is in the sleep logic. 
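+ *
+ * Side by side, with the labels from the list above:
+ *
+ *    waiter (sleep side)                     owner (wakeup side)
+ *    -------------------                     -------------------
+ *    1: xchg(&current->state, !RUNNING)      A: waiter->task = NULL
+ *       (the xchg implies a full barrier)       smp_mb()
+ *    2: if (waiter.task)                     B: if (state & ~TASK_RUNNING_MUTEX)
+ *           schedule_rt_mutex(lock)                 wake_up_process_mutex(pendowner)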
+ */ + smp_mb(); + + /* If !RUNNING && !RUNNING_MUTEX */ + if (pendowner->state & ~TASK_RUNNING_MUTEX) + wake_up_process_mutex(pendowner); + } + rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); - spin_unlock_irqrestore(¤t->pi_lock, flags); + spin_unlock(¤t->pi_lock); /* * Clear the pi_blocked_on variable and enqueue a possible @@ -507,7 +628,13 @@ static void wakeup_next_waiter(struct rt * waiter with higher priority than pending-owner->normal_prio * is blocked on the unboosted (pending) owner. */ - spin_lock_irqsave(&pendowner->pi_lock, flags); + + if (rt_mutex_has_waiters(lock)) + next = rt_mutex_top_waiter(lock); + else + next = NULL; + + spin_lock(&pendowner->pi_lock); WARN_ON(!pendowner->pi_blocked_on); WARN_ON(pendowner->pi_blocked_on != waiter); @@ -515,15 +642,10 @@ static void wakeup_next_waiter(struct rt pendowner->pi_blocked_on = NULL; - if (rt_mutex_has_waiters(lock)) { - struct rt_mutex_waiter *next; - - next = rt_mutex_top_waiter(lock); + if (next) plist_add(&next->pi_list_entry, &pendowner->pi_waiters); - } - spin_unlock_irqrestore(&pendowner->pi_lock, flags); - wake_up_process(pendowner); + spin_unlock(&pendowner->pi_lock); } /* @@ -532,22 +654,22 @@ static void wakeup_next_waiter(struct rt * Must be called with lock->wait_lock held */ static void remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter, + unsigned long flags) { int first = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); - unsigned long flags; int chain_walk = 0; - spin_lock_irqsave(¤t->pi_lock, flags); + spin_lock(¤t->pi_lock); plist_del(&waiter->list_entry, &lock->wait_list); waiter->task = NULL; current->pi_blocked_on = NULL; - spin_unlock_irqrestore(¤t->pi_lock, flags); + spin_unlock(¤t->pi_lock); - if (first && owner != current) { + if (first && owner != current && !task_is_reader(owner)) { - spin_lock_irqsave(&owner->pi_lock, flags); + spin_lock(&owner->pi_lock); plist_del(&waiter->pi_list_entry, &owner->pi_waiters); @@ -562,7 +684,7 @@ static void remove_waiter(struct rt_mute if (owner->pi_blocked_on) chain_walk = 1; - spin_unlock_irqrestore(&owner->pi_lock, flags); + spin_unlock(&owner->pi_lock); } WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); @@ -573,11 +695,11 @@ static void remove_waiter(struct rt_mute /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(owner); - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); - rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); + rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current, 0); - spin_lock(&lock->wait_lock); + spin_lock_irq(&lock->wait_lock); } /* @@ -598,164 +720,215 @@ void rt_mutex_adjust_pi(struct task_stru return; } - spin_unlock_irqrestore(&task->pi_lock, flags); - /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(task); - rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); + spin_unlock_irqrestore(&task->pi_lock, flags); + + rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task, 0); } /* - * Slow path lock function: + * preemptible spin_lock functions: */ -static int __sched -rt_mutex_slowlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock) + +#ifdef CONFIG_PREEMPT_RT + +static inline void +rt_spin_lock_fastlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + /* Temporary HACK! 
*/ + if (likely(!current->in_printk)) + might_sleep(); + else if (in_atomic() || irqs_disabled()) + /* don't grab locks for printk in atomic */ + return; + + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) + rt_mutex_deadlock_account_lock(lock, current); + else + slowfn(lock); +} + +static inline void +rt_spin_lock_fastunlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + /* Temporary HACK! */ + if (unlikely(rt_mutex_owner(lock) != current) && current->in_printk) + /* don't grab locks for printk in atomic */ + return; + + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) + rt_mutex_deadlock_account_unlock(current); + else + slowfn(lock); +} + +static inline void +update_current(unsigned long new_state, unsigned long *saved_state) +{ + unsigned long state = xchg(¤t->state, new_state); + if (unlikely(state == TASK_RUNNING)) + *saved_state = TASK_RUNNING; +} + +#ifdef CONFIG_SMP +static int adaptive_wait(struct rt_mutex_waiter *waiter, + struct task_struct *orig_owner) +{ + for (;;) { + + /* we are the owner? */ + if (!waiter->task) + return 0; + + /* Owner changed? Then lets update the original */ + if (orig_owner != rt_mutex_owner(waiter->lock)) + return 0; + + /* Owner went to bed, so should we */ + if (!task_is_current(orig_owner)) + return 1; + + cpu_relax(); + } +} +#else +static int adaptive_wait(struct rt_mutex_waiter *waiter, + struct task_struct *orig_owner) +{ + return 1; +} +#endif + +/* + * Slow path lock function spin_lock style: this variant is very + * careful not to miss any non-lock wakeups. + * + * The wakeup side uses wake_up_process_mutex, which, combined with + * the xchg code of this function is a transparent sleep/wakeup + * mechanism nested within any existing sleep/wakeup mechanism. This + * enables the seemless use of arbitrary (blocking) spinlocks within + * sleep/wakeup event loops. + */ +static void noinline __sched +rt_spin_lock_slowlock(struct rt_mutex *lock) { struct rt_mutex_waiter waiter; - int ret = 0; + unsigned long saved_state, state, flags; + struct task_struct *orig_owner; + int missed = 0; debug_rt_mutex_init_waiter(&waiter); waiter.task = NULL; + waiter.write_lock = 0; - spin_lock(&lock->wait_lock); - - /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock)) { - spin_unlock(&lock->wait_lock); - return 0; - } + spin_lock_irqsave(&lock->wait_lock, flags); + init_lists(lock); - set_current_state(state); + BUG_ON(rt_mutex_owner(lock) == current); - /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) { - hrtimer_start(&timeout->timer, timeout->timer.expires, - HRTIMER_MODE_ABS); - if (!hrtimer_active(&timeout->timer)) - timeout->task = NULL; - } + /* + * Here we save whatever state the task was in originally, + * we'll restore it at the end of the function and we'll take + * any intermediate wakeup into account as well, independently + * of the lock sleep/wakeup mechanism. When we get a real + * wakeup the task->state is TASK_RUNNING and we change + * saved_state accordingly. If we did not get a real wakeup + * then we return with the saved state. + */ + saved_state = current->state; for (;;) { - /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock)) - break; + unsigned long saved_flags; + int saved_lock_depth = current->lock_depth; - /* - * TASK_INTERRUPTIBLE checks for signals and - * timeout. Ignored otherwise. - */ - if (unlikely(state == TASK_INTERRUPTIBLE)) { - /* Signal pending? 
*/ - if (signal_pending(current)) - ret = -EINTR; - if (timeout && !timeout->task) - ret = -ETIMEDOUT; - if (ret) - break; + /* Try to acquire the lock */ + if (do_try_to_take_rt_mutex(lock, STEAL_LATERAL)) { + /* If we never blocked break out now */ + if (!missed) + goto unlock; + break; } + missed = 1; /* * waiter.task is NULL the first time we come here and * when we have been woken up by the previous owner - * but the lock got stolen by a higher prio task. + * but the lock got stolen by an higher prio task. */ if (!waiter.task) { - ret = task_blocks_on_rt_mutex(lock, &waiter, - detect_deadlock); - /* - * If we got woken up by the owner then start loop - * all over without going into schedule to try - * to get the lock now: - */ - if (unlikely(!waiter.task)) { - /* - * Reset the return value. We might - * have returned with -EDEADLK and the - * owner released the lock while we - * were walking the pi chain. - */ - ret = 0; + task_blocks_on_rt_mutex(lock, &waiter, 0, flags); + /* Wakeup during boost ? */ + if (unlikely(!waiter.task)) continue; - } - if (unlikely(ret)) - break; } - spin_unlock(&lock->wait_lock); + /* + * Prevent schedule() to drop BKL, while waiting for + * the lock ! We restore lock_depth when we come back. + */ + saved_flags = current->flags & PF_NOSCHED; + current->lock_depth = -1; + current->flags &= ~PF_NOSCHED; + orig_owner = rt_mutex_owner(lock); + get_task_struct(orig_owner); + spin_unlock_irqrestore(&lock->wait_lock, flags); debug_rt_mutex_print_deadlock(&waiter); - if (waiter.task) - schedule_rt_mutex(lock); - - spin_lock(&lock->wait_lock); - set_current_state(state); + if (adaptive_wait(&waiter, orig_owner)) { + put_task_struct(orig_owner); + update_current(TASK_UNINTERRUPTIBLE, &saved_state); + /* + * The xchg() in update_current() is an implicit + * barrier which we rely upon to ensure current->state + * is visible before we test waiter.task. + */ + if (waiter.task) + schedule_rt_mutex(lock); + } else + put_task_struct(orig_owner); + + spin_lock_irqsave(&lock->wait_lock, flags); + current->flags |= saved_flags; + current->lock_depth = saved_lock_depth; } - set_current_state(TASK_RUNNING); + state = xchg(¤t->state, saved_state); + if (unlikely(state == TASK_RUNNING)) + current->state = TASK_RUNNING; + /* + * Extremely rare case, if we got woken up by a non-mutex wakeup, + * and we managed to steal the lock despite us not being the + * highest-prio waiter (due to SCHED_OTHER changing prio), then we + * can end up with a non-NULL waiter.task: + */ if (unlikely(waiter.task)) - remove_waiter(lock, &waiter); - + remove_waiter(lock, &waiter, flags); /* * try_to_take_rt_mutex() sets the waiter bit - * unconditionally. We might have to fix that up. + * unconditionally. We might have to fix that up: */ fixup_rt_mutex_waiters(lock); - spin_unlock(&lock->wait_lock); - - /* Remove pending timer: */ - if (unlikely(timeout)) - hrtimer_cancel(&timeout->timer); - - /* - * Readjust priority, when we did not get the lock. We might - * have been the pending owner and boosted. Since we did not - * take the lock, the PI boost has to go. 
- */ - if (unlikely(ret)) - rt_mutex_adjust_prio(current); + unlock: + spin_unlock_irqrestore(&lock->wait_lock, flags); debug_rt_mutex_free_waiter(&waiter); - - return ret; } /* - * Slow path try-lock function: + * Slow path to release a rt_mutex spin_lock style */ -static inline int -rt_mutex_slowtrylock(struct rt_mutex *lock) +static void noinline __sched +rt_spin_lock_slowunlock(struct rt_mutex *lock) { - int ret = 0; - - spin_lock(&lock->wait_lock); - - if (likely(rt_mutex_owner(lock) != current)) { - - ret = try_to_take_rt_mutex(lock); - /* - * try_to_take_rt_mutex() sets the lock waiters - * bit unconditionally. Clean this up. - */ - fixup_rt_mutex_waiters(lock); - } - - spin_unlock(&lock->wait_lock); - - return ret; -} + unsigned long flags; -/* - * Slow path to release a rt-mutex: - */ -static void __sched -rt_mutex_slowunlock(struct rt_mutex *lock) -{ - spin_lock(&lock->wait_lock); + spin_lock_irqsave(&lock->wait_lock, flags); debug_rt_mutex_unlock(lock); @@ -763,55 +936,1557 @@ rt_mutex_slowunlock(struct rt_mutex *loc if (!rt_mutex_has_waiters(lock)) { lock->owner = NULL; - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); return; } - wakeup_next_waiter(lock); + wakeup_next_waiter(lock, 1); - spin_unlock(&lock->wait_lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); - /* Undo pi boosting if necessary: */ + /* Undo pi boosting.when necessary */ rt_mutex_adjust_prio(current); } -/* - * debug aware fast / slowpath lock,trylock,unlock - * - * The atomic acquire/release ops are compiled away, when either the - * architecture does not support cmpxchg or when debugging is enabled. - */ -static inline int -rt_mutex_fastlock(struct rt_mutex *lock, int state, - int detect_deadlock, - int (*slowfn)(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock)) +void __lockfunc rt_spin_lock(spinlock_t *lock) { - if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); - return 0; - } else - return slowfn(lock, state, NULL, detect_deadlock); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED_RT(lock, rt_mutex_trylock, __rt_spin_lock); } +EXPORT_SYMBOL(rt_spin_lock); -static inline int -rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, int detect_deadlock, - int (*slowfn)(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock)) +void __lockfunc __rt_spin_lock(struct rt_mutex *lock) { - if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); - return 0; - } else - return slowfn(lock, state, timeout, detect_deadlock); + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock); } +EXPORT_SYMBOL(__rt_spin_lock); -static inline int -rt_mutex_fasttrylock(struct rt_mutex *lock, - int (*slowfn)(struct rt_mutex *lock)) +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass) +{ + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED_RT(lock, rt_mutex_trylock, __rt_spin_lock); +} +EXPORT_SYMBOL(rt_spin_lock_nested); + +#endif + +void __lockfunc rt_spin_unlock(spinlock_t *lock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + spin_release(&lock->dep_map, 1, _RET_IP_); + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(rt_spin_unlock); + +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock) +{ + 
rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(__rt_spin_unlock); + +/* + * Wait for the lock to get unlocked: instead of polling for an unlock + * (like raw spinlocks do), we lock and unlock, to force the kernel to + * schedule if there's contention: + */ +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock) +{ + spin_lock(lock); + spin_unlock(lock); +} +EXPORT_SYMBOL(rt_spin_unlock_wait); + +int __lockfunc rt_spin_trylock(spinlock_t *lock) +{ + int ret = rt_mutex_trylock(&lock->lock); + + if (ret) + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock); + +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) +{ + int ret; + + *flags = 0; + ret = rt_mutex_trylock(&lock->lock); + if (ret) + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock_irqsave); + +int _atomic_dec_and_spin_lock(spinlock_t *lock, atomic_t *atomic) +{ + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; + rt_spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + rt_spin_unlock(lock); + return 0; +} +EXPORT_SYMBOL(_atomic_dec_and_spin_lock); + +void +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + __rt_mutex_init(&lock->lock, name); +} +EXPORT_SYMBOL(__rt_spin_lock_init); + +int rt_rwlock_limit = NR_CPUS; + +static inline int rt_release_bkl(struct rt_mutex *lock, unsigned long flags); +static inline void rt_reacquire_bkl(int saved_lock_depth); + +static inline void +rt_rwlock_set_owner(struct rw_mutex *rwm, struct task_struct *owner, + unsigned long mask) +{ + unsigned long val = (unsigned long)owner | mask; + + rwm->owner = (struct task_struct *)val; +} + +static inline void init_rw_lists(struct rw_mutex *rwm) +{ + if (unlikely(!rwm->readers.prev)) { + init_lists(&rwm->mutex); + INIT_LIST_HEAD(&rwm->readers); + } +} + +/* + * The fast paths of the rw locks do not set up owners to + * the mutex. When blocking on an rwlock we must make sure + * there exists an owner. + */ +static void +update_rw_mutex_owner(struct rw_mutex *rwm) +{ + struct rt_mutex *mutex = &rwm->mutex; + struct task_struct *mtxowner; + + mtxowner = rt_mutex_owner(mutex); + if (mtxowner) + return; + + mtxowner = rt_rwlock_owner(rwm); + WARN_ON(!mtxowner); + if (rt_rwlock_writer(rwm)) + WARN_ON(mtxowner == RT_RW_READER); + else + mtxowner = RT_RW_READER; + rt_mutex_set_owner(mutex, mtxowner, 0); +} + +#ifdef CONFIG_DEBUG_RT_MUTEXES +/* + * A rw lock is about to be added or has already been + * removed from current. Make sure it doesn't exist still. + */ +static void rw_check_held(struct rw_mutex *rwm) +{ + int reader_count = current->reader_lock_count; + int i; + + for (i = 0; i < reader_count; i++) + WARN_ON_ONCE(current->owned_read_locks[i].lock == rwm); +} +#else +# define rw_check_held(rwm) do { } while (0) +#endif + +/* + * The fast path does not add itself to the reader list to keep + * from needing to grab the spinlock. We need to add the owner + * itself. This may seem racy, but in practice, it is fine. + * The link list is protected by mutex->wait_lock. But to find + * the lock on the owner we need to read the owners reader counter. + * That counter is modified only by the owner. 
We are OK with that + * because to remove the lock that we are looking for, the owner + * must first grab the mutex->wait_lock. The lock will not disappear + * from the owner now, and we don't care if we see other locks + * held or not held. + */ + +static inline void +rt_rwlock_update_owner(struct rw_mutex *rwm, struct task_struct *own) +{ + struct reader_lock_struct *rls; + int i; + + if (!own || rt_rwlock_pending(rwm)) + return; + + if (own == RT_RW_READER) + return; + + /* + * We don't need to grab the pi_lock to look at the reader list + * since we hold the rwm wait_lock. We only care about the pointer + * to this lock, and we own the wait_lock, so that pointer + * can't be changed. + */ + for (i = own->reader_lock_count - 1; i >= 0; i--) { + if (own->owned_read_locks[i].lock == rwm) + break; + } + /* It is possible the owner didn't add it yet */ + if (i < 0) + return; + + rls = &own->owned_read_locks[i]; + /* It is also possible that the owner added it already */ + if (rls->list.prev && !list_empty(&rls->list)) + return; + + list_add(&rls->list, &rwm->readers); + + /* change to reader, so no one else updates too */ + rt_rwlock_set_owner(rwm, RT_RW_READER, RT_RWLOCK_CHECK); +} + +static int try_to_take_rw_read(struct rw_mutex *rwm, int mtx) +{ + struct rt_mutex *mutex = &rwm->mutex; + struct rt_mutex_waiter *waiter; + struct reader_lock_struct *rls; + struct task_struct *mtxowner; + int owners; + int reader_count, i; + int incr = 1; + + assert_spin_locked(&mutex->wait_lock); + + /* mark the lock to force the owner to check on release */ + mark_rt_rwlock_check(rwm); + + /* is the owner a writer? */ + if (unlikely(rt_rwlock_writer(rwm))) + return 0; + + /* check to see if we don't already own this lock */ + for (i = current->reader_lock_count - 1; i >= 0; i--) { + if (current->owned_read_locks[i].lock == rwm) { + rls = ¤t->owned_read_locks[i]; + /* + * If this was taken via the fast path, then + * it hasn't been added to the link list yet. + */ + if (!rls->list.prev || list_empty(&rls->list)) + list_add(&rls->list, &rwm->readers); + rt_rwlock_set_owner(rwm, RT_RW_READER, 0); + rls->count++; + incr = 0; + goto taken; + } + } + + /* A writer is not the owner, but is a writer waiting */ + mtxowner = rt_mutex_owner(mutex); + + /* if the owner released it before we marked it then take it */ + if (!mtxowner && !rt_rwlock_owner(rwm)) { + /* Still unlock with the slow path (for PI handling) */ + rt_rwlock_set_owner(rwm, RT_RW_READER, 0); + goto taken; + } + + owners = atomic_read(&rwm->owners); + rt_rwlock_update_owner(rwm, rt_rwlock_owner(rwm)); + + /* Check for rwlock limits */ + if (rt_rwlock_limit && owners >= rt_rwlock_limit) + return 0; + + if (mtxowner && mtxowner != RT_RW_READER) { + int mode = mtx ? STEAL_NORMAL : STEAL_LATERAL; + + if (!try_to_steal_lock(mutex, mode)) { + /* + * readers don't own the mutex, and rwm shows that a + * writer doesn't have it either. If we enter this + * condition, then we must be pending. + */ + WARN_ON(!rt_mutex_owner_pending(mutex)); + /* + * Even though we didn't steal the lock, if the owner + * is a reader, and we are of higher priority than + * any waiting writer, we might still be able to continue. + */ + if (rt_rwlock_pending_writer(rwm)) + return 0; + if (rt_mutex_has_waiters(mutex)) { + waiter = rt_mutex_top_waiter(mutex); + if (!lock_is_stealable(waiter->task, mode)) + return 0; + /* + * The pending reader has PI waiters, + * but we are taking the lock. + * Remove the waiters from the pending owner. 
+ */ + spin_lock(&mtxowner->pi_lock); + plist_del(&waiter->pi_list_entry, &mtxowner->pi_waiters); + spin_unlock(&mtxowner->pi_lock); + } + } else if (rt_mutex_has_waiters(mutex)) { + /* Readers do things differently with respect to PI */ + waiter = rt_mutex_top_waiter(mutex); + spin_lock(¤t->pi_lock); + plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); + spin_unlock(¤t->pi_lock); + } + /* Readers never own the mutex */ + rt_mutex_set_owner(mutex, RT_RW_READER, 0); + } + + /* RT_RW_READER forces slow paths */ + rt_rwlock_set_owner(rwm, RT_RW_READER, 0); + taken: + if (incr) { + atomic_inc(&rwm->owners); + rw_check_held(rwm); + spin_lock(¤t->pi_lock); + reader_count = current->reader_lock_count++; + if (likely(reader_count < MAX_RWLOCK_DEPTH)) { + rls = ¤t->owned_read_locks[reader_count]; + rls->lock = rwm; + rls->count = 1; + WARN_ON(rls->list.prev && !list_empty(&rls->list)); + list_add(&rls->list, &rwm->readers); + } else + WARN_ON_ONCE(1); + spin_unlock(¤t->pi_lock); + } + rt_mutex_deadlock_account_lock(mutex, current); + atomic_inc(&rwm->count); + return 1; +} + +static int +try_to_take_rw_write(struct rw_mutex *rwm, int mtx) +{ + struct rt_mutex *mutex = &rwm->mutex; + struct task_struct *own; + + /* mark the lock to force the owner to check on release */ + mark_rt_rwlock_check(rwm); + + own = rt_rwlock_owner(rwm); + + /* owners must be zero for writer */ + if (own) { + rt_rwlock_update_owner(rwm, own); + + if (!rt_rwlock_pending(rwm)) + return 0; + } + + /* + * RT_RWLOCK_PENDING means that the lock is free, but there are + * pending owners on the mutex + */ + WARN_ON(own && !rt_mutex_owner_pending(mutex)); + + if (!do_try_to_take_rt_mutex(mutex, mtx ? STEAL_NORMAL : STEAL_LATERAL)) + return 0; + + /* + * We stole the lock. Add both WRITER and CHECK flags + * since we must release the mutex. + */ + rt_rwlock_set_owner(rwm, current, RT_RWLOCK_WRITER | RT_RWLOCK_CHECK); + + return 1; +} + +static void +rt_read_slowlock(struct rw_mutex *rwm, int mtx) +{ + struct rt_mutex_waiter waiter; + struct rt_mutex *mutex = &rwm->mutex; + int saved_lock_depth = -1; + unsigned long saved_state = -1, state, flags; + + spin_lock_irqsave(&mutex->wait_lock, flags); + init_rw_lists(rwm); + + if (try_to_take_rw_read(rwm, mtx)) { + spin_unlock_irqrestore(&mutex->wait_lock, flags); + return; + } + update_rw_mutex_owner(rwm); + + /* Owner is a writer (or a blocked writer). Block on the lock */ + + debug_rt_mutex_init_waiter(&waiter); + waiter.task = NULL; + waiter.write_lock = 0; + + if (mtx) { + /* + * We drop the BKL here before we go into the wait loop to avoid a + * possible deadlock in the scheduler. + */ + if (unlikely(current->lock_depth >= 0)) + saved_lock_depth = rt_release_bkl(mutex, flags); + set_current_state(TASK_UNINTERRUPTIBLE); + } else { + /* Spin lock must preserve BKL */ + saved_state = xchg(¤t->state, TASK_UNINTERRUPTIBLE); + saved_lock_depth = current->lock_depth; + } + + for (;;) { + unsigned long saved_flags; + + /* Try to acquire the lock: */ + if (try_to_take_rw_read(rwm, mtx)) + break; + update_rw_mutex_owner(rwm); + + /* + * waiter.task is NULL the first time we come here and + * when we have been woken up by the previous owner + * but the lock got stolen by a higher prio task. + */ + if (!waiter.task) { + task_blocks_on_rt_mutex(mutex, &waiter, 0, flags); + /* Wakeup during boost ? 
*/ + if (unlikely(!waiter.task)) + continue; + } + saved_flags = current->flags & PF_NOSCHED; + current->flags &= ~PF_NOSCHED; + if (!mtx) + current->lock_depth = -1; + + spin_unlock_irqrestore(&mutex->wait_lock, flags); + + debug_rt_mutex_print_deadlock(&waiter); + + if (!mtx || waiter.task) + schedule_rt_mutex(mutex); + + spin_lock_irqsave(&mutex->wait_lock, flags); + + current->flags |= saved_flags; + if (mtx) + set_current_state(TASK_UNINTERRUPTIBLE); + else { + current->lock_depth = saved_lock_depth; + state = xchg(¤t->state, TASK_UNINTERRUPTIBLE); + if (unlikely(state == TASK_RUNNING)) + saved_state = TASK_RUNNING; + } + } + + if (mtx) + set_current_state(TASK_RUNNING); + else { + state = xchg(¤t->state, saved_state); + if (unlikely(state == TASK_RUNNING)) + current->state = TASK_RUNNING; + } + + if (unlikely(waiter.task)) + remove_waiter(mutex, &waiter, flags); + + WARN_ON(rt_mutex_owner(mutex) && + rt_mutex_owner(mutex) != current && + rt_mutex_owner(mutex) != RT_RW_READER && + !rt_mutex_owner_pending(mutex)); + + spin_unlock_irqrestore(&mutex->wait_lock, flags); + + /* Must we reaquire the BKL? */ + if (mtx && unlikely(saved_lock_depth >= 0)) + rt_reacquire_bkl(saved_lock_depth); + + debug_rt_mutex_free_waiter(&waiter); +} + +static inline int +__rt_read_fasttrylock(struct rw_mutex *rwm) +{ + retry: + if (likely(rt_rwlock_cmpxchg(rwm, NULL, current))) { + int reader_count; + unsigned long flags; + + rt_mutex_deadlock_account_lock(&rwm->mutex, current); + atomic_inc(&rwm->count); + smp_mb(); + /* + * It is possible that the owner was zeroed + * before we incremented count. If owner is not + * current, then retry again + */ + if (unlikely(rwm->owner != current)) { + atomic_dec(&rwm->count); + goto retry; + } + + atomic_inc(&rwm->owners); + rw_check_held(rwm); + local_irq_save(flags); + spin_lock(¤t->pi_lock); + reader_count = current->reader_lock_count++; + if (likely(reader_count < MAX_RWLOCK_DEPTH)) { + current->owned_read_locks[reader_count].lock = rwm; + current->owned_read_locks[reader_count].count = 1; + } else + WARN_ON_ONCE(1); + spin_unlock(¤t->pi_lock); + /* + * If this task is no longer the sole owner of the lock + * or someone is blocking, then we need to add the task + * to the lock. 
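+ *
+ * (Being on rwm->readers matters for PI: rt_mutex_adjust_readers()
+ * walks that list to boost every current reader when a higher-priority
+ * writer blocks on the lock.)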
+ */ + if (unlikely(rwm->owner != current)) { + struct rt_mutex *mutex = &rwm->mutex; + struct reader_lock_struct *rls; + + spin_lock(&mutex->wait_lock); + rls = ¤t->owned_read_locks[reader_count]; + if (!rls->list.prev || list_empty(&rls->list)) + list_add(&rls->list, &rwm->readers); + spin_unlock(&mutex->wait_lock); + } + local_irq_restore(flags); + return 1; + } + return 0; +} + +static inline void +rt_read_fastlock(struct rw_mutex *rwm, + void (*slowfn)(struct rw_mutex *rwm, int mtx), + int mtx) +{ + if (unlikely(!__rt_read_fasttrylock(rwm))) + slowfn(rwm, mtx); +} + +void rt_mutex_down_read(struct rw_mutex *rwm) +{ + rt_read_fastlock(rwm, rt_read_slowlock, 1); +} + +void rt_rwlock_read_lock(struct rw_mutex *rwm) +{ + rt_read_fastlock(rwm, rt_read_slowlock, 0); +} + +static inline int +rt_read_slowtrylock(struct rw_mutex *rwm, int mtx) +{ + struct rt_mutex *mutex = &rwm->mutex; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&mutex->wait_lock, flags); + init_rw_lists(rwm); + + if (try_to_take_rw_read(rwm, mtx)) + ret = 1; + + spin_unlock_irqrestore(&mutex->wait_lock, flags); + + return ret; +} + +static inline int +rt_read_fasttrylock(struct rw_mutex *rwm, + int (*slowfn)(struct rw_mutex *rwm, int mtx), int mtx) +{ + if (likely(__rt_read_fasttrylock(rwm))) + return 1; + else + return slowfn(rwm, mtx); +} + +int __sched rt_mutex_down_read_trylock(struct rw_mutex *rwm) +{ + return rt_read_fasttrylock(rwm, rt_read_slowtrylock, 1); +} + +static void +rt_write_slowlock(struct rw_mutex *rwm, int mtx) +{ + struct rt_mutex *mutex = &rwm->mutex; + struct rt_mutex_waiter waiter; + int saved_lock_depth = -1; + unsigned long flags, saved_state = -1, state; + + debug_rt_mutex_init_waiter(&waiter); + waiter.task = NULL; + + /* we do PI different for writers that are blocked */ + waiter.write_lock = 1; + + spin_lock_irqsave(&mutex->wait_lock, flags); + init_rw_lists(rwm); + + if (try_to_take_rw_write(rwm, mtx)) { + spin_unlock_irqrestore(&mutex->wait_lock, flags); + return; + } + update_rw_mutex_owner(rwm); + + if (mtx) { + /* + * We drop the BKL here before we go into the wait loop to avoid a + * possible deadlock in the scheduler. + */ + if (unlikely(current->lock_depth >= 0)) + saved_lock_depth = rt_release_bkl(mutex, flags); + set_current_state(TASK_UNINTERRUPTIBLE); + } else { + /* Spin locks must preserve the BKL */ + saved_lock_depth = current->lock_depth; + saved_state = xchg(¤t->state, TASK_UNINTERRUPTIBLE); + } + + for (;;) { + unsigned long saved_flags; + + /* Try to acquire the lock: */ + if (try_to_take_rw_write(rwm, mtx)) + break; + update_rw_mutex_owner(rwm); + + /* + * waiter.task is NULL the first time we come here and + * when we have been woken up by the previous owner + * but the lock got stolen by a higher prio task. + */ + if (!waiter.task) { + task_blocks_on_rt_mutex(mutex, &waiter, 0, flags); + /* Wakeup during boost ? 
*/ + if (unlikely(!waiter.task)) + continue; + } + saved_flags = current->flags & PF_NOSCHED; + current->flags &= ~PF_NOSCHED; + if (!mtx) + current->lock_depth = -1; + + spin_unlock_irqrestore(&mutex->wait_lock, flags); + + debug_rt_mutex_print_deadlock(&waiter); + + if (!mtx || waiter.task) + schedule_rt_mutex(mutex); + + spin_lock_irqsave(&mutex->wait_lock, flags); + + current->flags |= saved_flags; + if (mtx) + set_current_state(TASK_UNINTERRUPTIBLE); + else { + current->lock_depth = saved_lock_depth; + state = xchg(¤t->state, TASK_UNINTERRUPTIBLE); + if (unlikely(state == TASK_RUNNING)) + saved_state = TASK_RUNNING; + } + } + + if (mtx) + set_current_state(TASK_RUNNING); + else { + state = xchg(¤t->state, saved_state); + if (unlikely(state == TASK_RUNNING)) + current->state = TASK_RUNNING; + } + + if (unlikely(waiter.task)) + remove_waiter(mutex, &waiter, flags); + + /* check on unlock if we have any waiters. */ + if (rt_mutex_has_waiters(mutex)) + mark_rt_rwlock_check(rwm); + + spin_unlock_irqrestore(&mutex->wait_lock, flags); + + /* Must we reaquire the BKL? */ + if (mtx && unlikely(saved_lock_depth >= 0)) + rt_reacquire_bkl(saved_lock_depth); + + debug_rt_mutex_free_waiter(&waiter); + +} + +static inline void +rt_write_fastlock(struct rw_mutex *rwm, + void (*slowfn)(struct rw_mutex *rwm, int mtx), int mtx) +{ + struct task_struct *val = (void *)((unsigned long)current | + RT_RWLOCK_WRITER); + + if (likely(rt_rwlock_cmpxchg(rwm, NULL, val))) + rt_mutex_deadlock_account_lock(&rwm->mutex, current); + else + slowfn(rwm, mtx); +} + +void rt_mutex_down_write(struct rw_mutex *rwm) +{ + rt_write_fastlock(rwm, rt_write_slowlock, 1); +} + +void rt_rwlock_write_lock(struct rw_mutex *rwm) +{ + rt_write_fastlock(rwm, rt_write_slowlock, 0); +} + +static int +rt_write_slowtrylock(struct rw_mutex *rwm, int mtx) +{ + struct rt_mutex *mutex = &rwm->mutex; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&mutex->wait_lock, flags); + init_rw_lists(rwm); + + if (try_to_take_rw_write(rwm, mtx)) + ret = 1; + + spin_unlock_irqrestore(&mutex->wait_lock, flags); + + return ret; +} + +static inline int +rt_write_fasttrylock(struct rw_mutex *rwm, + int (*slowfn)(struct rw_mutex *rwm, int mtx), int mtx) +{ + struct task_struct *val = (void *)((unsigned long)current | + RT_RWLOCK_WRITER); + + if (likely(rt_rwlock_cmpxchg(rwm, NULL, val))) { + rt_mutex_deadlock_account_lock(&rwm->mutex, current); + return 1; + } else + return slowfn(rwm, mtx); +} + +int rt_mutex_down_write_trylock(struct rw_mutex *rwm) +{ + return rt_write_fasttrylock(rwm, rt_write_slowtrylock, 1); +} + +static void noinline __sched +rt_read_slowunlock(struct rw_mutex *rwm, int mtx) +{ + struct rt_mutex *mutex = &rwm->mutex; + struct rt_mutex_waiter *waiter; + struct task_struct *pendowner; + struct reader_lock_struct *rls; + unsigned long flags; + unsigned int reader_count; + int savestate = !mtx; + int i; + + spin_lock_irqsave(&mutex->wait_lock, flags); + + rt_mutex_deadlock_account_unlock(current); + + /* + * To prevent multiple readers from zeroing out the owner + * when the count goes to zero and then have another task + * grab the task. We mark the lock. This makes all tasks + * go to the slow path. Then we can check the owner without + * worry that it changed. 
+ */ + mark_rt_rwlock_check(rwm); + + for (i = current->reader_lock_count - 1; i >= 0; i--) { + if (current->owned_read_locks[i].lock == rwm) { + spin_lock(¤t->pi_lock); + current->owned_read_locks[i].count--; + if (!current->owned_read_locks[i].count) { + current->reader_lock_count--; + WARN_ON_ONCE(i != current->reader_lock_count); + atomic_dec(&rwm->owners); + rls = ¤t->owned_read_locks[i]; + WARN_ON(!rls->list.prev || list_empty(&rls->list)); + list_del_init(&rls->list); + rls->lock = NULL; + rw_check_held(rwm); + } + spin_unlock(¤t->pi_lock); + break; + } + } + WARN_ON_ONCE(i < 0); + + /* + * If the last two (or more) readers unlocked at the same + * time, the owner could be cleared since the count went to + * zero. If this has happened, the rwm owner will not + * be set to current or readers. This means that another reader + * already reset the lock, so there is nothing left to do. + */ + if (unlikely(rt_rwlock_owner(rwm) != current && + rt_rwlock_owner(rwm) != RT_RW_READER)) { + /* Update the owner if necessary */ + rt_rwlock_update_owner(rwm, rt_rwlock_owner(rwm)); + goto out; + } + + /* + * If there are more readers and we are under the limit + * let the last reader do the wakeups. + */ + reader_count = atomic_read(&rwm->count); + if (reader_count && + (!rt_rwlock_limit || atomic_read(&rwm->owners) >= rt_rwlock_limit)) + goto out; + + /* If no one is blocked, then clear all ownership */ + if (!rt_mutex_has_waiters(mutex)) { + rwm->prio = MAX_PRIO; + /* + * If count is not zero, we are under the limit with + * no other readers. + */ + if (reader_count) + goto out; + + /* We could still have a pending reader waiting */ + if (rt_mutex_owner_pending(mutex)) { + /* set the rwm back to pending */ + rwm->owner = RT_RWLOCK_PENDING_READ; + } else { + rwm->owner = NULL; + mutex->owner = NULL; + } + goto out; + } + + /* + * If the next waiter is a reader, this can be because of + * two things. One is that we hit the reader limit, or + * Two, there is a pending writer. + * We still only wake up one reader at a time (even if + * we could wake up more). This is because we dont + * have any idea if a writer is pending. + */ + waiter = rt_mutex_top_waiter(mutex); + if (waiter->write_lock) { + /* only wake up if there are no readers */ + if (reader_count) + goto out; + rwm->owner = RT_RWLOCK_PENDING_WRITE; + } else { + /* + * It is also possible that the reader limit decreased. + * If the limit did decrease, we may not be able to + * wake up the reader if we are currently above the limit. + */ + if (rt_rwlock_limit && + unlikely(atomic_read(&rwm->owners) >= rt_rwlock_limit)) + goto out; + if (!reader_count) + rwm->owner = RT_RWLOCK_PENDING_READ; + } + + pendowner = waiter->task; + wakeup_next_waiter(mutex, savestate); + + /* + * If we woke up a reader but the lock is already held by readers + * we need to set the mutex owner to RT_RW_READER, since the + * wakeup_next_waiter set it to the pending reader. + */ + if (reader_count) { + WARN_ON(waiter->write_lock); + rt_mutex_set_owner(mutex, RT_RW_READER, 0); + } + + if (rt_mutex_has_waiters(mutex)) { + waiter = rt_mutex_top_waiter(mutex); + rwm->prio = waiter->task->prio; + /* + * If readers still own this lock, then we need + * to update the pi_list too. Readers have a separate + * path in the PI chain. 
+ */ + if (reader_count) { + spin_lock(&pendowner->pi_lock); + plist_del(&waiter->pi_list_entry, + &pendowner->pi_waiters); + spin_unlock(&pendowner->pi_lock); + } + } else + rwm->prio = MAX_PRIO; + + out: + spin_unlock_irqrestore(&mutex->wait_lock, flags); + + /* Undo pi boosting.when necessary */ + rt_mutex_adjust_prio(current); +} + +static inline void +rt_read_fastunlock(struct rw_mutex *rwm, + void (*slowfn)(struct rw_mutex *rwm, int mtx), int mtx) +{ + WARN_ON(!atomic_read(&rwm->count)); + WARN_ON(!atomic_read(&rwm->owners)); + WARN_ON(!rwm->owner); + smp_mb(); + atomic_dec(&rwm->count); + if (likely(rt_rwlock_cmpxchg(rwm, current, NULL))) { + struct reader_lock_struct *rls; + unsigned long flags; + int reader_count; + int owners; + + spin_lock_irqsave(¤t->pi_lock, flags); + reader_count = --current->reader_lock_count; + spin_unlock_irqrestore(¤t->pi_lock, flags); + + rt_mutex_deadlock_account_unlock(current); + if (unlikely(reader_count < 0)) { + reader_count = 0; + WARN_ON_ONCE(1); + } + owners = atomic_dec_return(&rwm->owners); + if (unlikely(owners < 0)) { + atomic_set(&rwm->owners, 0); + WARN_ON_ONCE(1); + } + rls = ¤t->owned_read_locks[reader_count]; + WARN_ON_ONCE(rls->lock != rwm); + WARN_ON(rls->list.prev && !list_empty(&rls->list)); + WARN_ON(rls->count != 1); + rls->lock = NULL; + rw_check_held(rwm); + } else + slowfn(rwm, mtx); +} + +void rt_mutex_up_read(struct rw_mutex *rwm) +{ + rt_read_fastunlock(rwm, rt_read_slowunlock, 1); +} + +void rt_rwlock_read_unlock(struct rw_mutex *rwm) +{ + rt_read_fastunlock(rwm, rt_read_slowunlock, 0); +} + +static void noinline __sched +rt_write_slowunlock(struct rw_mutex *rwm, int mtx) +{ + struct rt_mutex *mutex = &rwm->mutex; + struct rt_mutex_waiter *waiter; + struct task_struct *pendowner; + int savestate = !mtx; + unsigned long flags; + + spin_lock_irqsave(&mutex->wait_lock, flags); + + rt_mutex_deadlock_account_unlock(current); + + if (!rt_mutex_has_waiters(mutex)) { + rwm->owner = NULL; + mutex->owner = NULL; + spin_unlock_irqrestore(&mutex->wait_lock, flags); + return; + } + + debug_rt_mutex_unlock(mutex); + + /* + * This is where it gets a bit tricky. + * We can have both readers and writers waiting below us. + * They are ordered by priority. For each reader we wake + * up, we check to see if there's another reader waiting. + * If that is the case, we continue to wake up the readers + * until we hit a writer. Once we hit a writer, then we + * stop (and don't wake it up). + * + * If the next waiter is a writer, than we just wake up + * the writer and we are done. + */ + + waiter = rt_mutex_top_waiter(mutex); + pendowner = waiter->task; + wakeup_next_waiter(mutex, savestate); + + /* another writer is next? */ + if (waiter->write_lock) { + rwm->owner = RT_RWLOCK_PENDING_WRITE; + goto out; + } + + rwm->owner = RT_RWLOCK_PENDING_READ; + + if (!rt_mutex_has_waiters(mutex)) + goto out; + + /* + * Wake up all readers. + * This gets a bit more complex. More than one reader can't + * own the mutex. We give it to the first (highest prio) + * reader, and then wake up the rest of the readers until + * we wake up all readers or come to a writer. The woken + * up readers that don't own the lock will try to take it + * when they schedule. Doing this lets a high prio writer + * come along and steal the lock. 
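+ *
+ * Worked example: if the wait list holds, in priority order,
+ * R(prio 10), R(prio 15), W(prio 20), R(prio 40), the lock is handed
+ * to R(10), R(15) is woken as well, and we stop at W(20); R(40) keeps
+ * waiting even though it is a reader, because a writer is queued
+ * ahead of it.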
+ */ + waiter = rt_mutex_top_waiter(mutex); + while (waiter && !waiter->write_lock) { + struct task_struct *reader = waiter->task; + + spin_lock(&pendowner->pi_lock); + plist_del(&waiter->list_entry, &mutex->wait_list); + + /* nop if not on a list */ + plist_del(&waiter->pi_list_entry, &pendowner->pi_waiters); + spin_unlock(&pendowner->pi_lock); + + spin_lock(&reader->pi_lock); + waiter->task = NULL; + reader->pi_blocked_on = NULL; + spin_unlock(&reader->pi_lock); + + if (savestate) + wake_up_process_mutex(reader); + else + wake_up_process(reader); + + if (rt_mutex_has_waiters(mutex)) + waiter = rt_mutex_top_waiter(mutex); + else + waiter = NULL; + } + + /* If a writer is still pending, then update its plist. */ + if (rt_mutex_has_waiters(mutex)) { + struct rt_mutex_waiter *next; + + next = rt_mutex_top_waiter(mutex); + + spin_lock(&pendowner->pi_lock); + /* delete incase we didn't go through the loop */ + plist_del(&next->pi_list_entry, &pendowner->pi_waiters); + + /* This could also be a reader (if reader_limit is set) */ + if (next->write_lock) + /* add back in as top waiter */ + plist_add(&next->pi_list_entry, &pendowner->pi_waiters); + spin_unlock(&pendowner->pi_lock); + + rwm->prio = next->task->prio; + } else + rwm->prio = MAX_PRIO; + + out: + + spin_unlock_irqrestore(&mutex->wait_lock, flags); + + /* Undo pi boosting.when necessary */ + rt_mutex_adjust_prio(current); +} + +static inline void +rt_write_fastunlock(struct rw_mutex *rwm, + void (*slowfn)(struct rw_mutex *rwm, int mtx), int mtx) +{ + struct task_struct *val = (void *)((unsigned long)current | + RT_RWLOCK_WRITER); + + WARN_ON(rt_rwlock_owner(rwm) != current); + if (likely(rt_rwlock_cmpxchg(rwm, (struct task_struct *)val, NULL))) + rt_mutex_deadlock_account_unlock(current); + else + slowfn(rwm, mtx); +} + +void rt_mutex_up_write(struct rw_mutex *rwm) +{ + rt_write_fastunlock(rwm, rt_write_slowunlock, 1); +} + +void rt_rwlock_write_unlock(struct rw_mutex *rwm) +{ + rt_write_fastunlock(rwm, rt_write_slowunlock, 0); +} + +/* + * We own the lock for write, and we want to convert it to a read, + * so we simply take the lock as read, and wake up all other readers. + */ +void __sched +rt_mutex_downgrade_write(struct rw_mutex *rwm) +{ + struct rt_mutex *mutex = &rwm->mutex; + struct reader_lock_struct *rls; + struct rt_mutex_waiter *waiter; + unsigned long flags; + int reader_count; + + spin_lock_irqsave(&mutex->wait_lock, flags); + init_rw_lists(rwm); + + /* we have the lock and are sole owner, then update the accounting */ + atomic_inc(&rwm->count); + atomic_inc(&rwm->owners); + rw_check_held(rwm); + + spin_lock(¤t->pi_lock); + reader_count = current->reader_lock_count++; + rls = ¤t->owned_read_locks[reader_count]; + if (likely(reader_count < MAX_RWLOCK_DEPTH)) { + rls->lock = rwm; + rls->count = 1; + } else + WARN_ON_ONCE(1); + spin_unlock(¤t->pi_lock); + + if (!rt_mutex_has_waiters(mutex)) { + /* We are sole owner, we are done */ + rwm->owner = current; + rwm->prio = MAX_PRIO; + mutex->owner = NULL; + spin_unlock_irqrestore(&mutex->wait_lock, flags); + return; + } + + /* Set us up for multiple readers or conflicts */ + + list_add(&rls->list, &rwm->readers); + rwm->owner = RT_RW_READER; + + /* + * This is like the write unlock, but we already own the + * reader. We still want to wake up other readers that are + * waiting, until we hit the reader limit, or a writer. 
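+ *
+ * A minimal caller-side sketch of the rwsem-style interface this file
+ * provides (hypothetical lock, shown only for illustration):
+ *
+ *    struct rw_mutex my_rwm;
+ *    rt_mutex_rwsem_init(&my_rwm, "my_rwm");
+ *
+ *    rt_mutex_down_write(&my_rwm);         exclusive
+ *    ...update...
+ *    rt_mutex_downgrade_write(&my_rwm);    keep holding it, but shared
+ *    ...read...
+ *    rt_mutex_up_read(&my_rwm);            released like any read lock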
+ */ + + waiter = rt_mutex_top_waiter(mutex); + while (waiter && !waiter->write_lock) { + struct task_struct *reader = waiter->task; + + spin_lock(&current->pi_lock); + plist_del(&waiter->list_entry, &mutex->wait_list); + + /* nop if not on a list */ + plist_del(&waiter->pi_list_entry, &current->pi_waiters); + spin_unlock(&current->pi_lock); + + spin_lock(&reader->pi_lock); + waiter->task = NULL; + reader->pi_blocked_on = NULL; + spin_unlock(&reader->pi_lock); + + /* downgrade is only for mutexes */ + wake_up_process(reader); + + if (rt_mutex_has_waiters(mutex)) + waiter = rt_mutex_top_waiter(mutex); + else + waiter = NULL; + } + + /* If a writer is still pending, then update its plist. */ + if (rt_mutex_has_waiters(mutex)) { + struct rt_mutex_waiter *next; + + next = rt_mutex_top_waiter(mutex); + + /* setup this mutex prio for read */ + rwm->prio = next->task->prio; + + spin_lock(&current->pi_lock); + /* delete in case we didn't go through the loop */ + plist_del(&next->pi_list_entry, &current->pi_waiters); + spin_unlock(&current->pi_lock); + /* No need to add back since readers don't have PI waiters */ + } else + rwm->prio = MAX_PRIO; + + rt_mutex_set_owner(mutex, RT_RW_READER, 0); + + spin_unlock_irqrestore(&mutex->wait_lock, flags); + + /* + * Undo pi boosting when necessary. + * If one of the awoken readers boosted us, we don't want to keep + * that priority. + */ + rt_mutex_adjust_prio(current); +} + +void rt_mutex_rwsem_init(struct rw_mutex *rwm, const char *name) +{ + struct rt_mutex *mutex = &rwm->mutex; + + rwm->owner = NULL; + atomic_set(&rwm->count, 0); + atomic_set(&rwm->owners, 0); + rwm->prio = MAX_PRIO; + INIT_LIST_HEAD(&rwm->readers); + + __rt_mutex_init(mutex, name); +} + +static int rt_mutex_get_readers_prio(struct task_struct *task, int prio) +{ + struct reader_lock_struct *rls; + struct rw_mutex *rwm; + int lock_prio; + int i; + + for (i = 0; i < task->reader_lock_count; i++) { + rls = &task->owned_read_locks[i]; + rwm = rls->lock; + if (rwm) { + lock_prio = rwm->prio; + if (prio > lock_prio) + prio = lock_prio; + } + } + + return prio; +} + +static int rt_mutex_adjust_readers(struct rt_mutex *orig_lock, + struct rt_mutex_waiter *orig_waiter, + struct task_struct *top_task, + struct rt_mutex *lock, + int recursion_depth) +{ + struct reader_lock_struct *rls; + struct rt_mutex_waiter *waiter; + struct task_struct *task; + struct rw_mutex *rwm = container_of(lock, struct rw_mutex, mutex); + + if (rt_mutex_has_waiters(lock)) { + waiter = rt_mutex_top_waiter(lock); + /* + * Do we need to grab the task->pi_lock? + * Really, we are only reading it. If it + * changes, then that should follow this chain + * too.
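As an aside, a minimal sketch of how a write-to-read downgrade is typically driven from the generic rwsem API, on the assumption that the -rt rwsem wrappers route downgrade_write() to rt_mutex_downgrade_write() above; struct publish_state and publish_and_read() are invented for the example:

        #include <linux/rwsem.h>

        struct publish_state {
                struct rw_semaphore sem;        /* assumed initialized with init_rwsem() */
                int generation;
        };

        static void publish_and_read(struct publish_state *s)
        {
                down_write(&s->sem);            /* exclusive while the data is modified */
                s->generation++;
                downgrade_write(&s->sem);       /* writers stay blocked, new readers may enter */
                /* still safe to read s->generation here, concurrently with other readers */
                up_read(&s->sem);
        }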
+ */ + rwm->prio = waiter->task->prio; + } else + rwm->prio = MAX_PRIO; + + if (recursion_depth >= MAX_RWLOCK_DEPTH) { + WARN_ON(1); + return 1; + } + + list_for_each_entry(rls, &rwm->readers, list) { + task = rls->task; + get_task_struct(task); + /* + * rt_mutex_adjust_prio_chain will do + * the put_task_struct + */ + rt_mutex_adjust_prio_chain(task, 0, orig_lock, + orig_waiter, top_task, + recursion_depth+1); + } + + return 0; +} +#else +static int rt_mutex_adjust_readers(struct rt_mutex *orig_lock, + struct rt_mutex_waiter *orig_waiter, + struct task_struct *top_task, + struct rt_mutex *lock, + int recursion_depth) +{ + return 0; +} + +static int rt_mutex_get_readers_prio(struct task_struct *task, int prio) +{ + return prio; +} +#endif /* CONFIG_PREEMPT_RT */ + +static inline int rt_release_bkl(struct rt_mutex *lock, unsigned long flags) +{ + int saved_lock_depth = current->lock_depth; + + current->lock_depth = -1; + /* + * try_to_take_lock set the waiters, make sure it's + * still correct. + */ + fixup_rt_mutex_waiters(lock); + spin_unlock_irqrestore(&lock->wait_lock, flags); + + up(&kernel_sem); + + spin_lock_irq(&lock->wait_lock); + + return saved_lock_depth; +} + +static inline void rt_reacquire_bkl(int saved_lock_depth) +{ + down(&kernel_sem); + current->lock_depth = saved_lock_depth; +} + +/* + * Slow path lock function: + */ +static int __sched +rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock) +{ + int ret = 0, saved_lock_depth = -1; + struct rt_mutex_waiter waiter; + unsigned long flags; + + debug_rt_mutex_init_waiter(&waiter); + waiter.task = NULL; + waiter.write_lock = 0; + + spin_lock_irqsave(&lock->wait_lock, flags); + init_lists(lock); + + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock)) { + spin_unlock_irqrestore(&lock->wait_lock, flags); + return 0; + } + + /* + * We drop the BKL here before we go into the wait loop to avoid a + * possible deadlock in the scheduler. + */ + if (unlikely(current->lock_depth >= 0)) + saved_lock_depth = rt_release_bkl(lock, flags); + + set_current_state(state); + + /* Setup the timer, when timeout != NULL */ + if (unlikely(timeout)) { + hrtimer_start(&timeout->timer, timeout->timer.expires, + HRTIMER_MODE_ABS); + if (!hrtimer_active(&timeout->timer)) + timeout->task = NULL; + } + + for (;;) { + unsigned long saved_flags; + + /* Try to acquire the lock: */ + if (try_to_take_rt_mutex(lock)) + break; + + /* + * TASK_INTERRUPTIBLE checks for signals and + * timeout. Ignored otherwise. + */ + if (unlikely(state == TASK_INTERRUPTIBLE)) { + /* Signal pending? */ + if (signal_pending(current)) + ret = -EINTR; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + if (ret) + break; + } + + /* + * waiter.task is NULL the first time we come here and + * when we have been woken up by the previous owner + * but the lock got stolen by a higher prio task. + */ + if (!waiter.task) { + ret = task_blocks_on_rt_mutex(lock, &waiter, + detect_deadlock, flags); + /* + * If we got woken up by the owner then start loop + * all over without going into schedule to try + * to get the lock now: + */ + if (unlikely(!waiter.task)) { + /* + * Reset the return value. We might + * have returned with -EDEADLK and the + * owner released the lock while we + * were walking the pi chain. 
+ */ + ret = 0; + continue; + } + if (unlikely(ret)) + break; + } + saved_flags = current->flags & PF_NOSCHED; + current->flags &= ~PF_NOSCHED; + + spin_unlock_irq(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(&waiter); + + if (waiter.task) + schedule_rt_mutex(lock); + + spin_lock_irq(&lock->wait_lock); + + current->flags |= saved_flags; + set_current_state(state); + } + + set_current_state(TASK_RUNNING); + + if (unlikely(waiter.task)) + remove_waiter(lock, &waiter, flags); + + /* + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. We might have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + + spin_unlock_irqrestore(&lock->wait_lock, flags); + + /* Remove pending timer: */ + if (unlikely(timeout)) + hrtimer_cancel(&timeout->timer); + + /* + * Readjust priority, when we did not get the lock. We might + * have been the pending owner and boosted. Since we did not + * take the lock, the PI boost has to go. + */ + if (unlikely(ret)) + rt_mutex_adjust_prio(current); + + /* Must we reacquire the BKL? */ + if (unlikely(saved_lock_depth >= 0)) + rt_reacquire_bkl(saved_lock_depth); + + debug_rt_mutex_free_waiter(&waiter); + + return ret; +} + +/* + * Slow path try-lock function: + */ +static inline int +rt_mutex_slowtrylock(struct rt_mutex *lock) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&lock->wait_lock, flags); + + if (likely(rt_mutex_owner(lock) != current)) { + + init_lists(lock); + + ret = try_to_take_rt_mutex(lock); + /* + * try_to_take_rt_mutex() sets the lock waiters + * bit unconditionally. Clean this up. + */ + fixup_rt_mutex_waiters(lock); + } + + spin_unlock_irqrestore(&lock->wait_lock, flags); + + return ret; +} + +/* + * Slow path to release a rt-mutex: + */ +static void __sched +rt_mutex_slowunlock(struct rt_mutex *lock) +{ + unsigned long flags; + + spin_lock_irqsave(&lock->wait_lock, flags); + + debug_rt_mutex_unlock(lock); + + rt_mutex_deadlock_account_unlock(current); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + spin_unlock_irqrestore(&lock->wait_lock, flags); + return; + } + + wakeup_next_waiter(lock, 0); + + spin_unlock_irqrestore(&lock->wait_lock, flags); + + /* Undo pi boosting if necessary: */ + rt_mutex_adjust_prio(current); +} + +/* + * debug aware fast / slowpath lock,trylock,unlock + * + * The atomic acquire/release ops are compiled away, when either the + * architecture does not support cmpxchg or when debugging is enabled.
+ */ +static inline int +rt_mutex_fastlock(struct rt_mutex *lock, int state, + int detect_deadlock, + int (*slowfn)(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock)) +{ + if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 0; + } else + return slowfn(lock, state, NULL, detect_deadlock); +} + +static inline int +rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, int detect_deadlock, + int (*slowfn)(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock)) +{ + if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 0; + } else + return slowfn(lock, state, timeout, detect_deadlock); +} + +static inline int +rt_mutex_fasttrylock(struct rt_mutex *lock, + int (*slowfn)(struct rt_mutex *lock)) { if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); @@ -831,6 +2506,27 @@ rt_mutex_fastunlock(struct rt_mutex *loc } /** + * rt_mutex_lock_killable - lock a rt_mutex killable + * + * @lock: the rt_mutex to be locked + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + * -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int __sched rt_mutex_lock_killable(struct rt_mutex *lock, + int detect_deadlock) +{ + might_sleep(); + + return rt_mutex_fastlock(lock, TASK_KILLABLE, + detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); + +/** * rt_mutex_lock - lock a rt_mutex * * @lock: the rt_mutex to be locked Index: linux-2.6.25.8-rt7/kernel/sched.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/sched.c 2008-06-23 19:13:01.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/sched.c 2008-06-23 19:14:36.000000000 -0400 @@ -4,6 +4,7 @@ * Kernel scheduler and related syscalls * * Copyright (C) 1991-2002 Linus Torvalds + * Copyright (C) 2004 Red Hat, Inc., Ingo Molnar * * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and * make semaphores SMP safe @@ -16,6 +17,7 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin + * 2004-10-13 Real-Time Preemption support by Ingo Molnar * 2007-04-15 Work begun on replacing all interactivity tuning with a * fair scheduling design by Con Kolivas. * 2007-05-05 Load balancing (smp-nice) and other improvements @@ -59,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -66,19 +69,12 @@ #include #include #include +#include #include #include -/* - * Scheduler clock - returns current time in nanosec units. - * This is default implementation. - * Architectures and sub-architectures can override this. - */ -unsigned long long __attribute__((weak)) sched_clock(void) -{ - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); -} +#include "sched_cpupri.h" /* * Convert user-nice values [ -20 ... 0 ... 19 ] @@ -89,6 +85,11 @@ unsigned long long __attribute__((weak)) #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) +#define __PRIO(prio) \ + ((prio) <= 99 ? 
199 - (prio) : (prio) - 120) + +#define PRIO(p) __PRIO((p)->prio) + /* * 'User priority' is the nice value converted to something we * can work with better when scaling various scheduler parameters, @@ -106,6 +107,20 @@ unsigned long long __attribute__((weak)) #define NICE_0_LOAD SCHED_LOAD_SCALE #define NICE_0_SHIFT SCHED_LOAD_SHIFT +#if (BITS_PER_LONG < 64) +#define JIFFIES_TO_NS64(TIME) \ + ((unsigned long long)(TIME) * ((unsigned long) (1000000000 / HZ))) + +#define NS64_TO_JIFFIES(TIME) \ + ((((unsigned long long)((TIME)) >> BITS_PER_LONG) * \ + (1 + NS_TO_JIFFIES(~0UL))) + NS_TO_JIFFIES((unsigned long)(TIME))) +#else /* BITS_PER_LONG < 64 */ + +#define NS64_TO_JIFFIES(TIME) NS_TO_JIFFIES(TIME) +#define JIFFIES_TO_NS64(TIME) JIFFIES_TO_NS(TIME) + +#endif /* BITS_PER_LONG < 64 */ + /* * These are the 'tuning knobs' of the scheduler: * @@ -135,6 +150,32 @@ static inline void sg_inc_cpu_power(stru } #endif +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +/* + * Tweaks for current + */ + +#ifdef CURRENT_PTR +struct task_struct * const ___current = &init_task; +struct task_struct ** const current_ptr = (struct task_struct ** const)&___current; +struct thread_info * const current_ti = &init_thread_union.thread_info; +struct thread_info ** const current_ti_ptr = (struct thread_info ** const)&current_ti; + +EXPORT_SYMBOL(___current); +EXPORT_SYMBOL(current_ti); + +/* + * The scheduler itself doesn't want 'current' to be cached + * during context-switches: + */ +# undef current +# define current __current() +# undef current_thread_info +# define current_thread_info() __current_thread_info() +#endif + static inline int rt_policy(int policy) { if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) @@ -152,7 +193,8 @@ static inline int task_has_rt_policy(str */ struct rt_prio_array { DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ - struct list_head queue[MAX_RT_PRIO]; + struct list_head xqueue[MAX_RT_PRIO]; /* exclusive queue */ + struct list_head squeue[MAX_RT_PRIO]; /* shared queue */ }; #ifdef CONFIG_GROUP_SCHED @@ -332,6 +374,7 @@ struct rt_rq { unsigned long rt_nr_migratory; int overloaded; #endif + unsigned long rt_nr_uninterruptible; int rt_throttled; u64 rt_time; @@ -366,6 +409,9 @@ struct root_domain { */ cpumask_t rto_mask; atomic_t rto_count; +#ifdef CONFIG_SMP + struct cpupri cpupri; +#endif }; /* @@ -385,7 +431,7 @@ static struct root_domain def_root_domai */ struct rq { /* runqueue lock: */ - spinlock_t lock; + raw_spinlock_t lock; /* * nr_running and cpu_load should be in the same cacheline because @@ -424,17 +470,13 @@ struct rq { */ unsigned long nr_uninterruptible; + unsigned long switch_timestamp; + unsigned long slice_avg; struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; - u64 clock, prev_clock_raw; - s64 clock_max_delta; - - unsigned int clock_warps, clock_overflows, clock_underflows; - u64 idle_clock; - unsigned int clock_deep_idle_events; - u64 tick_timestamp; + u64 clock; atomic_t nr_iowait; @@ -447,6 +489,7 @@ struct rq { int push_cpu; /* cpu of this runqueue: */ int cpu; + int online; struct task_struct *migration_thread; struct list_head migration_queue; @@ -479,6 +522,13 @@ struct rq { /* BKL stats */ unsigned int bkl_count; + + /* RT-overload stats: */ + unsigned long rto_schedule; + unsigned long rto_schedule_tail; + unsigned long rto_wakeup; + unsigned long rto_pulled; + unsigned long rto_pushed; #endif struct lock_class_key rq_lock_key; }; @@ -499,52 +549,7 @@ static
inline int cpu_of(struct rq *rq) #endif } -/* - * Update the per-runqueue clock, as finegrained as the platform can give - * us, but without assuming monotonicity, etc.: - */ -static void __update_rq_clock(struct rq *rq) -{ - u64 prev_raw = rq->prev_clock_raw; - u64 now = sched_clock(); - s64 delta = now - prev_raw; - u64 clock = rq->clock; - -#ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); -#endif - /* - * Protect against sched_clock() occasionally going backwards: - */ - if (unlikely(delta < 0)) { - clock++; - rq->clock_warps++; - } else { - /* - * Catch too large forward jumps too: - */ - if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { - if (clock < rq->tick_timestamp + TICK_NSEC) - clock = rq->tick_timestamp + TICK_NSEC; - else - clock++; - rq->clock_overflows++; - } else { - if (unlikely(delta > rq->clock_max_delta)) - rq->clock_max_delta = delta; - clock += delta; - } - } - - rq->prev_clock_raw = now; - rq->clock = clock; -} - -static void update_rq_clock(struct rq *rq) -{ - if (likely(smp_processor_id() == cpu_of(rq))) - __update_rq_clock(rq); -} +#include "sched_trace.h" /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. @@ -561,6 +566,11 @@ static void update_rq_clock(struct rq *r #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +static inline void update_rq_clock(struct rq *rq) +{ + rq->clock = sched_clock_cpu(cpu_of(rq)); +} + unsigned long rt_needs_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -587,6 +597,31 @@ unsigned long rt_needs_cpu(int cpu) # define const_debug static const #endif +/** + * runqueue_is_locked + * + * Returns true if the current cpu runqueue is locked. + * This interface allows printk to be called with the runqueue lock + * held and know whether or not it is OK to wake up the klogd. + */ +int runqueue_is_locked(void) +{ + int cpu = get_cpu(); + struct rq *rq = cpu_rq(cpu); + int ret; + + ret = spin_is_locked(&rq->lock); + put_cpu(); + return ret; +} + +#ifndef CONFIG_SMP +int task_is_current(struct task_struct *task) +{ + return task_rq(task)->curr == task; +} +#endif + /* * Debugging: various feature bits */ @@ -611,7 +646,11 @@ const_debug unsigned int sysctl_sched_fe * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ +#ifdef CONFIG_PREEMPT_RT +const_debug unsigned int sysctl_sched_nr_migrate = 8; +#else const_debug unsigned int sysctl_sched_nr_migrate = 32; +#endif /* * period over which we measure -rt task cpu usage in us. @@ -625,7 +664,8 @@ static __read_mostly int scheduler_runni * part of the period that we allow rt tasks to run in us. * default: 0.95s */ -int sysctl_sched_rt_runtime = 950000; +/* int sysctl_sched_rt_runtime = 950000; */ +int sysctl_sched_rt_runtime = -1; /* * single value that denotes runtime == period, ie unlimited time. @@ -633,37 +673,22 @@ int sysctl_sched_rt_runtime = 950000; #define RUNTIME_INF ((u64)~0ULL) /* - * For kernel-internal use: high-speed (but slightly incorrect) per-cpu - * clock constructed from sched_clock(): + * We really don't want to do anything complex within switch_to() + * on PREEMPT_RT - this check enforces this.
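For context, a hedged sketch of the kind of caller runqueue_is_locked() above is meant for; the helper below is hypothetical and only mirrors the printk/klogd situation mentioned in the comment:

        #include <linux/sched.h>

        extern int runqueue_is_locked(void);

        /* hypothetical deferred wakeup: waking a task takes the runqueue lock */
        static void try_wake_log_daemon(struct task_struct *klogd_task)
        {
                /*
                 * If this CPU already holds its runqueue lock we must not
                 * wake from here; punt to a later, safe context instead of
                 * deadlocking.
                 */
                if (runqueue_is_locked())
                        return;
                wake_up_process(klogd_task);
        }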
*/ -unsigned long long cpu_clock(int cpu) -{ - unsigned long long now; - unsigned long flags; - struct rq *rq; - - /* - * Only call sched_clock() if the scheduler has already been - * initialized (some code might call cpu_clock() very early): - */ - if (unlikely(!scheduler_running)) - return 0; - - local_irq_save(flags); - rq = cpu_rq(cpu); - update_rq_clock(rq); - now = rq->clock; - local_irq_restore(flags); - - return now; -} -EXPORT_SYMBOL_GPL(cpu_clock); +#ifdef prepare_arch_switch +# ifdef CONFIG_PREEMPT_RT +# error FIXME +# else +# define _finish_arch_switch finish_arch_switch +# endif +#endif #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif #ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) +# define _finish_arch_switch(prev) do { } while (0) #endif static inline int task_current(struct rq *rq, struct task_struct *p) @@ -671,18 +696,39 @@ static inline int task_current(struct rq return rq->curr == p; } -#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline int task_running(struct rq *rq, struct task_struct *p) { +#ifdef CONFIG_SMP + return p->oncpu; +#else return task_current(rq, p); +#endif } +#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->oncpu = 1; +#endif } static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->oncpu = 0; +#endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq->lock.owner = current; @@ -694,18 +740,10 @@ static inline void finish_lock_switch(st */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - spin_unlock_irq(&rq->lock); + spin_unlock(&rq->lock); } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline int task_running(struct rq *rq, struct task_struct *p) -{ -#ifdef CONFIG_SMP - return p->oncpu; -#else - return task_current(rq, p); -#endif -} static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { @@ -735,8 +773,8 @@ static inline void finish_lock_switch(st smp_wmb(); prev->oncpu = 0; #endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_disable(); #endif } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ @@ -804,43 +842,6 @@ static struct rq *this_rq_lock(void) return rq; } -/* - * We are going deep-idle (irqs are disabled): - */ -void sched_clock_idle_sleep_event(void) -{ - struct rq *rq = cpu_rq(smp_processor_id()); - - spin_lock(&rq->lock); - __update_rq_clock(rq); - spin_unlock(&rq->lock); - rq->clock_deep_idle_events++; -} -EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); - -/* - * We just idled delta nanoseconds (called with irqs disabled): - */ -void sched_clock_idle_wakeup_event(u64 delta_ns) -{ - struct rq *rq = cpu_rq(smp_processor_id()); - u64 now = sched_clock(); - - rq->idle_clock += delta_ns; - /* - * Override the previous timestamp and ignore all - * sched_clock() deltas that occured while we idled, - * and use the PM-provided delta_ns to advance the - * rq clock: - */ - spin_lock(&rq->lock); - rq->prev_clock_raw = now; - rq->clock += delta_ns; - spin_unlock(&rq->lock); - 
touch_softlockup_watchdog(); -} -EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); - static void __resched_task(struct task_struct *p, int tif_bit); static inline void resched_task(struct task_struct *p) @@ -965,7 +966,7 @@ static enum hrtimer_restart hrtick(struc WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); spin_lock(&rq->lock); - __update_rq_clock(rq); + update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); spin_unlock(&rq->lock); @@ -1131,7 +1132,7 @@ void wake_up_idle_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); - if (cpu == smp_processor_id()) + if (cpu == raw_smp_processor_id()) return; /* @@ -1320,6 +1321,8 @@ static int task_hot(struct task_struct * #endif #define sched_class_highest (&rt_sched_class) +#define for_each_class(class) \ + for (class = sched_class_highest; class; class = class->next) static inline void inc_load(struct rq *rq, const struct task_struct *p) { @@ -1400,6 +1403,8 @@ static inline int normal_prio(struct tas prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); + +// trace_special_pid(p->pid, PRIO(p), __PRIO(prio)); return prio; } @@ -1431,6 +1436,7 @@ static void activate_task(struct rq *rq, if (task_contributes_to_load(p)) rq->nr_uninterruptible--; + ftrace_event_task_activate(p, cpu_of(rq)); enqueue_task(rq, p, wakeup); inc_nr_running(p, rq); } @@ -1443,6 +1449,7 @@ static void deactivate_task(struct rq *r if (task_contributes_to_load(p)) rq->nr_uninterruptible++; + ftrace_event_task_deactivate(p, cpu_of(rq)); dequeue_task(rq, p, sleep); dec_nr_running(p, rq); } @@ -1626,6 +1633,7 @@ void wait_task_inactive(struct task_stru * just go back and repeat. */ rq = task_rq_lock(p, &flags); + trace_kernel_sched_wait(p); running = task_running(rq, p); on_rq = p->se.on_rq; task_rq_unlock(rq, &flags); @@ -1895,13 +1903,21 @@ static int sched_balance_self(int cpu, i * * returns failure only if the task is already active. 
*/ -static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int sync, int mutex) { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; long old_state; struct rq *rq; +#ifdef CONFIG_PREEMPT_RT + /* + * sync wakeups can increase wakeup latencies: + */ + if (rt_task(p)) + sync = 0; +#endif smp_wmb(); rq = task_rq_lock(p, &flags); old_state = p->state; @@ -1966,9 +1982,13 @@ out_activate: success = 1; out_running: + trace_kernel_sched_wakeup(rq, p); check_preempt_curr(rq, p); - p->state = TASK_RUNNING; + if (mutex) + p->state = TASK_RUNNING_MUTEX; + else + p->state = TASK_RUNNING; #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@ -1981,13 +2001,31 @@ out: int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_ALL, 0); + return try_to_wake_up(p, TASK_ALL, 0, 0); } EXPORT_SYMBOL(wake_up_process); +int wake_up_process_sync(struct task_struct * p) +{ + return try_to_wake_up(p, TASK_ALL, 1, 0); +} +EXPORT_SYMBOL(wake_up_process_sync); + +int wake_up_process_mutex(struct task_struct * p) +{ + return try_to_wake_up(p, TASK_ALL, 0, 1); +} +EXPORT_SYMBOL(wake_up_process_mutex); + +int wake_up_process_mutex_sync(struct task_struct * p) +{ + return try_to_wake_up(p, TASK_ALL, 1, 1); +} +EXPORT_SYMBOL(wake_up_process_mutex_sync); + int wake_up_state(struct task_struct *p, unsigned int state) { - return try_to_wake_up(p, state, 0); + return try_to_wake_up(p, state | TASK_RUNNING_MUTEX, 0, 0); } /* @@ -2057,7 +2095,7 @@ void sched_fork(struct task_struct *p, i if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) +#if defined(CONFIG_SMP) p->oncpu = 0; #endif #ifdef CONFIG_PREEMPT @@ -2095,6 +2133,7 @@ void wake_up_new_task(struct task_struct p->sched_class->task_new(rq, p); inc_nr_running(p, rq); } + trace_kernel_sched_wakeup_new(rq, p); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2132,8 +2171,17 @@ static void fire_sched_in_preempt_notifi struct preempt_notifier *notifier; struct hlist_node *node; + if (hlist_empty(&curr->preempt_notifiers)) + return; + + /* + * The KVM sched in notifier expects to be called with + * interrupts enabled. 
+ */ + local_irq_enable(); hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) notifier->ops->sched_in(notifier, raw_smp_processor_id()); + local_irq_disable(); } static void @@ -2161,6 +2209,26 @@ fire_sched_out_preempt_notifiers(struct #endif +#ifdef CONFIG_DEBUG_PREEMPT +void notrace preempt_enable_no_resched(void) +{ + static int once = 1; + + barrier(); + dec_preempt_count(); + + if (once && !preempt_count()) { + once = 0; + printk(KERN_ERR "BUG: %s:%d task might have lost a preemption check!\n", + current->comm, current->pid); + dump_stack(); + } +} + +EXPORT_SYMBOL(preempt_enable_no_resched); +#endif + + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -2218,7 +2286,7 @@ static void finish_task_switch(struct rq * Manfred Spraul */ prev_state = prev->state; - finish_arch_switch(prev); + _finish_arch_switch(prev); finish_lock_switch(rq, prev); #ifdef CONFIG_SMP if (current->sched_class->post_schedule) @@ -2226,8 +2294,12 @@ static void finish_task_switch(struct rq #endif fire_sched_in_preempt_notifiers(current); + /* + * Delay the final freeing of the mm or task, so that we don't have + * to do complex work from within the scheduler: + */ if (mm) - mmdrop(mm); + mmdrop_delayed(mm); if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this @@ -2245,12 +2317,15 @@ static void finish_task_switch(struct rq asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq = this_rq(); - - finish_task_switch(rq, prev); + preempt_disable(); // TODO: move this to fork setup + finish_task_switch(this_rq(), prev); + __preempt_enable_no_resched(); + local_irq_enable(); #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); +#else + preempt_check_resched(); #endif if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); @@ -2267,6 +2342,8 @@ context_switch(struct rq *rq, struct tas struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); + + trace_kernel_sched_switch(rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* @@ -2297,6 +2374,11 @@ context_switch(struct rq *rq, struct tas spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #endif +#ifdef CURRENT_PTR + barrier(); + *current_ptr = next; + *current_ti_ptr = next->thread_info; +#endif /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -2343,6 +2425,11 @@ unsigned long nr_uninterruptible(void) return sum; } +unsigned long nr_uninterruptible_cpu(int cpu) +{ + return cpu_rq(cpu)->nr_uninterruptible; +} + unsigned long long nr_context_switches(void) { int i; @@ -2361,6 +2448,13 @@ unsigned long nr_iowait(void) for_each_possible_cpu(i) sum += atomic_read(&cpu_rq(i)->nr_iowait); + /* + * Since we read the counters lockless, it might be slightly + * inaccurate. Do not allow it to go below zero though: + */ + if (unlikely((long)sum < 0)) + sum = 0; + return sum; } @@ -2499,6 +2593,7 @@ static void sched_migrate_task(struct ta || unlikely(cpu_is_offline(dest_cpu))) goto out; + trace_kernel_sched_migrate_task(p, cpu_of(rq), dest_cpu); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref).
*/ @@ -3551,7 +3646,7 @@ out: */ static void run_rebalance_domains(struct softirq_action *h) { - int this_cpu = smp_processor_id(); + int this_cpu = raw_smp_processor_id(); struct rq *this_rq = cpu_rq(this_cpu); enum cpu_idle_type idle = this_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE; @@ -3703,7 +3798,9 @@ void account_user_time(struct task_struc /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) + if (rt_task(p)) + cpustat->user_rt = cputime64_add(cpustat->user_rt, tmp); + else if (TASK_NICE(p) > 0) cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); @@ -3758,10 +3855,12 @@ void account_system_time(struct task_str /* Add system time to cpustat. */ tmp = cputime_to_cputime64(cputime); - if (hardirq_count() - hardirq_offset) + if (hardirq_count() - hardirq_offset || (p->flags & PF_HARDIRQ)) cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) + else if (softirq_count() || (p->flags & PF_SOFTIRQ)) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + else if (rt_task(p)) + cpustat->system_rt = cputime64_add(cpustat->system_rt, tmp); else if (p != rq->idle) cpustat->system = cputime64_add(cpustat->system, tmp); else if (atomic_read(&rq->nr_iowait) > 0) @@ -3816,18 +3915,13 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; - u64 next_tick = rq->tick_timestamp + TICK_NSEC; + + sched_clock_tick(); + + BUG_ON(!irqs_disabled()); spin_lock(&rq->lock); - __update_rq_clock(rq); - /* - * Let rq->clock advance by at least TICK_NSEC: - */ - if (unlikely(rq->clock < next_tick)) { - rq->clock = next_tick; - rq->clock_underflows++; - } - rq->tick_timestamp = rq->clock; + update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); update_sched_rt_period(rq); @@ -3839,26 +3933,57 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER) || \ + defined(CONFIG_PREEMPT_TRACE)) + +static inline unsigned long get_parent_ip(unsigned long addr) +{ + if (in_lock_functions(addr)) { + addr = CALLER_ADDR2; + if (in_lock_functions(addr)) + addr = CALLER_ADDR3; + } + return addr; +} void __kprobes add_preempt_count(int val) { + unsigned long eip = CALLER_ADDR0; + unsigned long parent_eip = get_parent_ip(CALLER_ADDR1); + +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; +#endif preempt_count() += val; +#ifdef CONFIG_PREEMPT_TRACE + if (val <= 10) { + unsigned int idx = preempt_count() & PREEMPT_MASK; + if (idx < MAX_PREEMPT_TRACE) { + current->preempt_trace_eip[idx] = eip; + current->preempt_trace_parent_eip[idx] = parent_eip; + } + } +#endif +#ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? */ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); +#endif + if (preempt_count() == val) + trace_preempt_off(eip, parent_eip); } EXPORT_SYMBOL(add_preempt_count); void __kprobes sub_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? 
*/ @@ -3870,7 +3995,10 @@ void __kprobes sub_preempt_count(int val if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK))) return; +#endif + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); preempt_count() -= val; } EXPORT_SYMBOL(sub_preempt_count); @@ -3884,8 +4012,8 @@ static noinline void __schedule_bug(stru { struct pt_regs *regs = get_irq_regs(); - printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); + printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d, CPU#%d\n", + prev->comm, preempt_count(), prev->pid, smp_processor_id()); debug_show_held_locks(prev); if (irqs_disabled()) @@ -3902,6 +4030,8 @@ static noinline void __schedule_bug(stru */ static inline void schedule_debug(struct task_struct *prev) { + WARN_ON(system_state == SYSTEM_BOOTING); + /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. @@ -3956,14 +4086,15 @@ pick_next_task(struct rq *rq, struct tas /* * schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +asmlinkage void __sched __schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; -need_resched: + rcu_preempt_boost(); + preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -3972,7 +4103,6 @@ need_resched: switch_count = &prev->nivcsw; release_kernel_lock(prev); -need_resched_nonpreemptible: schedule_debug(prev); @@ -3982,20 +4112,27 @@ need_resched_nonpreemptible: * Do the rq-clock update outside the rq lock: */ local_irq_disable(); - __update_rq_clock(rq); + update_rq_clock(rq); spin_lock(&rq->lock); + cpu = smp_processor_id(); clear_tsk_need_resched(prev); + clear_tsk_need_resched_delayed(prev); - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + if ((prev->state & ~TASK_RUNNING_MUTEX) && + !(preempt_count() & PREEMPT_ACTIVE)) { + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && signal_pending(prev))) { prev->state = TASK_RUNNING; } else { + touch_softlockup_watchdog(); deactivate_task(rq, prev, 1); } switch_count = &prev->nvcsw; } + if (preempt_count() & PREEMPT_ACTIVE) + sub_preempt_count(PREEMPT_ACTIVE); + #ifdef CONFIG_SMP if (prev->sched_class->pre_schedule) prev->sched_class->pre_schedule(rq, prev); @@ -4021,21 +4158,93 @@ need_resched_nonpreemptible: */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else - spin_unlock_irq(&rq->lock); + __preempt_enable_no_resched(); + } else { + __preempt_enable_no_resched(); + spin_unlock(&rq->lock); + } hrtick_set(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) - goto need_resched_nonpreemptible; + reacquire_kernel_lock(current); + if (!irqs_disabled()) { + static int once = 1; + if (once) { + once = 0; + print_irqtrace_events(current); + WARN_ON(1); + } + } + +} - preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; +/* + * schedule() is the main scheduler function. + */ +asmlinkage void __sched schedule(void) +{ + WARN_ON(system_state == SYSTEM_BOOTING); + /* + * Test if we have interrupts disabled. 
+ */ + if (unlikely(irqs_disabled())) { + printk(KERN_ERR "BUG: scheduling with irqs disabled: " + "%s/0x%08x/%d\n", current->comm, preempt_count(), + current->pid); + print_symbol("caller is %s\n", + (long)__builtin_return_address(0)); + dump_stack(); + } + + if (unlikely(current->flags & PF_NOSCHED)) { + current->flags &= ~PF_NOSCHED; + printk(KERN_ERR "%s:%d userspace BUG: scheduling in " + "user-atomic context!\n", current->comm, current->pid); + dump_stack(); + send_sig(SIGUSR2, current, 1); + } + + local_irq_disable(); + + do { + __schedule(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED) || + test_thread_flag(TIF_NEED_RESCHED_DELAYED))); + + local_irq_enable(); } EXPORT_SYMBOL(schedule); #ifdef CONFIG_PREEMPT + +/* + * Global flag to turn preemption off on a CONFIG_PREEMPT kernel: + */ +int kernel_preemption = 1; + +static int __init preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) { + if (kernel_preemption) { + printk(KERN_INFO "turning off kernel preemption!\n"); + kernel_preemption = 0; + } + return 1; + } + if (!strncmp(str, "on", 2)) { + if (!kernel_preemption) { + printk(KERN_INFO "turning on kernel preemption!\n"); + kernel_preemption = 1; + } + return 1; + } + get_option(&str, &kernel_preemption); + + return 1; +} + +__setup("preempt=", preempt_setup); + /* * this is the entry point to schedule() from in-kernel preemption * off of preempt_enable. Kernel preemptions off return from interrupt @@ -4047,6 +4256,8 @@ asmlinkage void __sched preempt_schedule struct task_struct *task = current; int saved_lock_depth; + if (!kernel_preemption) + return; /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. @@ -4055,6 +4266,7 @@ asmlinkage void __sched preempt_schedule return; do { + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); /* @@ -4064,9 +4276,9 @@ asmlinkage void __sched preempt_schedule */ saved_lock_depth = task->lock_depth; task->lock_depth = -1; - schedule(); + __schedule(); task->lock_depth = saved_lock_depth; - sub_preempt_count(PREEMPT_ACTIVE); + local_irq_enable(); /* * Check again in case we missed a preemption opportunity @@ -4078,10 +4290,10 @@ asmlinkage void __sched preempt_schedule EXPORT_SYMBOL(preempt_schedule); /* - * this is the entry point to schedule() from kernel preemption - * off of irq context. - * Note, that this is called and return with irqs disabled. This will - * protect us against recursive calling from irq. + * this is the entry point for the IRQ return path. Called with + * interrupts disabled. To avoid infinite irq-entry recursion problems + * with fast-paced IRQ sources we do all of this carefully to never + * enable interrupts again. */ asmlinkage void __sched preempt_schedule_irq(void) { @@ -4089,10 +4301,17 @@ asmlinkage void __sched preempt_schedule struct task_struct *task = current; int saved_lock_depth; - /* Catch callers which need to be fixed */ - BUG_ON(ti->preempt_count || !irqs_disabled()); + if (!kernel_preemption) + return; + /* + * If there is a non-zero preempt_count then just return.
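(Usage note, inferred from preempt_setup() above rather than stated in the patch: booting with preempt=off clears kernel_preemption so the reworked preempt_schedule()/preempt_schedule_irq() paths return early, preempt=on keeps the default, and any other value is parsed numerically by get_option().)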
+ * (interrupts are disabled) + */ + if (unlikely(ti->preempt_count)) + return; do { + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); /* @@ -4102,11 +4321,9 @@ asmlinkage void __sched preempt_schedule */ saved_lock_depth = task->lock_depth; task->lock_depth = -1; - local_irq_enable(); - schedule(); + __schedule(); local_irq_disable(); task->lock_depth = saved_lock_depth; - sub_preempt_count(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -4121,7 +4338,8 @@ asmlinkage void __sched preempt_schedule int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) { - return try_to_wake_up(curr->private, mode, sync); + return try_to_wake_up(curr->private, mode | TASK_RUNNING_MUTEX, + sync, 0); } EXPORT_SYMBOL(default_wake_function); @@ -4161,8 +4379,9 @@ void __wake_up(wait_queue_head_t *q, uns unsigned long flags; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); + __wake_up_common(q, mode, nr_exclusive, 1, key); spin_unlock_irqrestore(&q->lock, flags); + preempt_check_resched_delayed(); } EXPORT_SYMBOL(__wake_up); @@ -4211,8 +4430,9 @@ void complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done++; - __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); + __wake_up_common(&x->wait, TASK_NORMAL, 1, 1, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); + preempt_check_resched_delayed(); } EXPORT_SYMBOL(complete); @@ -4222,11 +4442,17 @@ void complete_all(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; - __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); + __wake_up_common(&x->wait, TASK_NORMAL, 0, 1, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); +unsigned int completion_done(struct completion *x) +{ + return x->done; +} +EXPORT_SYMBOL(completion_done); + static inline long __sched do_wait_for_common(struct completion *x, long timeout, int state) { @@ -4354,28 +4580,47 @@ long __sched sleep_on_timeout(wait_queue } EXPORT_SYMBOL(sleep_on_timeout); -#ifdef CONFIG_RT_MUTEXES - /* - * rt_mutex_setprio - set the current priority of a task + * task_setprio - set the current priority of a task * @p: task * @prio: prio value (kernel-internal form) * * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). * - * Used by the rt_mutex code to implement priority inheritance logic. + * Used by the rt_mutex code to implement priority inheritance logic + * and by rcupreempt-boost to boost priorities of tasks sleeping + * with rcu locks. */ -void rt_mutex_setprio(struct task_struct *p, int prio) +void task_setprio(struct task_struct *p, int prio) { unsigned long flags; - int oldprio, on_rq, running; + int oldprio, prev_resched, on_rq, running; struct rq *rq; const struct sched_class *prev_class = p->sched_class; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); + + /* + * Idle task boosting is a nono in general. There is one + * exception, when NOHZ is active: + * + * The idle task calls get_next_timer_interrupt() and holds + * the timer wheel base->lock on the CPU and another CPU wants + * to access the timer (probably to cancel it). We can safely + * ignore the boosting request, as the idle CPU runs this code + * with interrupts disabled and will complete the lock + * protected section without being interrupted. So there is no + * real need to boost. 
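A small, assumption-laden sketch of how the completion_done() helper added above could be used to poll instead of block; struct async_op and result_ready() are invented for the example, and since the helper only reads x->done without the wait-queue lock it should be treated as a hint:

        #include <linux/completion.h>

        struct async_op {
                struct completion done;         /* assumed set up with init_completion() */
        };

        static int result_ready(struct async_op *op)
        {
                /*
                 * Non-blocking check; fall back to wait_for_completion()
                 * when completion must be guaranteed before proceeding.
                 */
                return completion_done(&op->done) != 0;
        }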
+ */ + if (unlikely(p == rq->idle)) { + WARN_ON(p != rq->curr); + WARN_ON(p->pi_blocked_on); + goto out_unlock; + } + update_rq_clock(rq); oldprio = p->prio; @@ -4393,6 +4638,9 @@ void rt_mutex_setprio(struct task_struct p->prio = prio; +// trace_special_pid(p->pid, __PRIO(oldprio), PRIO(p)); + prev_resched = _need_resched(); + if (running) p->sched_class->set_curr_task(rq); if (on_rq) { @@ -4400,11 +4648,12 @@ void rt_mutex_setprio(struct task_struct check_class_changed(rq, p, prev_class, oldprio, running); } +// trace_special(prev_resched, _need_resched(), 0); + +out_unlock: task_rq_unlock(rq, &flags); } -#endif - void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, on_rq; @@ -4999,19 +5248,19 @@ asmlinkage long sys_sched_yield(void) * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - __release(rq->lock); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - _raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); + spin_unlock_no_resched(&rq->lock); - schedule(); + __schedule(); + + local_irq_enable(); + preempt_check_resched(); return 0; } static void __cond_resched(void) { -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) __might_sleep(__FILE__, __LINE__); #endif /* @@ -5020,10 +5269,11 @@ static void __cond_resched(void) * cond_resched() call. */ do { + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + __schedule(); } while (need_resched()); + local_irq_enable(); } #if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) @@ -5047,13 +5297,13 @@ EXPORT_SYMBOL(_cond_resched); * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). 
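For reference, a hedged sketch of the usual caller pattern behind the cond_resched_lock() rework just below; the queue structure and drain loop are hypothetical, and the sketch assumes cond_resched_lock() still dispatches to the new __cond_resched_raw_spinlock()/__cond_resched_spinlock() helpers:

        #include <linux/spinlock.h>
        #include <linux/sched.h>

        struct drain_queue {
                spinlock_t lock;
                int pending;
        };

        static void drain_all(struct drain_queue *q)
        {
                spin_lock(&q->lock);
                while (q->pending > 0) {
                        q->pending--;           /* stand-in for real per-item work */
                        /*
                         * Briefly drop q->lock and reschedule if another task
                         * needs the lock or the CPU, then keep draining.
                         */
                        cond_resched_lock(&q->lock);
                }
                spin_unlock(&q->lock);
        }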
*/ -int cond_resched_lock(spinlock_t *lock) +int __cond_resched_raw_spinlock(raw_spinlock_t *lock) { int resched = need_resched() && system_state == SYSTEM_RUNNING; int ret = 0; if (spin_needbreak(lock) || resched) { - spin_unlock(lock); + spin_unlock_no_resched(lock); if (resched && need_resched()) __cond_resched(); else @@ -5063,12 +5313,35 @@ int cond_resched_lock(spinlock_t *lock) } return ret; } -EXPORT_SYMBOL(cond_resched_lock); +EXPORT_SYMBOL(__cond_resched_raw_spinlock); -int __sched cond_resched_softirq(void) +#ifdef CONFIG_PREEMPT_RT + +int __cond_resched_spinlock(spinlock_t *lock) { - BUG_ON(!in_softirq()); +#if (defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)) || defined(CONFIG_PREEMPT_RT) + if (lock->break_lock) { + lock->break_lock = 0; + spin_unlock_no_resched(lock); + __cond_resched(); + spin_lock(lock); + return 1; + } +#endif + return 0; +} +EXPORT_SYMBOL(__cond_resched_spinlock); +#endif + +/* + * Voluntarily preempt a process context that has softirqs disabled: + */ +int __sched cond_resched_softirq(void) +{ +#ifndef CONFIG_PREEMPT_SOFTIRQS + WARN_ON_ONCE(!in_softirq()); +#endif if (need_resched() && system_state == SYSTEM_RUNNING) { local_bh_enable(); __cond_resched(); @@ -5079,17 +5352,102 @@ int __sched cond_resched_softirq(void) } EXPORT_SYMBOL(cond_resched_softirq); +/* + * Voluntarily preempt a softirq context (possible with softirq threading): + */ +int __sched cond_resched_softirq_context(void) +{ + WARN_ON_ONCE(!in_softirq()); + + if (softirq_need_resched() && system_state == SYSTEM_RUNNING) { + raw_local_irq_disable(); + _local_bh_enable(); + raw_local_irq_enable(); + __cond_resched(); + local_bh_disable(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(cond_resched_softirq_context); + +/* + * Preempt a hardirq context if necessary (possible with hardirq threading): + */ +int cond_resched_hardirq_context(void) +{ + WARN_ON_ONCE(!in_irq()); + WARN_ON_ONCE(!irqs_disabled()); + + if (hardirq_need_resched()) { +#ifndef CONFIG_PREEMPT_RT + irq_exit(); +#endif + local_irq_enable(); + __cond_resched(); +#ifndef CONFIG_PREEMPT_RT + local_irq_disable(); + __irq_enter(); +#endif + + return 1; + } + return 0; +} +EXPORT_SYMBOL(cond_resched_hardirq_context); + +#ifdef CONFIG_PREEMPT_VOLUNTARY + +int voluntary_preemption = 1; + +EXPORT_SYMBOL(voluntary_preemption); + +static int __init voluntary_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + voluntary_preemption = 0; + else + get_option(&str, &voluntary_preemption); + if (!voluntary_preemption) + printk("turning off voluntary preemption!\n"); + + return 1; +} + +__setup("voluntary-preempt=", voluntary_preempt_setup); + +#endif + /** * yield - yield the current processor to other threads. * * This is a shortcut for kernel-space yielding - it marks the * thread runnable and calls sys_sched_yield(). */ -void __sched yield(void) +void __sched __yield(void) { set_current_state(TASK_RUNNING); sys_sched_yield(); } + +void __sched yield(void) +{ + static int once = 1; + + /* + * it's a bug to rely on yield() with RT priorities. We print + * the first occurrence after bootup ...
this will still give + * us an idea about the scope of the problem, without spamming + * the syslog: + */ + if (once && rt_task(current)) { + once = 0; + printk(KERN_ERR "BUG: %s:%d RT task yield()-ing!\n", + current->comm, current->pid); + dump_stack(); + } + __yield(); +} EXPORT_SYMBOL(yield); /* @@ -5229,7 +5587,7 @@ out_unlock: return retval; } -static const char stat_nam[] = "RSDTtZX"; +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; void sched_show_task(struct task_struct *p) { @@ -5237,19 +5595,23 @@ void sched_show_task(struct task_struct unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; - printk(KERN_INFO "%-13.13s %c", p->comm, - state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); + printk("%-13.13s %c [%p]", p->comm, + state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?', p); #if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) + if (0 && (state == TASK_RUNNING)) printk(KERN_CONT " running "); else printk(KERN_CONT " %08lx ", thread_saved_pc(p)); #else - if (state == TASK_RUNNING) + if (0 && (state == TASK_RUNNING)) printk(KERN_CONT " running task "); else printk(KERN_CONT " %016lx ", thread_saved_pc(p)); #endif + if (task_curr(p)) + printk("[curr] "); + else if (p->se.on_rq) + printk("[on rq #%d] ", task_cpu(p)); #ifdef CONFIG_DEBUG_STACK_USAGE { unsigned long *n = end_of_stack(p); @@ -5267,6 +5629,7 @@ void sched_show_task(struct task_struct void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; + int do_unlock = 1; #if BITS_PER_LONG == 32 printk(KERN_INFO @@ -5275,7 +5638,16 @@ void show_state_filter(unsigned long sta printk(KERN_INFO " task PC stack pid father\n"); #endif +#ifdef CONFIG_PREEMPT_RT + if (!read_trylock(&tasklist_lock)) { + printk("hm, tasklist_lock write-locked.\n"); + printk("ignoring ...\n"); + do_unlock = 0; + } +#else read_lock(&tasklist_lock); +#endif + do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow @@ -5291,7 +5663,8 @@ void show_state_filter(unsigned long sta #ifdef CONFIG_SCHED_DEBUG sysrq_sched_debug_show(); #endif - read_unlock(&tasklist_lock); + if (do_unlock) + read_unlock(&tasklist_lock); /* * Only show locks if all tasks are dumped: */ @@ -5326,7 +5699,7 @@ void __cpuinit init_idle(struct task_str spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) +#if defined(CONFIG_SMP) idle->oncpu = 1; #endif spin_unlock_irqrestore(&rq->lock, flags); @@ -5454,11 +5827,18 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { struct rq *rq_dest, *rq_src; + unsigned long flags; int ret = 0, on_rq; if (unlikely(cpu_is_offline(dest_cpu))) return ret; + /* + * PREEMPT_RT: this relies on write_lock_irq(&tasklist_lock) + * disabling interrupts - which on PREEMPT_RT does not do: + */ + local_irq_save(flags); + rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); @@ -5482,6 +5862,8 @@ static int __migrate_task(struct task_st ret = 1; out: double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); + return ret; } @@ -5899,6 +6281,36 @@ static void unregister_sched_domain_sysc } #endif +static void set_rq_online(struct rq *rq) +{ + if (!rq->online) { + const struct sched_class *class; + + cpu_set(rq->cpu, rq->rd->online); + rq->online = 1; + + for_each_class(class) { + if (class->rq_online) + class->rq_online(rq); + } + } +} + +static void set_rq_offline(struct rq *rq) +{ + if (rq->online) { + const struct sched_class *class; + + for_each_class(class) 
{ + if (class->rq_offline) + class->rq_offline(rq); + } + + cpu_clear(rq->cpu, rq->rd->online); + rq->online = 0; + } +} + /* * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. @@ -5936,7 +6348,8 @@ migration_call(struct notifier_block *nf spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpu_isset(cpu, rq->rd->span)); - cpu_set(cpu, rq->rd->online); + + set_rq_online(rq); } spin_unlock_irqrestore(&rq->lock, flags); break; @@ -5997,7 +6410,7 @@ migration_call(struct notifier_block *nf spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpu_isset(cpu, rq->rd->span)); - cpu_clear(cpu, rq->rd->online); + set_rq_offline(rq); } spin_unlock_irqrestore(&rq->lock, flags); break; @@ -6192,23 +6605,20 @@ sd_parent_degenerate(struct sched_domain static void rq_attach_root(struct rq *rq, struct root_domain *rd) { unsigned long flags; - const struct sched_class *class; + struct root_domain *reap = NULL; spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { struct root_domain *old_rd = rq->rd; - for (class = sched_class_highest; class; class = class->next) { - if (class->leave_domain) - class->leave_domain(rq); - } + if (cpu_isset(rq->cpu, old_rd->online)) + set_rq_offline(rq); cpu_clear(rq->cpu, old_rd->span); - cpu_clear(rq->cpu, old_rd->online); if (atomic_dec_and_test(&old_rd->refcount)) - kfree(old_rd); + reap = old_rd; } atomic_inc(&rd->refcount); @@ -6216,14 +6626,13 @@ static void rq_attach_root(struct rq *rq cpu_set(rq->cpu, rd->span); if (cpu_isset(rq->cpu, cpu_online_map)) - cpu_set(rq->cpu, rd->online); - - for (class = sched_class_highest; class; class = class->next) { - if (class->join_domain) - class->join_domain(rq); - } + set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); + + /* Don't try to free the memory while in-atomic() */ + if (unlikely(reap)) + kfree(reap); } static void init_rootdomain(struct root_domain *rd) @@ -6232,6 +6641,8 @@ static void init_rootdomain(struct root_ cpus_clear(rd->span); cpus_clear(rd->online); + + cpupri_init(&rd->cpupri); } static void init_defrootdomain(void) @@ -6955,7 +7366,6 @@ static void detach_destroy_domains(const for_each_cpu_mask(i, *cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); - synchronize_sched(); arch_destroy_sched_domains(cpu_map); } @@ -7194,7 +7604,8 @@ static void init_rt_rq(struct rt_rq *rt_ array = &rt_rq->active; for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->queue + i); + INIT_LIST_HEAD(array->xqueue + i); + INIT_LIST_HEAD(array->squeue + i); __clear_bit(i, array->bitmap); } /* delimiter for bitsearch: */ @@ -7277,7 +7688,6 @@ void __init sched_init(void) spin_lock_init(&rq->lock); lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; - rq->clock = 1; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -7289,8 +7699,7 @@ void __init sched_init(void) #endif #ifdef CONFIG_RT_GROUP_SCHED - init_task_group.rt_runtime = - sysctl_sched_rt_runtime * NSEC_PER_USEC; + init_task_group.rt_runtime = global_rt_runtime(); INIT_LIST_HEAD(&rq->leaf_rt_rq_list); init_tg_rt_entry(rq, &init_task_group, &per_cpu(init_rt_rq, i), @@ -7308,6 +7717,7 @@ void __init sched_init(void) rq->next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; + rq->online = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); @@ -7338,6 +7748,9 @@ void __init sched_init(void) atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current); 
+#ifdef CONFIG_PREEMPT_RT + printk("Real-Time Preemption Support (C) 2004-2007 Ingo Molnar\n"); +#endif /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, @@ -7353,7 +7766,7 @@ void __init sched_init(void) scheduler_running = 1; } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) void __might_sleep(char *file, int line) { #ifdef in_atomic @@ -7361,13 +7774,16 @@ void __might_sleep(char *file, int line) if ((in_atomic() || irqs_disabled()) && system_state == SYSTEM_RUNNING && !oops_in_progress) { + if (debug_direct_keyboard && hardirq_count()) + return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; printk(KERN_ERR "BUG: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); + " context %s(%d) at %s:%d\n", + current->comm, current->pid, file, line); + printk("in_atomic():%d [%08x], irqs_disabled():%d\n", + in_atomic(), preempt_count(), irqs_disabled()); debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); @@ -7382,6 +7798,7 @@ EXPORT_SYMBOL(__might_sleep); static void normalize_task(struct rq *rq, struct task_struct *p) { int on_rq; + update_rq_clock(rq); on_rq = p->se.on_rq; if (on_rq) @@ -7413,7 +7830,6 @@ void normalize_rt_tasks(void) p->se.sleep_start = 0; p->se.block_start = 0; #endif - task_rq(p)->clock = 0; if (!rt_task(p)) { /* Index: linux-2.6.25.8-rt7/kernel/rcupdate.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/rcupdate.c 2008-06-23 19:13:01.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/rcupdate.c 2008-06-23 19:13:02.000000000 -0400 @@ -85,8 +85,11 @@ void synchronize_rcu(void) /* Will wake me after RCU finished */ call_rcu(&rcu.head, wakeme_after_rcu); + rcu_boost_readers(); + /* Wait for it */ wait_for_completion(&rcu.completion); + rcu_unboost_readers(); } EXPORT_SYMBOL_GPL(synchronize_rcu); Index: linux-2.6.25.8-rt7/include/linux/rcuclassic.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/rcuclassic.h 2008-06-23 19:13:01.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/rcuclassic.h 2008-06-23 19:13:48.000000000 -0400 @@ -51,7 +51,7 @@ struct rcu_ctrlblk { int signaled; - spinlock_t lock ____cacheline_internodealigned_in_smp; + raw_spinlock_t lock ____cacheline_internodealigned_in_smp; cpumask_t cpumask; /* CPUs that need to switch in order */ /* for current batch to proceed. 
*/ } ____cacheline_internodealigned_in_smp; @@ -162,6 +162,10 @@ extern long rcu_batches_completed_bh(voi #define rcu_enter_nohz() do { } while (0) #define rcu_exit_nohz() do { } while (0) +#define rcu_preempt_boost_init() do { } while (0) + +struct softirq_action; +extern void rcu_process_callbacks(struct softirq_action *unused); #endif /* __KERNEL__ */ #endif /* __LINUX_RCUCLASSIC_H */ Index: linux-2.6.25.8-rt7/kernel/rcuclassic.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/rcuclassic.c 2008-06-23 19:13:01.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/rcuclassic.c 2008-06-23 19:13:48.000000000 -0400 @@ -60,13 +60,13 @@ EXPORT_SYMBOL_GPL(rcu_lock_map); static struct rcu_ctrlblk rcu_ctrlblk = { .cur = -300, .completed = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), + .lock = RAW_SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), + .lock = RAW_SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; @@ -402,6 +402,8 @@ static void rcu_offline_cpu(int cpu) static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { *rdp->donetail = rdp->curlist; rdp->donetail = rdp->curtail; @@ -410,12 +412,12 @@ static void __rcu_process_callbacks(stru } if (rdp->nxtlist && !rdp->curlist) { - local_irq_disable(); + local_irq_save(flags); rdp->curlist = rdp->nxtlist; rdp->curtail = rdp->nxttail; rdp->nxtlist = NULL; rdp->nxttail = &rdp->nxtlist; - local_irq_enable(); + local_irq_restore(flags); /* * start the next batch of callbacks @@ -442,7 +444,7 @@ static void __rcu_process_callbacks(stru rcu_do_batch(rdp); } -static void rcu_process_callbacks(struct softirq_action *unused) +void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); @@ -497,6 +499,17 @@ int rcu_needs_cpu(int cpu) return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); } +void rcu_advance_callbacks(int cpu, int user) +{ + if (user || + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + rcu_qsctr_inc(cpu); + rcu_bh_qsctr_inc(cpu); + } else if (!in_softirq()) + rcu_bh_qsctr_inc(cpu); +} + void rcu_check_callbacks(int cpu, int user) { if (user || Index: linux-2.6.25.8-rt7/kernel/rcutorture.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/rcutorture.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/rcutorture.c 2008-06-23 19:13:59.000000000 -0400 @@ -52,11 +52,13 @@ MODULE_AUTHOR("Paul E. 
McKenney rtort_rcu, rcu_torture_cb); } +static unsigned long rcu_torture_preempt_errors; + +static int rcu_torture_preempt(void *arg) +{ + int completedstart; + int err; + time_t gcstart; + struct sched_param sp; + + sp.sched_priority = 1; + err = sched_setscheduler(current, SCHED_RR, &sp); + if (err != 0) + printk(KERN_ALERT "rcu_torture_preempt() priority err: %d\n", + err); + current->flags |= PF_NOFREEZE; + + do { + completedstart = rcu_torture_completed(); + gcstart = xtime.tv_sec; + while ((xtime.tv_sec - gcstart < 10) && + (rcu_torture_completed() == completedstart)) + cond_resched(); + if (rcu_torture_completed() == completedstart) + rcu_torture_preempt_errors++; + schedule_timeout_interruptible(1); + } while (!kthread_should_stop()); + return 0; +} + +static long rcu_preempt_start(void) +{ + long retval = 0; + int i; + + rcu_preempt_tasks = kzalloc(nrealpreempthogs * sizeof(rcu_preempt_tasks[0]), + GFP_KERNEL); + if (rcu_preempt_tasks == NULL) { + VERBOSE_PRINTK_ERRSTRING("out of memory"); + retval = -ENOMEM; + goto out; + } + + for (i=0; i < nrealpreempthogs; i++) { + rcu_preempt_tasks[i] = kthread_run(rcu_torture_preempt, NULL, + "rcu_torture_preempt"); + if (IS_ERR(rcu_preempt_tasks[i])) { + VERBOSE_PRINTK_ERRSTRING("Failed to create preempter"); + retval = PTR_ERR(rcu_preempt_tasks[i]); + rcu_preempt_tasks[i] = NULL; + break; + } + } + out: + return retval; +} + +static void rcu_preempt_end(void) +{ + int i; + if (rcu_preempt_tasks) { + for (i=0; i < nrealpreempthogs; i++) { + if (rcu_preempt_tasks[i] != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_preempt task"); + kthread_stop(rcu_preempt_tasks[i]); + } + rcu_preempt_tasks[i] = NULL; + } + kfree(rcu_preempt_tasks); + } +} + +static int rcu_preempt_stats(char *page) +{ + return sprintf(page, + "Preemption stalls: %lu\n", rcu_torture_preempt_errors); +} + static struct rcu_torture_ops rcu_ops = { - .init = NULL, - .cleanup = NULL, .readlock = rcu_torture_read_lock, .readdelay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, .completed = rcu_torture_completed, .deferredfree = rcu_torture_deferred_free, .sync = synchronize_rcu, - .stats = NULL, + .preemptstart = rcu_preempt_start, + .preemptend = rcu_preempt_end, + .stats = rcu_preempt_stats, .name = "rcu" }; @@ -296,14 +381,12 @@ static void rcu_sync_torture_init(void) static struct rcu_torture_ops rcu_sync_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = rcu_torture_read_lock, .readdelay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, .completed = rcu_torture_completed, .deferredfree = rcu_sync_torture_deferred_free, .sync = synchronize_rcu, - .stats = NULL, .name = "rcu_sync" }; @@ -355,28 +438,23 @@ static void rcu_bh_torture_synchronize(v } static struct rcu_torture_ops rcu_bh_ops = { - .init = NULL, - .cleanup = NULL, .readlock = rcu_bh_torture_read_lock, .readdelay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, .completed = rcu_bh_torture_completed, .deferredfree = rcu_bh_torture_deferred_free, .sync = rcu_bh_torture_synchronize, - .stats = NULL, .name = "rcu_bh" }; static struct rcu_torture_ops rcu_bh_sync_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = rcu_bh_torture_read_lock, .readdelay = rcu_read_delay, /* just reuse rcu's version. 
*/ .readunlock = rcu_bh_torture_read_unlock, .completed = rcu_bh_torture_completed, .deferredfree = rcu_sync_torture_deferred_free, .sync = rcu_bh_torture_synchronize, - .stats = NULL, .name = "rcu_bh_sync" }; @@ -488,14 +566,12 @@ static void sched_torture_synchronize(vo static struct rcu_torture_ops sched_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = sched_torture_read_lock, .readdelay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, .completed = sched_torture_completed, .deferredfree = rcu_sync_torture_deferred_free, .sync = sched_torture_synchronize, - .stats = NULL, .name = "sched" }; @@ -550,10 +626,20 @@ rcu_torture_writer(void *arg) static int rcu_torture_fakewriter(void *arg) { + struct sched_param sp; + long id = (long) arg; + int err; DEFINE_RCU_RANDOM(rand); VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); - set_user_nice(current, 19); + /* + * Set up at a higher prio than the readers. + */ + sp.sched_priority = 1 + id; + err = sched_setscheduler(current, SCHED_RR, &sp); + if (err != 0) + printk(KERN_ALERT "rcu_torture_writer() priority err: %d\n", + err); do { schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); @@ -592,7 +678,7 @@ rcu_torture_reader(void *arg) if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); - schedule_timeout_interruptible(HZ); + schedule_timeout_interruptible(round_jiffies_relative(HZ)); continue; } if (p->rtort_mbtest == 0) @@ -786,10 +872,13 @@ rcu_torture_print_module_parms(char *tag { printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d nfakewriters=%d " + "npreempthogs=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval = %d\n", + "shuffle_interval=%d preempt_torture=%d\n", torture_type, tag, nrealreaders, nfakewriters, - stat_interval, verbose, test_no_idle_hz, shuffle_interval); + nrealpreempthogs, + stat_interval, verbose, test_no_idle_hz, shuffle_interval, + preempt_torture); } static void @@ -842,6 +931,8 @@ rcu_torture_cleanup(void) kthread_stop(stats_task); } stats_task = NULL; + if (preempt_torture && (cur_ops->preemptend != NULL)) + cur_ops->preemptend(); /* Wait for all RCU callbacks to fire. */ rcu_barrier(); @@ -859,7 +950,7 @@ rcu_torture_cleanup(void) static int __init rcu_torture_init(void) { - int i; + long i; int cpu; int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = @@ -887,6 +978,12 @@ rcu_torture_init(void) rcu_torture_print_module_parms("Start of test"); fullstop = 0; + if (npreempthogs >= 0) + nrealpreempthogs = npreempthogs; + else + nrealpreempthogs = num_online_cpus() == 1 ? 1 : + num_online_cpus() - 1; + /* Set up the freelist. 
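Both rcu_torture_fakewriter() and the new rcu_torture_preempt() kthread above promote themselves to SCHED_RR via sched_setscheduler() so they can preempt the reader threads. A userspace analogue of that call, purely as an illustration (needs CAP_SYS_NICE or root to succeed):

#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 1 };

        /* Same request the torture threads make in the kernel:
         * round-robin real-time class, lowest RT priority. */
        if (sched_setscheduler(0, SCHED_RR, &sp) != 0) {
                perror("sched_setscheduler");
                return 1;
        }
        printf("running SCHED_RR at priority %d\n", sp.sched_priority);
        return 0;
}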
*/ INIT_LIST_HEAD(&rcu_torture_freelist); @@ -934,7 +1031,7 @@ rcu_torture_init(void) } for (i = 0; i < nfakewriters; i++) { VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); - fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, + fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, (void*)i, "rcu_torture_fakewriter"); if (IS_ERR(fakewriter_tasks[i])) { firsterr = PTR_ERR(fakewriter_tasks[i]); @@ -984,6 +1081,11 @@ rcu_torture_init(void) goto unwind; } } + if (preempt_torture && (cur_ops->preemptstart != NULL)) { + firsterr = cur_ops->preemptstart(); + if (firsterr != 0) + goto unwind; + } return 0; unwind: Index: linux-2.6.25.8-rt7/kernel/time/timekeeping.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/time/timekeeping.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/time/timekeeping.c 2008-06-23 19:13:49.000000000 -0400 @@ -24,8 +24,9 @@ * This read-write spinlock protects us from races in SMP while * playing with xtime and avenrun. */ -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); +__cacheline_aligned_in_smp DEFINE_RAW_SEQLOCK(xtime_lock); +EXPORT_SYMBOL_GPL(xtime_lock); /* * The current time @@ -45,6 +46,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOC struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static unsigned long total_sleep_time; /* seconds */ +EXPORT_SYMBOL_GPL(xtime); static struct timespec xtime_cache __attribute__ ((aligned (16))); void update_xtime_cache(u64 nsec) Index: linux-2.6.25.8-rt7/include/linux/srcu.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/srcu.h 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/srcu.h 2008-06-23 19:13:04.000000000 -0400 @@ -27,6 +27,8 @@ #ifndef _LINUX_SRCU_H #define _LINUX_SRCU_H +#include + struct srcu_struct_array { int c[2]; }; @@ -50,4 +52,24 @@ void srcu_read_unlock(struct srcu_struct void synchronize_srcu(struct srcu_struct *sp); long srcu_batches_completed(struct srcu_struct *sp); +/* + * fully compatible with srcu, but optimized for writers. 
+ */ + +struct qrcu_struct { + int completed; + atomic_t ctr[2]; + wait_queue_head_t wq; + struct mutex mutex; +}; + +int init_qrcu_struct(struct qrcu_struct *qp); +int qrcu_read_lock(struct qrcu_struct *qp); +void qrcu_read_unlock(struct qrcu_struct *qp, int idx); +void synchronize_qrcu(struct qrcu_struct *qp); + +static inline void cleanup_qrcu_struct(struct qrcu_struct *qp) +{ +} + #endif Index: linux-2.6.25.8-rt7/kernel/srcu.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/srcu.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/srcu.c 2008-06-23 19:13:04.000000000 -0400 @@ -255,3 +255,89 @@ EXPORT_SYMBOL_GPL(srcu_read_lock); EXPORT_SYMBOL_GPL(srcu_read_unlock); EXPORT_SYMBOL_GPL(synchronize_srcu); EXPORT_SYMBOL_GPL(srcu_batches_completed); + +int init_qrcu_struct(struct qrcu_struct *qp) +{ + qp->completed = 0; + atomic_set(qp->ctr + 0, 1); + atomic_set(qp->ctr + 1, 0); + init_waitqueue_head(&qp->wq); + mutex_init(&qp->mutex); + + return 0; +} + +int qrcu_read_lock(struct qrcu_struct *qp) +{ + for (;;) { + int idx = qp->completed & 0x1; + if (likely(atomic_inc_not_zero(qp->ctr + idx))) + return idx; + } +} + +void qrcu_read_unlock(struct qrcu_struct *qp, int idx) +{ + if (atomic_dec_and_test(qp->ctr + idx)) + wake_up(&qp->wq); +} + +void synchronize_qrcu(struct qrcu_struct *qp) +{ + int idx; + + smp_mb(); /* Force preceding change to happen before fastpath check. */ + + /* + * Fastpath: If the two counters sum to "1" at a given point in + * time, there are no readers. However, it takes two separate + * loads to sample both counters, which won't occur simultaneously. + * So we might race with a counter switch, so that we might see + * ctr[0]==0, then the counter might switch, then we might see + * ctr[1]==1 (unbeknownst to us because there is a reader still + * there). So we do a read memory barrier and recheck. If the + * same race happens again, there must have been a second counter + * switch. This second counter switch could not have happened + * until all preceding readers finished, so if the condition + * is true both times, we may safely proceed. + * + * This relies critically on the atomic increment and atomic + * decrement being seen as executing in order. + */ + + if (atomic_read(&qp->ctr[0]) + atomic_read(&qp->ctr[1]) <= 1) { + smp_rmb(); /* Keep two checks independent. */ + if (atomic_read(&qp->ctr[0]) + atomic_read(&qp->ctr[1]) <= 1) + goto out; + } + + mutex_lock(&qp->mutex); + + idx = qp->completed & 0x1; + if (atomic_read(qp->ctr + idx) == 1) + goto out_unlock; + + atomic_inc(qp->ctr + (idx ^ 0x1)); + + /* + * Prevent subsequent decrement from being seen before previous + * increment -- such an inversion could cause the fastpath + * above to falsely conclude that there were no readers. Also, + * reduce the likelihood that qrcu_read_lock() will loop. + */ + + smp_mb__after_atomic_inc(); + qp->completed++; + + atomic_dec(qp->ctr + idx); + __wait_event(qp->wq, !atomic_read(qp->ctr + idx)); +out_unlock: + mutex_unlock(&qp->mutex); +out: + smp_mb(); /* force subsequent free after qrcu_read_unlock(). 
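As a hedged usage sketch of the QRCU primitives added above (kernel-style, not part of the patch): a reader pins one of the two counters around its dereference, while an updater publishes a new pointer and blocks in synchronize_qrcu() before freeing the old one. struct my_data, my_qrcu and global_ptr are hypothetical names, and init_qrcu_struct(&my_qrcu) is assumed to have run once at init time.

struct my_data {
        int value;
};

static struct qrcu_struct my_qrcu;      /* init_qrcu_struct(&my_qrcu) at init */
static struct my_data *global_ptr;      /* hypothetical QRCU-protected pointer */

static int reader_sample(void)
{
        struct my_data *p;
        int idx, val;

        idx = qrcu_read_lock(&my_qrcu);         /* pins ctr[idx] */
        p = rcu_dereference(global_ptr);
        val = p ? p->value : 0;
        qrcu_read_unlock(&my_qrcu, idx);        /* may wake a waiting updater */
        return val;
}

static void updater_replace(struct my_data *new)
{
        struct my_data *old = global_ptr;

        rcu_assign_pointer(global_ptr, new);
        synchronize_qrcu(&my_qrcu);     /* wait for readers of old to drain */
        kfree(old);
}

The writer-side optimization advertised in the srcu.h comment above is the fastpath in synchronize_qrcu(): when the two counters sum to one there are no readers, and the updater never takes the mutex.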
*/ +} + +EXPORT_SYMBOL_GPL(init_qrcu_struct); +EXPORT_SYMBOL_GPL(qrcu_read_lock); +EXPORT_SYMBOL_GPL(qrcu_read_unlock); +EXPORT_SYMBOL_GPL(synchronize_qrcu); Index: linux-2.6.25.8-rt7/kernel/sched_cpupri.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/sched_cpupri.c 2008-06-23 19:13:04.000000000 -0400 @@ -0,0 +1,174 @@ +/* + * kernel/sched_cpupri.c + * + * CPU priority management + * + * Copyright (C) 2007-2008 Novell + * + * Author: Gregory Haskins + * + * This code tracks the priority of each CPU so that global migration + * decisions are easy to calculate. Each CPU can be in a state as follows: + * + * (INVALID), IDLE, NORMAL, RT1, ... RT99 + * + * going from the lowest priority to the highest. CPUs in the INVALID state + * are not eligible for routing. The system maintains this state with + * a 2 dimensional bitmap (the first for priority class, the second for cpus + * in that class). Therefore a typical application without affinity + * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit + * searches). For tasks with affinity restrictions, the algorithm has a + * worst case complexity of O(min(102, nr_domcpus)), though the scenario that + * yields the worst case search is fairly contrived. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include "sched_cpupri.h" + +/* Convert between a 140 based task->prio, and our 102 based cpupri */ +static int convert_prio(int prio) +{ + int cpupri; + + if (prio == CPUPRI_INVALID) + cpupri = CPUPRI_INVALID; + else if (prio == MAX_PRIO) + cpupri = CPUPRI_IDLE; + else if (prio >= MAX_RT_PRIO) + cpupri = CPUPRI_NORMAL; + else + cpupri = MAX_RT_PRIO - prio + 1; + + return cpupri; +} + +#define for_each_cpupri_active(array, idx) \ + for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ + idx < CPUPRI_NR_PRIORITIES; \ + idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1)) + +/** + * cpupri_find - find the best (lowest-pri) CPU in the system + * @cp: The cpupri context + * @p: The task + * @lowest_mask: A mask to fill in with selected CPUs + * + * Note: This function returns the recommended CPUs as calculated during the + * current invokation. By the time the call returns, the CPUs may have in + * fact changed priorities any number of times. While not ideal, it is not + * an issue of correctness since the normal rebalancer logic will correct + * any discrepancies created by racing against the uncertainty of the current + * priority configuration. 
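The sched_cpupri.c header above describes the structure as a bitmap of non-empty priority classes plus a cpumask per class, so routing a task costs roughly two bit searches. A self-contained toy model of that lookup (userspace C, 64 CPUs, a plain loop standing in for find_next_bit; illustrative only, not the kernel code):

#include <stdint.h>

#define TOY_NR_CLASSES  102     /* IDLE, NORMAL and the 100 RT levels */
#define TOY_NR_CPUS     64

static uint64_t class_active[(TOY_NR_CLASSES + 63) / 64]; /* non-empty classes */
static uint64_t class_cpus[TOY_NR_CLASSES];               /* CPUs in each class */

/* Return a CPU sitting in a lower priority class than task_class and
 * inside the task's affinity mask, or -1 if there is none. */
static int toy_find_lowest_cpu(uint64_t affinity, int task_class)
{
        int c;

        for (c = 0; c < task_class; c++) {      /* lowest class first */
                uint64_t hit;

                if (!(class_active[c / 64] & (1ULL << (c % 64))))
                        continue;               /* class is empty */
                hit = class_cpus[c] & affinity;
                if (hit)
                        return __builtin_ctzll(hit);    /* first matching CPU */
        }
        return -1;
}

In the real code, cpupri_set() below keeps the equivalents of class_active and class_cpus (pri_active and pri_to_cpu) in sync as each CPU's highest RT priority changes.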
+ * + * Returns: (int)bool - CPUs were found + */ +int cpupri_find(struct cpupri *cp, struct task_struct *p, + cpumask_t *lowest_mask) +{ + int idx = 0; + int task_pri = convert_prio(p->prio); + + for_each_cpupri_active(cp->pri_active, idx) { + struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; + cpumask_t mask; + + if (idx >= task_pri) + break; + + cpus_and(mask, p->cpus_allowed, vec->mask); + + if (cpus_empty(mask)) + continue; + + *lowest_mask = mask; + return 1; + } + + return 0; +} + +/** + * cpupri_set - update the cpu priority setting + * @cp: The cpupri context + * @cpu: The target cpu + * @pri: The priority (INVALID-RT99) to assign to this CPU + * + * Note: Assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpupri_set(struct cpupri *cp, int cpu, int newpri) +{ + int *currpri = &cp->cpu_to_pri[cpu]; + int oldpri = *currpri; + unsigned long flags; + + newpri = convert_prio(newpri); + + BUG_ON(newpri >= CPUPRI_NR_PRIORITIES); + + if (newpri == oldpri) + return; + + /* + * If the cpu was currently mapped to a different value, we + * first need to unmap the old value + */ + if (likely(oldpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; + + spin_lock_irqsave(&vec->lock, flags); + + vec->count--; + if (!vec->count) + clear_bit(oldpri, cp->pri_active); + cpu_clear(cpu, vec->mask); + + spin_unlock_irqrestore(&vec->lock, flags); + } + + if (likely(newpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; + + spin_lock_irqsave(&vec->lock, flags); + + cpu_set(cpu, vec->mask); + vec->count++; + if (vec->count == 1) + set_bit(newpri, cp->pri_active); + + spin_unlock_irqrestore(&vec->lock, flags); + } + + *currpri = newpri; +} + +/** + * cpupri_init - initialize the cpupri structure + * @cp: The cpupri context + * + * Returns: (void) + */ +void cpupri_init(struct cpupri *cp) +{ + int i; + + memset(cp, 0, sizeof(*cp)); + + for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { + struct cpupri_vec *vec = &cp->pri_to_cpu[i]; + + spin_lock_init(&vec->lock); + vec->count = 0; + cpus_clear(vec->mask); + } + + for_each_possible_cpu(i) + cp->cpu_to_pri[i] = CPUPRI_INVALID; +} + + Index: linux-2.6.25.8-rt7/kernel/sched_cpupri.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/sched_cpupri.h 2008-06-23 19:14:36.000000000 -0400 @@ -0,0 +1,36 @@ +#ifndef _LINUX_CPUPRI_H +#define _LINUX_CPUPRI_H + +#include + +#define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO +#define CPUPRI_NR_PRI_WORDS (CPUPRI_NR_PRIORITIES + BITS_PER_LONG/2)/BITS_PER_LONG + +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 +/* values 2-101 are RT priorities 0-99 */ + +struct cpupri_vec { + raw_spinlock_t lock; + int count; + cpumask_t mask; +}; + +struct cpupri { + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + long pri_active[CPUPRI_NR_PRI_WORDS]; + int cpu_to_pri[NR_CPUS]; +}; + +#ifdef CONFIG_SMP +int cpupri_find(struct cpupri *cp, + struct task_struct *p, cpumask_t *lowest_mask); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +void cpupri_init(struct cpupri *cp); +#else +#define cpupri_set(cp, cpu, pri) do { } while (0) +#define cpupri_init() do { } while (0) +#endif + +#endif /* _LINUX_CPUPRI_H */ Index: linux-2.6.25.8-rt7/kernel/sched_rt.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/sched_rt.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/sched_rt.c 2008-06-23 
19:14:36.000000000 -0400 @@ -12,6 +12,9 @@ static inline int rt_overloaded(struct r static inline void rt_set_overload(struct rq *rq) { + if (!rq->online) + return; + cpu_set(rq->cpu, rq->rd->rto_mask); /* * Make sure the mask is visible before we set @@ -26,6 +29,9 @@ static inline void rt_set_overload(struc static inline void rt_clear_overload(struct rq *rq) { + if (!rq->online) + return; + /* the order here really doesn't matter */ atomic_dec(&rq->rd->rto_count); cpu_clear(rq->cpu, rq->rd->rto_mask); @@ -33,6 +39,9 @@ static inline void rt_clear_overload(str static void update_rt_migration(struct rq *rq) { + if (unlikely(num_online_cpus() == 1)) + return; + if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) { if (!rq->rt.overloaded) { rt_set_overload(rq); @@ -112,6 +121,7 @@ static void sched_rt_rq_dequeue(struct r static inline int rt_rq_throttled(struct rt_rq *rt_rq) { + return 0; return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; } @@ -171,6 +181,7 @@ static inline void sched_rt_rq_dequeue(s static inline int rt_rq_throttled(struct rt_rq *rt_rq) { + return 0; return rt_rq->rt_throttled; } #endif @@ -191,6 +202,7 @@ static int sched_rt_runtime_exceeded(str { u64 runtime = sched_rt_runtime(rt_rq); + return 0; if (runtime == RUNTIME_INF) return 0; @@ -217,6 +229,7 @@ static void update_sched_rt_period(struc struct rt_rq *rt_rq; u64 period; + return; while (rq->clock > rq->rt_period_expire) { period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; rq->rt_period_expire += period; @@ -259,7 +272,7 @@ static void update_curr_rt(struct rq *rq curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); - rt_rq->rt_time += delta_exec; +// rt_rq->rt_time += delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) resched_task(curr); } @@ -270,8 +283,14 @@ void inc_rt_tasks(struct sched_rt_entity WARN_ON(!rt_prio(rt_se_prio(rt_se))); rt_rq->rt_nr_running++; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - if (rt_se_prio(rt_se) < rt_rq->highest_prio) + if (rt_se_prio(rt_se) < rt_rq->highest_prio) { + struct rq *rq = rq_of_rt_rq(rt_rq); rt_rq->highest_prio = rt_se_prio(rt_se); + + if (rq->online) + cpupri_set(&rq->rd->cpupri, rq->cpu, + rt_se_prio(rt_se)); + } #endif #ifdef CONFIG_SMP if (rt_se->nr_cpus_allowed > 1) { @@ -290,6 +309,10 @@ void inc_rt_tasks(struct sched_rt_entity static inline void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { +#ifdef CONFIG_SMP + int highest_prio = rt_rq->highest_prio; +#endif + WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); rt_rq->rt_nr_running--; @@ -310,9 +333,18 @@ void dec_rt_tasks(struct sched_rt_entity #ifdef CONFIG_SMP if (rt_se->nr_cpus_allowed > 1) { struct rq *rq = rq_of_rt_rq(rt_rq); + BUG_ON(!rq->rt.rt_nr_migratory); rq->rt.rt_nr_migratory--; } + if (rt_rq->highest_prio != highest_prio) { + struct rq *rq = rq_of_rt_rq(rt_rq); + + if (rq->online) + cpupri_set(&rq->rd->cpupri, rq->cpu, + rt_rq->highest_prio); + } + update_rt_migration(rq_of_rt_rq(rt_rq)); #endif /* CONFIG_SMP */ #ifdef CONFIG_RT_GROUP_SCHED @@ -332,7 +364,13 @@ static void enqueue_rt_entity(struct sch if (group_rq && rt_rq_throttled(group_rq)) return; - list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); + if (rt_se->nr_cpus_allowed == 1) + list_add_tail(&rt_se->run_list, + array->xqueue + rt_se_prio(rt_se)); + else + list_add_tail(&rt_se->run_list, + array->squeue + rt_se_prio(rt_se)); + __set_bit(rt_se_prio(rt_se), array->bitmap); inc_rt_tasks(rt_se, rt_rq); @@ -344,7 +382,8 @@ static void 
dequeue_rt_entity(struct sch struct rt_prio_array *array = &rt_rq->active; list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) + if (list_empty(array->squeue + rt_se_prio(rt_se)) + && list_empty(array->xqueue + rt_se_prio(rt_se))) __clear_bit(rt_se_prio(rt_se), array->bitmap); dec_rt_tasks(rt_se, rt_rq); @@ -376,6 +415,55 @@ static void dequeue_rt_stack(struct task } while (top_se); } +static inline void incr_rt_nr_uninterruptible(struct task_struct *p, + struct rq *rq) +{ + rq->rt.rt_nr_uninterruptible++; +} + +static inline void decr_rt_nr_uninterruptible(struct task_struct *p, + struct rq *rq) +{ + rq->rt.rt_nr_uninterruptible--; +} + +unsigned long rt_nr_running(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->rt.rt_nr_running; + + return sum; +} + +unsigned long rt_nr_running_cpu(int cpu) +{ + return cpu_rq(cpu)->rt.rt_nr_running; +} + +unsigned long rt_nr_uninterruptible(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->rt.rt_nr_uninterruptible; + + /* + * Since we read the counters lockless, it might be slightly + * inaccurate. Do not allow it to go below zero though: + */ + if (unlikely((long)sum < 0)) + sum = 0; + + return sum; +} + +unsigned long rt_nr_uninterruptible_cpu(int cpu) +{ + return cpu_rq(cpu)->rt.rt_nr_uninterruptible; +} + /* * Adding/removing a task to/from a priority array: */ @@ -393,6 +481,9 @@ static void enqueue_task_rt(struct rq *r */ for_each_sched_rt_entity(rt_se) enqueue_rt_entity(rt_se); + + if (p->state == TASK_UNINTERRUPTIBLE) + decr_rt_nr_uninterruptible(p, rq); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) @@ -402,6 +493,9 @@ static void dequeue_task_rt(struct rq *r update_curr_rt(rq); + if (p->state == TASK_UNINTERRUPTIBLE) + incr_rt_nr_uninterruptible(p, rq); + dequeue_rt_stack(p); /* @@ -417,13 +511,19 @@ static void dequeue_task_rt(struct rq *r /* * Put task to the end of the run list without the overhead of dequeue * followed by enqueue. + * + * Note: We always enqueue the task to the shared-queue, regardless of its + * previous position w.r.t. exclusive vs shared. This is so that exclusive RR + * tasks fairly round-robin with all tasks on the runqueue, not just other + * exclusive tasks. */ static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) { struct rt_prio_array *array = &rt_rq->active; - list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); + list_del_init(&rt_se->run_list); + list_add_tail(&rt_se->run_list, array->squeue + rt_se_prio(rt_se)); } static void requeue_task_rt(struct rq *rq, struct task_struct *p) @@ -481,13 +581,46 @@ static int select_task_rq_rt(struct task } #endif /* CONFIG_SMP */ +static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, + struct rt_rq *rt_rq); + /* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) { - if (p->prio < rq->curr->prio) + if (p->prio < rq->curr->prio) { resched_task(rq->curr); + return; + } + +#ifdef CONFIG_SMP + /* + * If: + * + * - the newly woken task is of equal priority to the current task + * - the newly woken task is non-migratable while current is migratable + * - current will be preempted on the next reschedule + * + * we should check to see if current can readily move to a different + * cpu. 
If so, we will reschedule to allow the push logic to try + * to move current somewhere else, making room for our non-migratable + * task. + */ + if((p->prio == rq->curr->prio) + && p->rt.nr_cpus_allowed == 1 + && rq->curr->rt.nr_cpus_allowed != 1 + && pick_next_rt_entity(rq, &rq->rt) != &rq->curr->rt) { + cpumask_t mask; + + if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) + /* + * There appears to be other cpus that can accept + * current, so lets reschedule to try and push it away + */ + resched_task(rq->curr); + } +#endif } static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, @@ -501,8 +634,15 @@ static struct sched_rt_entity *pick_next idx = sched_find_first_bit(array->bitmap); BUG_ON(idx >= MAX_RT_PRIO); - queue = array->queue + idx; - next = list_entry(queue->next, struct sched_rt_entity, run_list); + queue = array->xqueue + idx; + if (!list_empty(queue)) + next = list_entry(queue->next, struct sched_rt_entity, + run_list); + else { + queue = array->squeue + idx; + next = list_entry(queue->next, struct sched_rt_entity, + run_list); + } return next; } @@ -572,7 +712,7 @@ static struct task_struct *pick_next_hig continue; if (next && next->prio < idx) continue; - list_for_each_entry(rt_se, array->queue + idx, run_list) { + list_for_each_entry(rt_se, array->squeue + idx, run_list) { struct task_struct *p = rt_task_of(rt_se); if (pick_rt_task(rq, p, cpu)) { next = p; @@ -590,73 +730,6 @@ static struct task_struct *pick_next_hig static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); -static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) -{ - int lowest_prio = -1; - int lowest_cpu = -1; - int count = 0; - int cpu; - - cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed); - - /* - * Scan each rq for the lowest prio. - */ - for_each_cpu_mask(cpu, *lowest_mask) { - struct rq *rq = cpu_rq(cpu); - - /* We look for lowest RT prio or non-rt CPU */ - if (rq->rt.highest_prio >= MAX_RT_PRIO) { - /* - * if we already found a low RT queue - * and now we found this non-rt queue - * clear the mask and set our bit. - * Otherwise just return the queue as is - * and the count==1 will cause the algorithm - * to use the first bit found. - */ - if (lowest_cpu != -1) { - cpus_clear(*lowest_mask); - cpu_set(rq->cpu, *lowest_mask); - } - return 1; - } - - /* no locking for now */ - if ((rq->rt.highest_prio > task->prio) - && (rq->rt.highest_prio >= lowest_prio)) { - if (rq->rt.highest_prio > lowest_prio) { - /* new low - clear old data */ - lowest_prio = rq->rt.highest_prio; - lowest_cpu = cpu; - count = 0; - } - count++; - } else - cpu_clear(cpu, *lowest_mask); - } - - /* - * Clear out all the set bits that represent - * runqueues that were of higher prio than - * the lowest_prio. - */ - if (lowest_cpu > 0) { - /* - * Perhaps we could add another cpumask op to - * zero out bits. Like cpu_zero_bits(cpumask, nrbits); - * Then that could be optimized to use memset and such. 
- */ - for_each_cpu_mask(cpu, *lowest_mask) { - if (cpu >= lowest_cpu) - break; - cpu_clear(cpu, *lowest_mask); - } - } - - return count; -} - static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) { int first; @@ -678,17 +751,12 @@ static int find_lowest_rq(struct task_st cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); - int count = find_lowest_cpus(task, lowest_mask); - if (!count) - return -1; /* No targets found */ + if (task->rt.nr_cpus_allowed == 1) + return -1; /* No other targets possible */ - /* - * There is no sense in performing an optimal search if only one - * target is found. - */ - if (count == 1) - return first_cpu(*lowest_mask); + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + return -1; /* No targets found */ /* * At this point we have built a mask of cpus representing the @@ -839,6 +907,8 @@ static int push_rt_task(struct rq *rq) resched_task(lowest_rq->curr); + schedstat_inc(rq, rto_pushed); + spin_unlock(&lowest_rq->lock); ret = 1; @@ -943,6 +1013,7 @@ static int pull_rt_task(struct rq *this_ */ next = p; + schedstat_inc(src_rq, rto_pulled); } skip: spin_unlock(&src_rq->lock); @@ -954,8 +1025,10 @@ static int pull_rt_task(struct rq *this_ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ - if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) + if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) { pull_rt_task(rq); + schedstat_inc(rq, rto_schedule); + } } static void post_schedule_rt(struct rq *rq) @@ -968,9 +1041,10 @@ static void post_schedule_rt(struct rq * * first via finish_lock_switch and then reaquire it here. */ if (unlikely(rq->rt.overloaded)) { - spin_lock_irq(&rq->lock); + spin_lock(&rq->lock); push_rt_tasks(rq); - spin_unlock_irq(&rq->lock); + schedstat_inc(rq, rto_schedule_tail); + spin_unlock(&rq->lock); } } @@ -978,9 +1052,11 @@ static void post_schedule_rt(struct rq * static void task_wake_up_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && - (p->prio >= rq->rt.highest_prio) && - rq->rt.overloaded) + !test_tsk_need_resched(rq->curr) && + rq->rt.overloaded) { push_rt_tasks(rq); + schedstat_inc(rq, rto_wakeup); + } } static unsigned long @@ -1022,6 +1098,14 @@ static void set_cpus_allowed_rt(struct t } update_rt_migration(rq); + + if (unlikely(weight == 1 || p->rt.nr_cpus_allowed == 1)) + /* + * If either the new or old weight is a "1", we need + * to requeue to properly move between shared and + * exclusive queues. 
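The sched_rt.c hunks above split each RT priority level into an exclusive queue (xqueue, tasks pinned to one CPU) and a shared queue (squeue, migratable tasks): enqueue_rt_entity() chooses by nr_cpus_allowed, pick_next_rt_entity() prefers the exclusive queue, and the push/pull logic scans only the shared one. A toy sketch of that policy (hypothetical types, no locking, FIFO ordering omitted for brevity):

#include <stddef.h>

struct toy_task {
        int nr_cpus_allowed;
        struct toy_task *next;
};

struct toy_prio_level {
        struct toy_task *xqueue;        /* tasks bound to this CPU only */
        struct toy_task *squeue;        /* tasks the push logic may migrate */
};

static void toy_enqueue(struct toy_prio_level *lvl, struct toy_task *t)
{
        struct toy_task **q;

        q = (t->nr_cpus_allowed == 1) ? &lvl->xqueue : &lvl->squeue;
        t->next = *q;
        *q = t;
}

static struct toy_task *toy_pick_next(struct toy_prio_level *lvl)
{
        /* Prefer pinned tasks: they cannot be pushed to another CPU,
         * so equal-priority migratable tasks are left for the push
         * logic to move elsewhere. */
        return lvl->xqueue ? lvl->xqueue : lvl->squeue;
}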
+ */ + requeue_task_rt(rq, p); } p->cpus_allowed = *new_mask; @@ -1029,17 +1113,21 @@ static void set_cpus_allowed_rt(struct t } /* Assumes rq->lock is held */ -static void join_domain_rt(struct rq *rq) +static void rq_online_rt(struct rq *rq) { if (rq->rt.overloaded) rt_set_overload(rq); + + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); } /* Assumes rq->lock is held */ -static void leave_domain_rt(struct rq *rq) +static void rq_offline_rt(struct rq *rq) { if (rq->rt.overloaded) rt_clear_overload(rq); + + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); } /* @@ -1202,8 +1290,8 @@ const struct sched_class rt_sched_class .load_balance = load_balance_rt, .move_one_task = move_one_task_rt, .set_cpus_allowed = set_cpus_allowed_rt, - .join_domain = join_domain_rt, - .leave_domain = leave_domain_rt, + .rq_online = rq_online_rt, + .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, .task_wake_up = task_wake_up_rt, Index: linux-2.6.25.8-rt7/arch/x86/Kconfig =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/Kconfig 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/Kconfig 2008-06-23 19:14:22.000000000 -0400 @@ -18,7 +18,10 @@ config X86_64 ### Arch settings config X86 def_bool y + select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE + select HAVE_DYNAMIC_FTRACE + select HAVE_FTRACE select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES @@ -97,10 +100,18 @@ config DMI def_bool y config RWSEM_GENERIC_SPINLOCK - def_bool !X86_XADD + bool + depends on !X86_XADD || PREEMPT_RT + default y + +config ASM_SEMAPHORES + bool + default y config RWSEM_XCHGADD_ALGORITHM - def_bool X86_XADD + bool + depends on X86_XADD && !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT + default y config ARCH_HAS_ILOG2_U32 def_bool n @@ -1228,6 +1239,10 @@ config HAVE_ARCH_EARLY_PFN_TO_NID def_bool X86_64 depends on NUMA +config HARDIRQS_SW_RESEND + bool + default y + menu "Power management options" depends on !X86_VOYAGER Index: linux-2.6.25.8-rt7/arch/x86/kernel/alternative.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/alternative.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/alternative.c 2008-06-23 19:13:04.000000000 -0400 @@ -141,7 +141,7 @@ static const unsigned char *const p6_nop #ifdef CONFIG_X86_64 extern char __vsyscall_0; -static inline const unsigned char*const * find_nop_table(void) +const unsigned char *const *find_nop_table(void) { return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; @@ -160,7 +160,7 @@ static const struct nop { { -1, NULL } }; -static const unsigned char*const * find_nop_table(void) +const unsigned char *const *find_nop_table(void) { const unsigned char *const *noptable = intel_nops; int i; @@ -177,7 +177,7 @@ static const unsigned char*const * find_ #endif /* CONFIG_X86_64 */ /* Use this to add nops to a buffer, then text_poke the whole buffer. 
*/ -static void add_nops(void *insns, unsigned int len) +void add_nops(void *insns, unsigned int len) { const unsigned char *const *noptable = find_nop_table(); @@ -190,6 +190,7 @@ static void add_nops(void *insns, unsign len -= noplen; } } +EXPORT_SYMBOL_GPL(add_nops); extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern u8 *__smp_locks[], *__smp_locks_end[]; Index: linux-2.6.25.8-rt7/arch/x86/kernel/entry_32.S =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/entry_32.S 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/entry_32.S 2008-06-23 19:13:43.000000000 -0400 @@ -265,14 +265,18 @@ END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) + cmpl $0, kernel_preemption + jz restore_nocheck cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl - jz restore_all + jz restore_nocheck testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all + jz restore_nocheck + DISABLE_INTERRUPTS(CLBR_ANY) + call preempt_schedule_irq jmp need_resched END(resume_kernel) @@ -330,6 +334,11 @@ sysenter_past_esp: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_EVENT_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -345,6 +354,11 @@ sysenter_past_esp: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work +#ifdef CONFIG_EVENT_TRACE + pushl %eax + call sys_ret + popl %eax +#endif /* if something modifies registers it must also disable sysexit */ movl PT_EIP(%esp), %edx movl PT_OLDESP(%esp), %ecx @@ -368,6 +382,11 @@ ENTRY(system_call) pushl %eax # save orig_eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_EVENT_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -467,20 +486,19 @@ ENDPROC(system_call) ALIGN RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jz work_notifysig work_resched: - call schedule LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + call __schedule # setting need_resched or sigpending # between sampling and the iret - TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? 
jz restore_all - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jnz work_resched work_notifysig: # deal with pending signals and @@ -1107,6 +1125,74 @@ ENDPROC(xen_failsafe_callback) #endif /* CONFIG_XEN */ +#ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + +.globl mcount_call +mcount_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax + + ret +END(mcount) + +ENTRY(ftrace_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(mcount) + cmpl $ftrace_stub, ftrace_trace_function + jnz trace +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ + .section .rodata,"a" #include "syscall_table_32.S" Index: linux-2.6.25.8-rt7/arch/x86/kernel/entry_64.S =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/entry_64.S 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/entry_64.S 2008-06-23 19:13:28.000000000 -0400 @@ -54,6 +54,110 @@ .code64 +#ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE +ENTRY(mcount) + + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + +.globl mcount_call +mcount_call: + call ftrace_stub + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + + retq +END(mcount) + +ENTRY(ftrace_caller) + + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + +.globl ftrace_stub +ftrace_stub: + retq +END(ftrace_caller) + +#else /* ! 
CONFIG_DYNAMIC_FTRACE */ +ENTRY(mcount) + cmpq $ftrace_stub, ftrace_trace_function + jnz trace +.globl ftrace_stub +ftrace_stub: + retq + +trace: + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + + call *ftrace_trace_function + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ + +#define HARDNMI_MASK 0x40000000 + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif @@ -249,7 +353,9 @@ ENTRY(system_call_after_swapgs) cmpq $__NR_syscall_max,%rax ja badsys movq %r10,%rcx + TRACE_SYS_CALL call *sys_call_table(,%rax,8) # XXX: rip relative + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) /* * Syscall return path ending with SYSRET (fast path) @@ -281,8 +387,8 @@ sysret_check: /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz sysret_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi @@ -305,7 +411,7 @@ sysret_signal: leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ DISABLE_INTERRUPTS(CLBR_NONE) @@ -330,7 +436,9 @@ tracesys: cmova %rcx,%rax ja 1f movq %r10,%rcx /* fixup for C */ + TRACE_SYS_CALL call *sys_call_table(,%rax,8) + TRACE_SYS_RET 1: movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ @@ -359,8 +467,8 @@ int_with_check: /* First do a reschedule test. */ /* edx: work, edi: workmask */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz int_very_careful TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi @@ -395,7 +503,7 @@ int_signal: movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 call do_notify_resume -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi int_restore_rest: RESTORE_REST DISABLE_INTERRUPTS(CLBR_NONE) @@ -622,8 +730,8 @@ bad_iret: /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz retint_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi @@ -649,7 +757,7 @@ retint_signal: RESTORE_REST DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - movl $_TIF_NEED_RESCHED,%edi + movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi GET_THREAD_INFO(%rcx) jmp retint_check Index: linux-2.6.25.8-rt7/arch/x86/kernel/ftrace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/arch/x86/kernel/ftrace.c 2008-06-23 19:13:04.000000000 -0400 @@ -0,0 +1,160 @@ +/* + * Code for replacing ftrace calls with jumps. + * + * Copyright (C) 2007-2008 Steven Rostedt + * + * Thanks goes to Ingo Molnar, for suggesting the idea. + * Mathieu Desnoyers, for suggesting postponing the modifications. 
+ * Arjan van de Ven, for keeping me straight, and explaining to me + * the dangers of modifying code on the run. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define CALL_BACK 5 + +/* Long is fine, even if it is only 4 bytes ;-) */ +static long *ftrace_nop; + +union ftrace_code_union { + char code[5]; + struct { + char e8; + int offset; + } __attribute__((packed)); +}; + +notrace int ftrace_ip_converted(unsigned long ip) +{ + unsigned long save; + + ip -= CALL_BACK; + save = *(long *)ip; + + return save == *ftrace_nop; +} + +static int notrace ftrace_calc_offset(long ip, long addr) +{ + return (int)(addr - ip); +} + +notrace unsigned char *ftrace_nop_replace(void) +{ + return (char *)ftrace_nop; +} + +notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + static union ftrace_code_union calc; + + calc.e8 = 0xe8; + calc.offset = ftrace_calc_offset(ip, addr); + + /* + * No locking needed, this must be called via kstop_machine + * which in essence is like running on a uniprocessor machine. + */ + return calc.code; +} + +notrace int +ftrace_modify_code(unsigned long ip, unsigned char *old_code, + unsigned char *new_code) +{ + unsigned replaced; + unsigned old = *(unsigned *)old_code; /* 4 bytes */ + unsigned new = *(unsigned *)new_code; /* 4 bytes */ + unsigned char newch = new_code[4]; + int faulted = 0; + + /* move the IP back to the start of the call */ + ip -= CALL_BACK; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. + * + * No real locking needed, this code is run through + * kstop_machine. + */ + asm volatile ( + "1: lock\n" + " cmpxchg %3, (%2)\n" + " jnz 2f\n" + " movb %b4, 4(%2)\n" + "2:\n" + ".section .fixup, \"ax\"\n" + "3: movl $1, %0\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : "=r"(faulted), "=a"(replaced) + : "r"(ip), "r"(new), "r"(newch), + "0"(faulted), "a"(old) + : "memory"); + sync_core(); + + if (replaced != old && replaced != new) + faulted = 2; + + return faulted; +} + +notrace int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + unsigned char old[5], *new; + int ret; + + ip += CALL_BACK; + + memcpy(old, &ftrace_call, 5); + new = ftrace_call_replace(ip, (unsigned long)func); + ret = ftrace_modify_code(ip, old, new); + + return ret; +} + +notrace int ftrace_mcount_set(unsigned long *data) +{ + unsigned long ip = (long)(&mcount_call); + unsigned long *addr = data; + unsigned char old[5], *new; + + /* ip is at the location, but modify code will subtact this */ + ip += CALL_BACK; + + /* + * Replace the mcount stub with a pointer to the + * ip recorder function. 
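ftrace_call_replace() above assembles the 5-byte x86 "call rel32" that dynamic ftrace patches over each mcount call site; the 32-bit displacement is taken relative to the address of the next instruction (site + 5). A standalone illustration of that encoding (userspace C, hypothetical addresses, not the kernel code):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Build the 5-byte x86 "call rel32" instruction for a call placed at
 * address site that should land on address target. */
static void make_call(uint8_t insn[5], uintptr_t site, uintptr_t target)
{
        int32_t rel = (int32_t)(target - (site + 5));

        insn[0] = 0xe8;                 /* CALL rel32 opcode */
        memcpy(&insn[1], &rel, sizeof(rel));
}

int main(void)
{
        uint8_t insn[5];
        int i;

        /* Purely demonstrative addresses. */
        make_call(insn, 0x400100, 0x400200);
        for (i = 0; i < 5; i++)
                printf("%02x ", insn[i]);
        printf("\n");                   /* prints: e8 fb 00 00 00 */
        return 0;
}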
+ */ + memcpy(old, &mcount_call, 5); + new = ftrace_call_replace(ip, *addr); + *addr = ftrace_modify_code(ip, old, new); + + return 0; +} + +int __init ftrace_dyn_arch_init(void *data) +{ + const unsigned char *const *noptable = find_nop_table(); + + /* This is running in kstop_machine */ + + ftrace_mcount_set(data); + + ftrace_nop = (unsigned long *)noptable[CALL_BACK]; + + return 0; +} + Index: linux-2.6.25.8-rt7/arch/x86/kernel/process_32.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/process_32.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/process_32.c 2008-06-23 19:13:43.000000000 -0400 @@ -112,7 +112,7 @@ void default_idle(void) smp_mb(); local_irq_disable(); - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { ktime_t t0, t1; u64 t0n, t1n; @@ -142,7 +142,9 @@ EXPORT_SYMBOL(default_idle); */ static void poll_idle(void) { - cpu_relax(); + do { + cpu_relax(); + } while (!need_resched() && !need_resched_delayed()); } #ifdef CONFIG_HOTPLUG_CPU @@ -186,10 +188,9 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { tick_nohz_stop_sched_tick(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); - check_pgt_cache(); rmb(); idle = pm_idle; @@ -203,12 +204,17 @@ void cpu_idle(void) play_dead(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; + /* Don't trace irqs off for idle */ + stop_critical_timings(); idle(); + start_critical_timings(); } + local_irq_disable(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } @@ -244,10 +250,10 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) __mwait(ax, cx); } } @@ -359,9 +365,10 @@ void __show_registers(struct pt_regs *re regs->ax, regs->bx, regs->cx, regs->dx); printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", regs->si, regs->di, regs->bp, sp); - printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", + printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x" + " preempt:%08x\n", regs->ds & 0xffff, regs->es & 0xffff, - regs->fs & 0xffff, gs, ss); + regs->fs & 0xffff, gs, ss, preempt_count()); if (!all) return; @@ -433,15 +440,23 @@ void exit_thread(void) if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { struct task_struct *tsk = current; struct thread_struct *t = &tsk->thread; - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + void *io_bitmap_ptr = t->io_bitmap_ptr; + int cpu; + struct tss_struct *tss; - kfree(t->io_bitmap_ptr); + /* + * On PREEMPT_RT we must not call kfree() with + * preemption disabled, so we first zap the pointer: + */ t->io_bitmap_ptr = NULL; + kfree(io_bitmap_ptr); + clear_thread_flag(TIF_IO_BITMAP); /* * Careful, clear this in the TSS too: */ + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); t->io_bitmap_max = 0; tss->io_bitmap_owner = NULL; Index: linux-2.6.25.8-rt7/arch/x86/kernel/process_64.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/process_64.c 2008-06-23 19:13:00.000000000 -0400 +++ 
linux-2.6.25.8-rt7/arch/x86/kernel/process_64.c 2008-06-23 19:13:38.000000000 -0400 @@ -106,7 +106,7 @@ void default_idle(void) */ smp_mb(); local_irq_disable(); - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { ktime_t t0, t1; u64 t0n, t1n; @@ -169,7 +169,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { tick_nohz_stop_sched_tick(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); rmb(); @@ -185,7 +185,10 @@ void cpu_idle(void) */ local_irq_disable(); enter_idle(); + /* Don't trace irqs off for idle */ + stop_critical_timings(); idle(); + start_critical_timings(); /* In many cases the interrupt that ended idle has already called exit_idle. But some idle loops can be woken up without interrupt. */ @@ -193,9 +196,11 @@ void cpu_idle(void) } tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } @@ -231,10 +236,10 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) __mwait(ax, cx); } } @@ -242,10 +247,10 @@ void mwait_idle_with_hints(unsigned long /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) __sti_mwait(0, 0); else local_irq_enable(); @@ -397,7 +402,7 @@ void exit_thread(void) struct thread_struct *t = &me->thread; if (me->thread.io_bitmap_ptr) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + struct tss_struct *tss; kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; @@ -405,6 +410,7 @@ void exit_thread(void) /* * Careful, clear this in the TSS too: */ + tss = &per_cpu(init_tss, get_cpu()); memset(tss->io_bitmap, 0xff, t->io_bitmap_max); t->io_bitmap_max = 0; put_cpu(); Index: linux-2.6.25.8-rt7/arch/x86/kernel/vsyscall_64.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/vsyscall_64.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/vsyscall_64.c 2008-06-23 19:14:30.000000000 -0400 @@ -42,7 +42,7 @@ #include #include -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) notrace #define __syscall_clobber "r11","cx","memory" /* @@ -55,7 +55,7 @@ int __vgetcpu_mode __section_vgetcpu_mod struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = { - .lock = SEQLOCK_UNLOCKED, + .lock = __RAW_SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), .sysctl_enabled = 1, }; @@ -74,14 +74,40 @@ void update_vsyscall(struct timespec *wa unsigned long flags; write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + + if (likely(vsyscall_gtod_data.sysctl_enabled == 2)) { + struct timespec tmp = *(wall_time); + cycle_t (*vread)(void); + cycle_t now; + + vread = vsyscall_gtod_data.clock.vread; + if (likely(vread)) + now = vread(); + else + now = clock->read(); + + /* calculate interval: */ + now = (now - clock->cycle_last) & 
clock->mask; + /* convert to nsecs: */ + tmp.tv_nsec += ( now * clock->mult) >> clock->shift; + + while (tmp.tv_nsec >= NSEC_PER_SEC) { + tmp.tv_sec += 1; + tmp.tv_nsec -= NSEC_PER_SEC; + } + + vsyscall_gtod_data.wall_time_sec = tmp.tv_sec; + vsyscall_gtod_data.wall_time_nsec = tmp.tv_nsec; + } else { + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; + } /* copy vsyscall data */ vsyscall_gtod_data.clock.vread = clock->vread; vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; vsyscall_gtod_data.clock.mask = clock->mask; vsyscall_gtod_data.clock.mult = clock->mult; vsyscall_gtod_data.clock.shift = clock->shift; - vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; - vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } @@ -119,6 +145,26 @@ static __always_inline void do_vgettimeo unsigned seq; unsigned long mult, shift, nsec; cycle_t (*vread)(void); + + if (likely(__vsyscall_gtod_data.sysctl_enabled == 2)) { + struct timeval tmp; + + do { + barrier(); + tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; + tv->tv_usec = __vsyscall_gtod_data.wall_time_nsec; + barrier(); + tmp.tv_sec = __vsyscall_gtod_data.wall_time_sec; + tmp.tv_usec = __vsyscall_gtod_data.wall_time_nsec; + + } while (tmp.tv_usec != tv->tv_usec || + tmp.tv_sec != tv->tv_sec); + + tv->tv_usec /= NSEC_PER_MSEC; + tv->tv_usec *= USEC_PER_MSEC; + return; + } + do { seq = read_seqbegin(&__vsyscall_gtod_data.lock); @@ -127,7 +173,6 @@ static __always_inline void do_vgettimeo gettimeofday(tv,NULL); return; } - now = vread(); base = __vsyscall_gtod_data.clock.cycle_last; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; @@ -137,6 +182,7 @@ static __always_inline void do_vgettimeo nsec = __vsyscall_gtod_data.wall_time_nsec; } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + now = vread(); /* calculate interval: */ cycle_delta = (now - base) & mask; /* convert to nsecs: */ Index: linux-2.6.25.8-rt7/arch/x86/lib/thunk_32.S =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/arch/x86/lib/thunk_32.S 2008-06-23 19:13:04.000000000 -0400 @@ -0,0 +1,47 @@ +/* + * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash) + * Copyright 2008 by Steven Rostedt, Red Hat, Inc + * (inspired by Andi Kleen's thunk_64.S) + * Subject to the GNU public license, v.2. No warranty of any kind. 
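When sysctl_enabled == 2, the do_vgettimeofday() path added a few hunks above skips the seqlock and instead re-reads the seconds/nanoseconds pair until two consecutive samples agree, so an update from update_vsyscall() is never observed half-applied. A userspace sketch of that retry idiom (C11 atomics standing in for the barrier()-separated plain loads; illustrative only):

#include <stdatomic.h>

static _Atomic long wall_sec;   /* hypothetical, written by an updater thread */
static _Atomic long wall_nsec;

static void read_walltime(long *sec, long *nsec)
{
        long s2, n2;

        do {
                *sec  = atomic_load(&wall_sec);
                *nsec = atomic_load(&wall_nsec);
                /* second sample: retry if an update raced with us */
                s2 = atomic_load(&wall_sec);
                n2 = atomic_load(&wall_nsec);
        } while (s2 != *sec || n2 != *nsec);
}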
+ */ + + #include + +#define ARCH_TRACE_IRQS_ON \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_on; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#define ARCH_TRACE_IRQS_OFF \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_off; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#ifdef CONFIG_TRACE_IRQFLAGS + /* put return address in eax (arg1) */ + .macro thunk_ra name,func + .globl \name +\name: + pushl %eax + pushl %ecx + pushl %edx + /* Place EIP in the arg1 */ + movl 3*4(%esp), %eax + call \func + popl %edx + popl %ecx + popl %eax + ret + .endm + + thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller + thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller +#endif Index: linux-2.6.25.8-rt7/arch/x86/lib/thunk_64.S =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/lib/thunk_64.S 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/lib/thunk_64.S 2008-06-23 19:13:28.000000000 -0400 @@ -40,15 +40,31 @@ thunk rwsem_wake_thunk,rwsem_wake thunk rwsem_downgrade_thunk,rwsem_downgrade_wake #endif - - thunk __down_failed,__down - thunk_retrax __down_failed_interruptible,__down_interruptible - thunk_retrax __down_failed_trylock,__down_trylock - thunk __up_wakeup,__up + +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK + thunk __compat_down_failed,__compat_down + thunk_retrax __compat_down_failed_interruptible,__compat_down_interruptible + thunk_retrax __compat_down_failed_trylock,__compat_down_trylock + thunk __compat_up_wakeup,__compat_up +#endif #ifdef CONFIG_TRACE_IRQFLAGS - thunk trace_hardirqs_on_thunk,trace_hardirqs_on - thunk trace_hardirqs_off_thunk,trace_hardirqs_off + /* put return address in rdi (arg1) */ + .macro thunk_ra name,func + .globl \name +\name: + CFI_STARTPROC + SAVE_ARGS + /* SAVE_ARGS pushs 9 elements */ + /* the next element would be the rip */ + movq 9*8(%rsp), %rdi + call \func + jmp restore + CFI_ENDPROC + .endm + + thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller + thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC Index: linux-2.6.25.8-rt7/arch/x86/mm/init_32.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/mm/init_32.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/mm/init_32.c 2008-06-23 19:13:30.000000000 -0400 @@ -51,7 +51,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); @@ -723,7 +723,7 @@ void mark_rodata_ro(void) unsigned long start = PFN_ALIGN(_text); unsigned long size = PFN_ALIGN(_etext) - start; -#ifndef CONFIG_KPROBES +#if !defined(CONFIG_KPROBES) && !defined(CONFIG_DYNAMIC_FTRACE) #ifdef CONFIG_HOTPLUG_CPU /* It must still be possible to apply SMP alternatives. 
*/ if (num_possible_cpus() <= 1) Index: linux-2.6.25.8-rt7/arch/x86/mm/init_64.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/mm/init_64.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/mm/init_64.c 2008-06-23 19:13:30.000000000 -0400 @@ -52,7 +52,7 @@ EXPORT_SYMBOL(dma_ops); static unsigned long dma_reserve __initdata; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the @@ -604,7 +604,7 @@ void mark_rodata_ro(void) start = (unsigned long)_etext; #endif -#ifdef CONFIG_KPROBES +#if defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) start = (unsigned long)__start_rodata; #endif Index: linux-2.6.25.8-rt7/arch/x86/vdso/vclock_gettime.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/vdso/vclock_gettime.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/vdso/vclock_gettime.c 2008-06-23 19:13:04.000000000 -0400 @@ -23,7 +23,7 @@ #define gtod vdso_vsyscall_gtod_data -static long vdso_fallback_gettime(long clock, struct timespec *ts) +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; asm("syscall" : "=a" (ret) : @@ -31,7 +31,7 @@ static long vdso_fallback_gettime(long c return ret; } -static inline long vgetns(void) +notrace static inline long vgetns(void) { long v; cycles_t (*vread)(void); @@ -40,7 +40,7 @@ static inline long vgetns(void) return (v * gtod->clock.mult) >> gtod->clock.shift; } -static noinline int do_realtime(struct timespec *ts) +notrace static noinline int do_realtime(struct timespec *ts) { unsigned long seq, ns; do { @@ -54,7 +54,8 @@ static noinline int do_realtime(struct t } /* Copy of the version in kernel/time.c which we cannot directly access */ -static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) +notrace static void +vset_normalized_timespec(struct timespec *ts, long sec, long nsec) { while (nsec >= NSEC_PER_SEC) { nsec -= NSEC_PER_SEC; @@ -68,7 +69,7 @@ static void vset_normalized_timespec(str ts->tv_nsec = nsec; } -static noinline int do_monotonic(struct timespec *ts) +notrace static noinline int do_monotonic(struct timespec *ts) { unsigned long seq, ns, secs; do { @@ -82,7 +83,7 @@ static noinline int do_monotonic(struct return 0; } -int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { if (likely(gtod->sysctl_enabled && gtod->clock.vread)) switch (clock) { @@ -96,7 +97,7 @@ int __vdso_clock_gettime(clockid_t clock int clock_gettime(clockid_t, struct timespec *) __attribute__((weak, alias("__vdso_clock_gettime"))); -int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { long ret; if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { Index: linux-2.6.25.8-rt7/arch/x86/vdso/vgetcpu.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/vdso/vgetcpu.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/vdso/vgetcpu.c 2008-06-23 19:13:04.000000000 -0400 @@ -13,7 +13,8 @@ #include #include "vextern.h" -long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +notrace long +__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { unsigned int p; 
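Editor's note (illustrative, not part of the patch): every vsyscall/vDSO helper touched above is annotated notrace. With CONFIG_FTRACE the kernel is built with -pg, so the compiler emits a call to mcount() at each function entry; notrace, defined later in this series in include/linux/linkage.h as __attribute__((no_instrument_function)), suppresses that call. That matters here because vDSO code runs in user context and must never call back into the kernel's tracer. A minimal sketch, with a hypothetical helper name:

#define notrace __attribute__((no_instrument_function))

/* compiled without an mcount call site, so it is safe to run as vDSO code */
notrace static inline unsigned long example_cycles_to_ns(unsigned long cycles,
							 unsigned long mult,
							 unsigned long shift)
{
	/* same cycles-to-nanoseconds conversion that vgetns() performs above */
	return (cycles * mult) >> shift;
}
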
Index: linux-2.6.25.8-rt7/include/asm-x86/vsyscall.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-x86/vsyscall.h 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-x86/vsyscall.h 2008-06-23 19:13:04.000000000 -0400 @@ -24,7 +24,7 @@ enum vsyscall_num { ((unused, __section__ (".vsyscall_gtod_data"),aligned(16))) #define __section_vsyscall_clock __attribute__ \ ((unused, __section__ (".vsyscall_clock"),aligned(16))) -#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn"))) +#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn"))) notrace #define VGETCPU_RDTSCP 1 #define VGETCPU_LSL 2 Index: linux-2.6.25.8-rt7/include/linux/ftrace.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/include/linux/ftrace.h 2008-06-23 19:14:38.000000000 -0400 @@ -0,0 +1,206 @@ +#ifndef _LINUX_FTRACE_H +#define _LINUX_FTRACE_H + +#include + +#ifdef CONFIG_FTRACE + +#include +#include + +extern int ftrace_enabled; +extern int +ftrace_enable_sysctl(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos); + +typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); + +struct ftrace_ops { + ftrace_func_t func; + struct ftrace_ops *next; +}; + +/* + * The ftrace_ops must be a static and should also + * be read_mostly. These functions do modify read_mostly variables + * so use them sparely. Never free an ftrace_op or modify the + * next pointer after it has been registered. Even after unregistering + * it, the next pointer may still be used internally. + */ +int register_ftrace_function(struct ftrace_ops *ops); +int unregister_ftrace_function(struct ftrace_ops *ops); +void clear_ftrace_function(void); + +extern void ftrace_stub(unsigned long a0, unsigned long a1); +extern void mcount(void); + +void ftrace_enable(void); +void ftrace_disable(void); + +/* totally disable ftrace - can not re-enable after this */ +void ftrace_kill(void); +void __ftrace_kill(void); + +#else /* !CONFIG_FTRACE */ +# define register_ftrace_function(ops) do { } while (0) +# define unregister_ftrace_function(ops) do { } while (0) +# define clear_ftrace_function(ops) do { } while (0) +# define ftrace_enable() do { } while (0) +# define ftrace_disable() do { } while (0) +# define ftrace_kill() do { } while (0) +# define __ftrace_kill() do { } while (0) +#endif /* CONFIG_FTRACE */ + +#ifdef CONFIG_DYNAMIC_FTRACE +# define FTRACE_HASHBITS 10 +# define FTRACE_HASHSIZE (1< + +static inline void ftrace_event_irq(int irq, int user, unsigned long ip) +{ + trace_mark(ftrace_event_irq, "%d %d %ld", irq, user, ip); +} + +static inline void ftrace_event_fault(unsigned long ip, unsigned long error, + unsigned long addr) +{ + trace_mark(ftrace_event_fault, "%ld %ld %ld", ip, error, addr); +} + +static inline void ftrace_event_timer_set(void *p1, void *p2) +{ + trace_mark(ftrace_event_timer_set, "%p %p", p1, p2); +} + +static inline void ftrace_event_timer_triggered(void *p1, void *p2) +{ + trace_mark(ftrace_event_timer_triggered, "%p %p", p1, p2); +} + +static inline void ftrace_event_timestamp(ktime_t *time) +{ + trace_mark(ftrace_event_hrtimer, "%p", time); +} + +static inline void ftrace_event_task_activate(struct task_struct *p, int cpu) +{ + trace_mark(ftrace_event_task_activate, "%p %d", p, cpu); +} + +static inline void ftrace_event_task_deactivate(struct task_struct *p, int cpu) 
+{ + trace_mark(ftrace_event_task_deactivate, "%p %d", p, cpu); +} + +static inline void ftrace_event_program_event(ktime_t *expires, int64_t *delta) +{ + trace_mark(ftrace_event_timer, "%p %p", expires, delta); +} + +#else +# define ftrace_event_irq(irq, user, ip) do { } while (0) +# define ftrace_event_fault(ip, error, addr) do { } while (0) +# define ftrace_event_timer_set(p1, p2) do { } while (0) +# define ftrace_event_timer_triggered(p1, p2) do { } while (0) +# define ftrace_event_timestamp(now) do { } while (0) +# define ftrace_event_task_activate(p, cpu) do { } while (0) +# define ftrace_event_task_deactivate(p, cpu) do { } while (0) +# define ftrace_event_program_event(p, d) do { } while (0) +#endif /* CONFIG_TRACE_EVENTS */ + +#endif /* _LINUX_FTRACE_H */ Index: linux-2.6.25.8-rt7/include/linux/irqflags.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/irqflags.h 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/irqflags.h 2008-06-23 19:14:33.000000000 -0400 @@ -11,6 +11,16 @@ #ifndef _LINUX_TRACE_IRQFLAGS_H #define _LINUX_TRACE_IRQFLAGS_H +#if !defined(BUILD_CHECK_IRQ_FLAGS) && defined(typecheck) +#define BUILD_CHECK_IRQ_FLAGS(flags) \ + do { \ + BUILD_BUG_ON(sizeof(flags) != sizeof(unsigned long)); \ + typecheck(unsigned long, flags); \ + } while (0) +#else +#define BUILD_CHECK_IRQ_FLAGS(flags) +#endif + #ifdef CONFIG_TRACE_IRQFLAGS extern void trace_hardirqs_on(void); extern void trace_hardirqs_off(void); @@ -41,6 +51,15 @@ # define INIT_TRACE_IRQFLAGS #endif +#if defined(CONFIG_IRQSOFF_TRACER) || \ + defined(CONFIG_PREEMPT_TRACER) + extern void stop_critical_timings(void); + extern void start_critical_timings(void); +#else +# define stop_critical_timings() do { } while (0) +# define start_critical_timings() do { } while (0) +#endif + #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT #include @@ -50,10 +69,15 @@ #define local_irq_disable() \ do { raw_local_irq_disable(); trace_hardirqs_off(); } while (0) #define local_irq_save(flags) \ - do { raw_local_irq_save(flags); trace_hardirqs_off(); } while (0) + do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + raw_local_irq_save(flags); \ + trace_hardirqs_off(); \ + } while (0) #define local_irq_restore(flags) \ do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ if (raw_irqs_disabled_flags(flags)) { \ raw_local_irq_restore(flags); \ trace_hardirqs_off(); \ @@ -69,8 +93,16 @@ */ # define raw_local_irq_disable() local_irq_disable() # define raw_local_irq_enable() local_irq_enable() -# define raw_local_irq_save(flags) local_irq_save(flags) -# define raw_local_irq_restore(flags) local_irq_restore(flags) +# define raw_local_irq_save(flags) \ + do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + local_irq_save(flags); \ + } while (0) +# define raw_local_irq_restore(flags) \ + do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + local_irq_restore(flags); \ + } while (0) #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */ #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT @@ -80,7 +112,11 @@ raw_safe_halt(); \ } while (0) -#define local_save_flags(flags) raw_local_save_flags(flags) +#define local_save_flags(flags) \ + do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + raw_local_save_flags(flags); \ + } while (0) #define irqs_disabled() \ ({ \ @@ -90,7 +126,11 @@ raw_irqs_disabled_flags(flags); \ }) -#define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags) +#define irqs_disabled_flags(flags) \ +({ \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + raw_irqs_disabled_flags(flags); \ +}) #endif /* CONFIG_X86 */ #endif Index: 
linux-2.6.25.8-rt7/include/linux/ktime.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/ktime.h 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/ktime.h 2008-06-23 19:13:04.000000000 -0400 @@ -327,4 +327,10 @@ extern void ktime_get_ts(struct timespec /* Get the real (wall-) time in timespec format: */ #define ktime_get_real_ts(ts) getnstimeofday(ts) +static inline ktime_t ns_to_ktime(u64 ns) +{ + static const ktime_t ktime_zero = { .tv64 = 0 }; + return ktime_add_ns(ktime_zero, ns); +} + #endif Index: linux-2.6.25.8-rt7/include/linux/linkage.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/linkage.h 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/linkage.h 2008-06-23 19:13:04.000000000 -0400 @@ -3,6 +3,8 @@ #include +#define notrace __attribute__((no_instrument_function)) + #ifdef __cplusplus #define CPP_ASMLINKAGE extern "C" #else Index: linux-2.6.25.8-rt7/include/linux/mmiotrace.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/include/linux/mmiotrace.h 2008-06-23 19:13:04.000000000 -0400 @@ -0,0 +1,85 @@ +#ifndef MMIOTRACE_H +#define MMIOTRACE_H + +#include +#include + +struct kmmio_probe; +struct pt_regs; + +typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *, + struct pt_regs *, unsigned long addr); +typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, + unsigned long condition, struct pt_regs *); + +struct kmmio_probe { + struct list_head list; /* kmmio internal list */ + unsigned long addr; /* start location of the probe point */ + unsigned long len; /* length of the probe region */ + kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ + kmmio_post_handler_t post_handler; /* Called after addr is executed */ + void *private; +}; + +/* kmmio is active by some kmmio_probes? */ +static inline int is_kmmio_active(void) +{ + extern unsigned int kmmio_count; + return kmmio_count; +} + +extern int register_kmmio_probe(struct kmmio_probe *p); +extern void unregister_kmmio_probe(struct kmmio_probe *p); + +/* Called from page fault handler. 
*/ +extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); + +/* Called from ioremap.c */ +#ifdef CONFIG_MMIOTRACE +extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size, + void __iomem *addr); +extern void mmiotrace_iounmap(volatile void __iomem *addr); +#else +static inline void mmiotrace_ioremap(resource_size_t offset, + unsigned long size, void __iomem *addr) +{ +} + +static inline void mmiotrace_iounmap(volatile void __iomem *addr) +{ +} +#endif /* CONFIG_MMIOTRACE_HOOKS */ + +enum mm_io_opcode { + MMIO_READ = 0x1, /* struct mmiotrace_rw */ + MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ + MMIO_PROBE = 0x3, /* struct mmiotrace_map */ + MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ + MMIO_MARKER = 0x5, /* raw char data */ + MMIO_UNKNOWN_OP = 0x6, /* struct mmiotrace_rw */ +}; + +struct mmiotrace_rw { + resource_size_t phys; /* PCI address of register */ + unsigned long value; + unsigned long pc; /* optional program counter */ + int map_id; + unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */ + unsigned char width; /* size of register access in bytes */ +}; + +struct mmiotrace_map { + resource_size_t phys; /* base address in PCI space */ + unsigned long virt; /* base virtual address */ + unsigned long len; /* mapping size */ + int map_id; + unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */ +}; + +/* in kernel/trace/trace_mmiotrace.c */ +extern void enable_mmiotrace(void); +extern void disable_mmiotrace(void); +extern void mmio_trace_rw(struct mmiotrace_rw *rw); +extern void mmio_trace_mapping(struct mmiotrace_map *map); + +#endif /* MMIOTRACE_H */ Index: linux-2.6.25.8-rt7/include/linux/preempt.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/preempt.h 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/preempt.h 2008-06-23 19:13:29.000000000 -0400 @@ -9,8 +9,10 @@ #include #include #include +#include -#ifdef CONFIG_DEBUG_PREEMPT +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) || \ + defined(CONFIG_PREEMPT_TRACE) extern void add_preempt_count(int val); extern void sub_preempt_count(int val); #else @@ -21,11 +23,12 @@ #define inc_preempt_count() add_preempt_count(1) #define dec_preempt_count() sub_preempt_count(1) -#define preempt_count() (current_thread_info()->preempt_count) +#define preempt_count() (current_thread_info()->preempt_count) #ifdef CONFIG_PREEMPT asmlinkage void preempt_schedule(void); +asmlinkage void preempt_schedule_irq(void); #define preempt_disable() \ do { \ @@ -33,21 +36,62 @@ do { \ barrier(); \ } while (0) -#define preempt_enable_no_resched() \ +#define __preempt_enable_no_resched() \ do { \ barrier(); \ dec_preempt_count(); \ } while (0) + +#ifdef CONFIG_DEBUG_PREEMPT +extern void notrace preempt_enable_no_resched(void); +#else +# define preempt_enable_no_resched() __preempt_enable_no_resched() +#endif + #define preempt_check_resched() \ do { \ if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ preempt_schedule(); \ } while (0) +#define preempt_check_resched_delayed() \ +do { \ + if (unlikely(test_thread_flag(TIF_NEED_RESCHED_DELAYED))) \ + preempt_schedule(); \ +} while (0) + #define preempt_enable() \ do { \ - preempt_enable_no_resched(); \ + __preempt_enable_no_resched(); \ + barrier(); \ + preempt_check_resched(); \ +} while (0) + +/* For debugging and tracer internals only! 
*/ +#define add_preempt_count_notrace(val) \ + do { preempt_count() += (val); } while (0) +#define sub_preempt_count_notrace(val) \ + do { preempt_count() -= (val); } while (0) +#define inc_preempt_count_notrace() add_preempt_count_notrace(1) +#define dec_preempt_count_notrace() sub_preempt_count_notrace(1) + +#define preempt_disable_notrace() \ +do { \ + inc_preempt_count_notrace(); \ + barrier(); \ +} while (0) + +#define preempt_enable_no_resched_notrace() \ +do { \ + barrier(); \ + dec_preempt_count_notrace(); \ +} while (0) + +/* preempt_check_resched is OK to trace */ +#define preempt_enable_notrace() \ +do { \ + preempt_enable_no_resched_notrace(); \ barrier(); \ preempt_check_resched(); \ } while (0) @@ -56,8 +100,16 @@ do { \ #define preempt_disable() do { } while (0) #define preempt_enable_no_resched() do { } while (0) +#define __preempt_enable_no_resched() do { } while (0) #define preempt_enable() do { } while (0) #define preempt_check_resched() do { } while (0) +#define preempt_check_resched_delayed() do { } while (0) + +#define preempt_disable_notrace() do { } while (0) +#define preempt_enable_no_resched_notrace() do { } while (0) +#define preempt_enable_notrace() do { } while (0) + +#define preempt_schedule_irq() do { } while (0) #endif Index: linux-2.6.25.8-rt7/include/linux/writeback.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/writeback.h 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/writeback.h 2008-06-23 19:13:04.000000000 -0400 @@ -105,6 +105,8 @@ extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; +extern unsigned long determine_dirtyable_memory(void); + extern int dirty_ratio_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); Index: linux-2.6.25.8-rt7/kernel/lockdep.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/lockdep.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/lockdep.c 2008-06-23 19:14:29.000000000 -0400 @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -66,7 +67,7 @@ module_param(lock_stat, int, 0644); * to use a raw spinlock - we really dont want the spinlock * code to recurse back into the lockdep code... 
*/ -static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static __raw_spinlock_t lockdep_lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static int graph_lock(void) { @@ -81,6 +82,8 @@ static int graph_lock(void) __raw_spin_unlock(&lockdep_lock); return 0; } + /* prevent any recursions within lockdep from causing deadlocks */ + current->lockdep_recursion++; return 1; } @@ -89,6 +92,7 @@ static inline int graph_unlock(void) if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) return DEBUG_LOCKS_WARN_ON(1); + current->lockdep_recursion--; __raw_spin_unlock(&lockdep_lock); return 0; } @@ -508,7 +512,11 @@ static void print_lock(struct held_lock static void lockdep_print_held_locks(struct task_struct *curr) { - int i, depth = curr->lockdep_depth; + int i, depth; + + if (!curr) + curr = current; + depth = curr->lockdep_depth; if (!depth) { printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr)); @@ -573,7 +581,7 @@ static void print_lock_dependencies(stru static void print_kernel_version(void) { - printk("%s %.*s\n", init_utsname()->release, + printk("[ %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); } @@ -813,6 +821,21 @@ out_unlock_set: return class; } +#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_TRACE_IRQFLAGS) + +#define RECURSION_LIMIT 40 + +static int noinline print_infinite_recursion_bug(void) +{ + if (!debug_locks_off_graph_unlock()) + return 0; + + WARN_ON(1); + + return 0; +} +#endif /* CONFIG_PROVE_LOCKING || CONFIG_TRACE_IRQFLAGS */ + #ifdef CONFIG_PROVE_LOCKING /* * Allocate a lockdep entry. (assumes the graph_lock held, returns @@ -943,18 +966,6 @@ static noinline int print_circular_bug_t return 0; } -#define RECURSION_LIMIT 40 - -static int noinline print_infinite_recursion_bug(void) -{ - if (!debug_locks_off_graph_unlock()) - return 0; - - WARN_ON(1); - - return 0; -} - /* * Prove that the dependency graph starting at can not * lead to . Print an error and return 0 if it does. 
@@ -982,7 +993,7 @@ check_noncircular(struct lock_class *sou return 1; } -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency @@ -1072,6 +1083,7 @@ find_usage_backwards(struct lock_class * return 1; } +#ifdef CONFIG_PROVE_LOCKING static int print_bad_irq_dependency(struct task_struct *curr, struct held_lock *prev, @@ -1132,6 +1144,7 @@ print_bad_irq_dependency(struct task_str return 0; } +#endif /* CONFIG_PROVE_LOCKING */ static int check_usage(struct task_struct *curr, struct held_lock *prev, @@ -1680,7 +1693,7 @@ valid_state(struct task_struct *curr, st static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * print irq inversion bug: @@ -2013,11 +2026,12 @@ void early_boot_irqs_on(void) /* * Hardirqs will be enabled: */ -void trace_hardirqs_on(void) +void trace_hardirqs_on_caller(unsigned long a0) { struct task_struct *curr = current; unsigned long ip; + time_hardirqs_on(CALLER_ADDR0, a0); if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2055,16 +2069,23 @@ void trace_hardirqs_on(void) curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_on_events); } +EXPORT_SYMBOL(trace_hardirqs_on_caller); +void trace_hardirqs_on(void) +{ + trace_hardirqs_on_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off(void) +void trace_hardirqs_off_caller(unsigned long a0) { struct task_struct *curr = current; + time_hardirqs_off(CALLER_ADDR0, a0); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2082,7 +2103,12 @@ void trace_hardirqs_off(void) } else debug_atomic_inc(&redundant_hardirqs_off); } +EXPORT_SYMBOL(trace_hardirqs_off_caller); +void trace_hardirqs_off(void) +{ + trace_hardirqs_off_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_off); /* @@ -2518,6 +2544,55 @@ static int check_unlock(struct task_stru return 1; } +static int +__lock_set_subclass(struct lockdep_map *lock, + unsigned int subclass, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class *class; + unsigned int depth; + int i; + + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return 0; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + return print_unlock_inbalance_bug(curr, lock, ip); + +found_it: + class = register_lock_class(lock, subclass, 0); + hlock->class = class; + + curr->lockdep_depth = i; + curr->curr_chain_key = hlock->prev_chain_key; + + for (; i < depth; i++) { + hlock = curr->held_locks + i; + if (!__lock_acquire(hlock->instance, + hlock->class->subclass, hlock->trylock, + hlock->read, hlock->check, hlock->hardirqs_off, + hlock->acquire_ip)) + return 0; + } + + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + return 0; + return 1; +} + /* * Remove the lock to the list of currently held locks in a * potentially non-nested (out of order) manner. 
This is a @@ -2681,6 +2756,29 @@ static void check_flags(unsigned long fl #endif } +void +lock_set_subclass(struct lockdep_map *lock, + unsigned int subclass, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(!lock_stat && !prove_locking)) + return; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; + check_flags(flags); + if (__lock_set_subclass(lock, subclass, ip)) + check_chain_key(current); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} + +EXPORT_SYMBOL_GPL(lock_set_subclass); + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -2791,7 +2889,7 @@ found_it: stats = get_lock_stats(hlock->class); if (point < ARRAY_SIZE(stats->contention_point)) - stats->contention_point[i]++; + stats->contention_point[point]++; if (lock->cpu != smp_processor_id()) stats->bounces[bounce_contended + !!hlock->read]++; put_lock_stats(stats); @@ -3039,13 +3137,13 @@ void __init lockdep_info(void) { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); - printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); - printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); - printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); - printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); - printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); - printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); - printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); + printk("... MAX_LOCKDEP_SUBCLASSES: %6lu\n", MAX_LOCKDEP_SUBCLASSES); + printk("... MAX_LOCK_DEPTH: %6lu\n", MAX_LOCK_DEPTH); + printk("... MAX_LOCKDEP_KEYS: %6lu\n", MAX_LOCKDEP_KEYS); + printk("... CLASSHASH_SIZE: %6lu\n", CLASSHASH_SIZE); + printk("... MAX_LOCKDEP_ENTRIES: %6lu\n", MAX_LOCKDEP_ENTRIES); + printk("... MAX_LOCKDEP_CHAINS: %6lu\n", MAX_LOCKDEP_CHAINS); + printk("... 
CHAINHASH_SIZE: %6lu\n", CHAINHASH_SIZE); printk(" memory used by lock dependency info: %lu kB\n", (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + @@ -3216,6 +3314,8 @@ void __debug_show_held_locks(struct task printk("INFO: lockdep is turned off.\n"); return; } + if (task == current) + lockdep_print_held_locks(task); lockdep_print_held_locks(task); } EXPORT_SYMBOL_GPL(__debug_show_held_locks); Index: linux-2.6.25.8-rt7/kernel/sched_trace.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/sched_trace.h 2008-06-23 19:13:04.000000000 -0400 @@ -0,0 +1,41 @@ +#include + +static inline void trace_kernel_sched_wait(struct task_struct *p) +{ + trace_mark(kernel_sched_wait_task, "pid %d state %ld", + p->pid, p->state); +} + +static inline +void trace_kernel_sched_wakeup(struct rq *rq, struct task_struct *p) +{ + trace_mark(kernel_sched_wakeup, + "pid %d state %ld ## rq %p task %p rq->curr %p", + p->pid, p->state, rq, p, rq->curr); +} + +static inline +void trace_kernel_sched_wakeup_new(struct rq *rq, struct task_struct *p) +{ + trace_mark(kernel_sched_wakeup_new, + "pid %d state %ld ## rq %p task %p rq->curr %p", + p->pid, p->state, rq, p, rq->curr); +} + +static inline void trace_kernel_sched_switch(struct rq *rq, + struct task_struct *prev, struct task_struct *next) +{ + trace_mark(kernel_sched_schedule, + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + prev->pid, next->pid, prev->state, + rq, prev, next); +} + +static inline void +trace_kernel_sched_migrate_task(struct task_struct *p, int src, int dst) +{ + trace_mark(kernel_sched_migrate_task, + "pid %d state %ld dest_cpu %d", + p->pid, p->state, dst); +} Index: linux-2.6.25.8-rt7/kernel/sysctl.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/sysctl.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/sysctl.c 2008-06-23 19:14:26.000000000 -0400 @@ -45,6 +45,8 @@ #include #include #include +#include +#include #include #include @@ -67,6 +69,7 @@ extern int sysctl_overcommit_ratio; extern int sysctl_panic_on_oom; extern int sysctl_oom_kill_allocating_task; extern int sysctl_oom_dump_tasks; +extern int futex_performance_hack; extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; @@ -149,6 +152,10 @@ static int parse_table(int __user *, int void __user *, size_t, struct ctl_table *); #endif +#ifdef CONFIG_PREEMPT_RT +extern int rt_rwlock_limit; +#endif + #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, @@ -356,6 +363,74 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_FUTEX + { + .ctl_name = CTL_UNNUMBERED, + .procname = "futex_performance_hack", + .data = &futex_performance_hack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = "prof_pid", + .data = &prof_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_PREEMPT + { + .ctl_name = CTL_UNNUMBERED, + .procname = "kernel_preemption", + .data = &kernel_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_PREEMPT_VOLUNTARY + { + .ctl_name = CTL_UNNUMBERED, + .procname = "voluntary_preemption", + .data = &voluntary_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = 
&proc_dointvec, + }, +#endif +#if defined(CONFIG_PREEMPT_SOFTIRQS) && !defined(CONFIG_PREEMPT_RT) + { + .ctl_name = CTL_UNNUMBERED, + .procname = "softirq_preemption", + .data = &softirq_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#if defined(CONFIG_PREEMPT_HARDIRQS) && !defined(CONFIG_PREEMPT_RT) + { + .ctl_name = CTL_UNNUMBERED, + .procname = "hardirq_preemption", + .data = &hardirq_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_PREEMPT_RT + { + .ctl_name = CTL_UNNUMBERED, + .procname = "rwlock_reader_limit", + .data = &rt_rwlock_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_PANIC, .procname = "panic", @@ -364,6 +439,16 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_GENERIC_HARDIRQS + { + .ctl_name = CTL_UNNUMBERED, + .procname = "debug_direct_keyboard", + .data = &debug_direct_keyboard, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_CORE_USES_PID, .procname = "core_uses_pid", @@ -470,6 +555,16 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_FTRACE + { + .ctl_name = CTL_UNNUMBERED, + .procname = "ftrace_enabled", + .data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ftrace_enable_sysctl, + }, +#endif #ifdef CONFIG_KMOD { .ctl_name = KERN_MODPROBE, Index: linux-2.6.25.8-rt7/kernel/trace/Kconfig =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/trace/Kconfig 2008-06-23 19:13:09.000000000 -0400 @@ -0,0 +1,165 @@ +# +# Architectures that offer an FTRACE implementation should select HAVE_FTRACE: +# +config HAVE_FTRACE + bool + +config HAVE_DYNAMIC_FTRACE + bool + +config TRACER_MAX_TRACE + bool + +config TRACING + bool + select DEBUG_FS + select STACKTRACE + +config FTRACE + bool "Kernel Function Tracer" + depends on HAVE_FTRACE + select FRAME_POINTER + select TRACING + select CONTEXT_SWITCH_TRACER + help + Enable the kernel to trace every kernel function. This is done + by using a compiler feature to insert a small, 5-byte No-Operation + instruction to the beginning of every kernel function, which NOP + sequence is then dynamically patched into a tracer call when + tracing is enabled by the administrator. If it's runtime disabled + (the bootup default), then the overhead of the instructions is very + small and not measurable even in micro-benchmarks. + +config IRQSOFF_TRACER + bool "Interrupts-off Latency Tracer" + default n + depends on TRACE_IRQFLAGS_SUPPORT + depends on GENERIC_TIME + depends on HAVE_FTRACE + select TRACE_IRQFLAGS + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in irqs-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the preempt-off timing option can be + used together or separately.) 
+ +config PREEMPT_TRACER + bool "Preemption-off Latency Tracer" + default n + depends on GENERIC_TIME + depends on PREEMPT + depends on HAVE_FTRACE + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in preemption off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the irqs-off timing option can be + used together or separately.) + +config SCHED_TRACER + bool "Scheduling Latency Tracer" + depends on HAVE_FTRACE + select TRACING + select CONTEXT_SWITCH_TRACER + select TRACER_MAX_TRACE + help + This tracer tracks the latency of the highest priority task + to be scheduled in, starting from the point it has woken up. + +config EVENT_TRACER + bool "trace kernel events" + select CONTEXT_SWITCH_TRACER + help + This option activates the event tracer of the latency_tracer. + It activates markers through out the kernel for tracing. + This option has a fairly low overhead when enabled. + +config CONTEXT_SWITCH_TRACER + bool "Trace process context switches" + depends on HAVE_FTRACE + select TRACING + select MARKERS + help + This tracer gets called from the context switch and records + all switching of tasks. + +config DYNAMIC_FTRACE + bool "enable/disable ftrace tracepoints dynamically" + depends on FTRACE + depends on HAVE_DYNAMIC_FTRACE + default y + help + This option will modify all the calls to ftrace dynamically + (will patch them out of the binary image and replaces them + with a No-Op instruction) as they are called. A table is + created to dynamically enable them again. + + This way a CONFIG_FTRACE kernel is slightly larger, but otherwise + has native performance as long as no tracing is active. + + The changes to the code are done by a kernel thread that + wakes up once a second and checks to see if any ftrace calls + were made. If so, it runs stop_machine (stops all CPUS) + and modifies the code to jump over the call to ftrace. + +config FTRACE_SELFTEST + bool + +config FTRACE_STARTUP_TEST + bool "Perform a startup test on ftrace" + depends on TRACING + select FTRACE_SELFTEST + help + This option performs a series of startup tests on ftrace. On bootup + a series of tests are made to verify that the tracer is + functioning properly. It will do tests on all the configured + tracers of ftrace. + +config INTERRUPT_OFF_HIST + bool "Interrupts off critical timings histogram" + depends on IRQSOFF_TRACER + help + This option uses the infrastructure of the critical + irqs off timings to create a histogram of latencies. + +config PREEMPT_OFF_HIST + bool "Preempt off critical timings histogram" + depends on PREEMPT_TRACER + help + This option uses the infrastructure of the critical + preemption off timings to create a histogram of latencies. + +config WAKEUP_LATENCY_HIST + bool "Interrupts off critical timings histogram" + select TRACING + select MARKERS + help + This option uses the infrastructure of the wakeup tracer + to create a histogram of latencies. + +config PREEMPT_TRACE + bool "Keep a record of preempt disabled spots" + depends on DEBUG_KERNEL + depends on PREEMPT + select TRACING + help + Keeps a record of the last 25 preempt disabled locations. 
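Editor's note (illustrative, not part of the patch): the irqsoff/preemptoff tracers configured above time every interrupts-off or preemption-off region, which is why the idle loops in process_64.c earlier in this series are bracketed with stop_critical_timings()/start_critical_timings(), declared in include/linux/irqflags.h above. A minimal sketch of the same pattern, assuming a hypothetical polling helper; note that flags must be an unsigned long, or the new BUILD_CHECK_IRQ_FLAGS check breaks the build:

#include <linux/irqflags.h>
#include <asm/processor.h>	/* cpu_relax() */

static void example_uninstrumented_wait(volatile int *done)
{
	unsigned long flags;

	local_irq_save(flags);
	stop_critical_timings();	/* exclude this known-long wait from the irqsoff trace */
	while (!*done)
		cpu_relax();
	start_critical_timings();	/* resume latency measurement */
	local_irq_restore(flags);
}
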
Index: linux-2.6.25.8-rt7/kernel/trace/Makefile =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/trace/Makefile 2008-06-23 19:13:09.000000000 -0400 @@ -0,0 +1,30 @@ + +# Do not instrument the tracer itself: + +ifdef CONFIG_FTRACE +ORIG_CFLAGS := $(KBUILD_CFLAGS) +KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) + +# selftest needs instrumentation +CFLAGS_trace_selftest_dynamic.o = -pg +obj-y += trace_selftest_dynamic.o +endif + +obj-$(CONFIG_FTRACE) += libftrace.o + +obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o +obj-$(CONFIG_FTRACE) += trace_functions.o +obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o +obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o +obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o +obj-$(CONFIG_EVENT_TRACER) += trace_events.o + +obj-$(CONFIG_INTERRUPT_OFF_HIST) += trace_hist.o +obj-$(CONFIG_PREEMPT_OFF_HIST) += trace_hist.o +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += trace_hist.o + +obj-$(CONFIG_PREEMPT_TRACE) += preempt-trace.o + +libftrace-y := ftrace.o Index: linux-2.6.25.8-rt7/kernel/trace/ftrace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/trace/ftrace.c 2008-06-23 19:14:38.000000000 -0400 @@ -0,0 +1,1594 @@ +/* + * Infrastructure for profiling code inserted by 'gcc -pg'. + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2004-2008 Ingo Molnar + * + * Originally ported from the -rt patch by: + * Copyright (C) 2007 Arnaldo Carvalho de Melo + * + * Based on code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +/* ftrace_enabled is a method to turn ftrace on or off */ +int ftrace_enabled __read_mostly; +static int last_ftrace_enabled; + +/* + * ftrace_disabled is set when an anomaly is discovered. + * ftrace_disabled is much stronger than ftrace_enabled. + */ +static int ftrace_disabled __read_mostly; + +static DEFINE_RAW_SPINLOCK(ftrace_lock); +static DEFINE_MUTEX(ftrace_sysctl_lock); + +static struct ftrace_ops ftrace_list_end __read_mostly = +{ + .func = ftrace_stub, +}; + +static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; +ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; + +void ftrace_list_func(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_ops *op = ftrace_list; + + /* in case someone actually ports this to alpha! */ + read_barrier_depends(); + + while (op != &ftrace_list_end) { + /* silly alpha */ + read_barrier_depends(); + op->func(ip, parent_ip); + op = op->next; + }; +} + +/** + * clear_ftrace_function - reset the ftrace function + * + * This NULLs the ftrace function and in essence stops + * tracing. There may be lag + */ +void clear_ftrace_function(void) +{ + ftrace_trace_function = ftrace_stub; +} + +static int __register_ftrace_function(struct ftrace_ops *ops) +{ + /* Should never be called by interrupts */ + spin_lock(&ftrace_lock); + + ops->next = ftrace_list; + /* + * We are entering ops into the ftrace_list but another + * CPU might be walking that list. 
We need to make sure + * the ops->next pointer is valid before another CPU sees + * the ops pointer included into the ftrace_list. + */ + smp_wmb(); + ftrace_list = ops; + + if (ftrace_enabled) { + /* + * For one func, simply call it directly. + * For more than one func, call the chain. + */ + if (ops->next == &ftrace_list_end) + ftrace_trace_function = ops->func; + else + ftrace_trace_function = ftrace_list_func; + } + + spin_unlock(&ftrace_lock); + + return 0; +} + +static int __unregister_ftrace_function(struct ftrace_ops *ops) +{ + struct ftrace_ops **p; + int ret = 0; + + spin_lock(&ftrace_lock); + + /* + * If we are removing the last function, then simply point + * to the ftrace_stub. + */ + if (ftrace_list == ops && ops->next == &ftrace_list_end) { + ftrace_trace_function = ftrace_stub; + ftrace_list = &ftrace_list_end; + goto out; + } + + for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) + if (*p == ops) + break; + + if (*p != ops) { + ret = -1; + goto out; + } + + *p = (*p)->next; + + if (ftrace_enabled) { + /* If we only have one func left, then call that directly */ + if (ftrace_list == &ftrace_list_end || + ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + } + + out: + spin_unlock(&ftrace_lock); + + return ret; +} + +static int ftrace_disabled_count; +static int save_ftrace_enabled; + +void ftrace_disable(void) +{ + mutex_lock(&ftrace_sysctl_lock); + + save_ftrace_enabled = ftrace_enabled; + ftrace_enabled = 0; +} + +void ftrace_enable(void) +{ + /* ftrace_enable must be paired with ftrace_disable */ + if (!mutex_is_locked(&ftrace_sysctl_lock)) { + WARN_ON(1); + return; + } + + ftrace_enabled = save_ftrace_enabled; + + mutex_unlock(&ftrace_sysctl_lock); +} + +#ifdef CONFIG_DYNAMIC_FTRACE + +static struct task_struct *ftraced_task; + +enum { + FTRACE_ENABLE_CALLS = (1 << 0), + FTRACE_DISABLE_CALLS = (1 << 1), + FTRACE_UPDATE_TRACE_FUNC = (1 << 2), + FTRACE_ENABLE_MCOUNT = (1 << 3), + FTRACE_DISABLE_MCOUNT = (1 << 4), +}; + +static int ftrace_filtered; + +static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; + +static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); + +static DEFINE_RAW_SPINLOCK(ftrace_shutdown_lock); +static DEFINE_MUTEX(ftraced_lock); +static DEFINE_MUTEX(ftrace_regex_lock); + +struct ftrace_page { + struct ftrace_page *next; + unsigned long index; + struct dyn_ftrace records[]; +}; + +#define ENTRIES_PER_PAGE \ + ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) + +/* estimate from running different kernels */ +#define NR_TO_INIT 10000 + +static struct ftrace_page *ftrace_pages_start; +static struct ftrace_page *ftrace_pages; + +static int ftraced_trigger; +static int ftraced_suspend; +static int ftraced_stop; + +static int ftrace_record_suspend; + +static struct dyn_ftrace *ftrace_free_records; + +static inline int +ftrace_ip_in_hash(unsigned long ip, unsigned long key) +{ + struct dyn_ftrace *p; + struct hlist_node *t; + int found = 0; + + hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) { + if (p->ip == ip) { + found = 1; + break; + } + } + + return found; +} + +static inline void +ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) +{ + hlist_add_head_rcu(&node->node, &ftrace_hash[key]); +} + +static void ftrace_free_rec(struct dyn_ftrace *rec) +{ + /* no locking, only called from kstop_machine */ + + rec->ip = (unsigned long)ftrace_free_records; + ftrace_free_records = rec; + rec->flags |= FTRACE_FL_FREE; +} + +static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned 
long ip) +{ + struct dyn_ftrace *rec; + + /* First check for freed records */ + if (ftrace_free_records) { + rec = ftrace_free_records; + + if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { + WARN_ON_ONCE(1); + ftrace_free_records = NULL; + ftrace_disabled = 1; + ftrace_enabled = 0; + return NULL; + } + + ftrace_free_records = (void *)rec->ip; + memset(rec, 0, sizeof(*rec)); + return rec; + } + + if (ftrace_pages->index == ENTRIES_PER_PAGE) { + if (!ftrace_pages->next) + return NULL; + ftrace_pages = ftrace_pages->next; + } + + return &ftrace_pages->records[ftrace_pages->index++]; +} + +static void +ftrace_record_ip(unsigned long ip) +{ + struct dyn_ftrace *node; + unsigned long flags; + unsigned long key; + int resched; + int atomic; + int cpu; + + if (!ftrace_enabled || ftrace_disabled) + return; + + resched = need_resched(); + preempt_disable_notrace(); + + /* + * We simply need to protect against recursion. + * Use the the raw version of smp_processor_id and not + * __get_cpu_var which can call debug hooks that can + * cause a recursive crash here. + */ + cpu = raw_smp_processor_id(); + per_cpu(ftrace_shutdown_disable_cpu, cpu)++; + if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1) + goto out; + + if (unlikely(ftrace_record_suspend)) + goto out; + + key = hash_long(ip, FTRACE_HASHBITS); + + WARN_ON_ONCE(key >= FTRACE_HASHSIZE); + + if (ftrace_ip_in_hash(ip, key)) + goto out; + + atomic = irqs_disabled(); + + spin_lock_irqsave(&ftrace_shutdown_lock, flags); + + /* This ip may have hit the hash before the lock */ + if (ftrace_ip_in_hash(ip, key)) + goto out_unlock; + + /* + * There's a slight race that the ftraced will update the + * hash and reset here. If it is already converted, skip it. + */ + if (ftrace_ip_converted(ip)) + goto out_unlock; + + node = ftrace_alloc_dyn_node(ip); + if (!node) + goto out_unlock; + + node->ip = ip; + + ftrace_add_hash(node, key); + + ftraced_trigger = 1; + + out_unlock: + spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); + out: + per_cpu(ftrace_shutdown_disable_cpu, cpu)--; + + /* prevent recursion with scheduler */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +#define FTRACE_ADDR ((long)(ftrace_caller)) +#define MCOUNT_ADDR ((long)(mcount)) + +static void +__ftrace_replace_code(struct dyn_ftrace *rec, + unsigned char *old, unsigned char *new, int enable) +{ + unsigned long ip, fl; + int failed; + + ip = rec->ip; + + if (ftrace_filtered && enable) { + /* + * If filtering is on: + * + * If this record is set to be filtered and + * is enabled then do nothing. + * + * If this record is set to be filtered and + * it is not enabled, enable it. + * + * If this record is not set to be filtered + * and it is not enabled do nothing. + * + * If this record is set not to trace then + * do nothing. + * + * If this record is not set to be filtered and + * it is enabled, disable it. + */ + fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); + + if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || + (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE)) + return; + + /* + * If it is enabled disable it, + * otherwise enable it! + */ + if (fl == FTRACE_FL_ENABLED) { + /* swap new and old */ + new = old; + old = ftrace_call_replace(ip, FTRACE_ADDR); + rec->flags &= ~FTRACE_FL_ENABLED; + } else { + new = ftrace_call_replace(ip, FTRACE_ADDR); + rec->flags |= FTRACE_FL_ENABLED; + } + } else { + + if (enable) { + /* + * If this record is set not to trace and is + * not enabled, do nothing. 
+ */ + fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); + if (fl == FTRACE_FL_NOTRACE) + return; + + new = ftrace_call_replace(ip, FTRACE_ADDR); + } else + old = ftrace_call_replace(ip, FTRACE_ADDR); + + if (enable) { + if (rec->flags & FTRACE_FL_ENABLED) + return; + rec->flags |= FTRACE_FL_ENABLED; + } else { + if (!(rec->flags & FTRACE_FL_ENABLED)) + return; + rec->flags &= ~FTRACE_FL_ENABLED; + } + } + + failed = ftrace_modify_code(ip, old, new); + if (failed) { + unsigned long key; + /* It is possible that the function hasn't been converted yet */ + key = hash_long(ip, FTRACE_HASHBITS); + if (!ftrace_ip_in_hash(ip, key)) { + rec->flags |= FTRACE_FL_FAILED; + ftrace_free_rec(rec); + } + + } +} + +static void ftrace_replace_code(int enable) +{ + unsigned char *new = NULL, *old = NULL; + struct dyn_ftrace *rec; + struct ftrace_page *pg; + int i; + + if (enable) + old = ftrace_nop_replace(); + else + new = ftrace_nop_replace(); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + /* don't modify code that has already faulted */ + if (rec->flags & FTRACE_FL_FAILED) + continue; + + __ftrace_replace_code(rec, old, new, enable); + } + } +} + +static void ftrace_shutdown_replenish(void) +{ + if (ftrace_pages->next) + return; + + /* allocate another page */ + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); +} + +static int +ftrace_code_disable(struct dyn_ftrace *rec) +{ + unsigned long ip; + unsigned char *nop, *call; + int failed; + + ip = rec->ip; + + nop = ftrace_nop_replace(); + call = ftrace_call_replace(ip, MCOUNT_ADDR); + + failed = ftrace_modify_code(ip, call, nop); + if (failed) { + rec->flags |= FTRACE_FL_FAILED; + ftrace_free_rec(rec); + return 0; + } + return 1; +} + +static int __ftrace_update_code(void *ignore); + +static int __ftrace_modify_code(void *data) +{ + unsigned long addr; + int *command = data; + + if (*command & FTRACE_ENABLE_CALLS) { + /* + * Update any recorded ips now that we have the + * machine stopped + */ + __ftrace_update_code(NULL); + ftrace_replace_code(1); + } else if (*command & FTRACE_DISABLE_CALLS) + ftrace_replace_code(0); + + if (*command & FTRACE_UPDATE_TRACE_FUNC) + ftrace_update_ftrace_func(ftrace_trace_function); + + if (*command & FTRACE_ENABLE_MCOUNT) { + addr = (unsigned long)ftrace_record_ip; + ftrace_mcount_set(&addr); + } else if (*command & FTRACE_DISABLE_MCOUNT) { + addr = (unsigned long)ftrace_stub; + ftrace_mcount_set(&addr); + } + + return 0; +} + +static void ftrace_run_update_code(int command) +{ + stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); +} + +void ftrace_disable_daemon(void) +{ + /* Stop the daemon from calling kstop_machine */ + mutex_lock(&ftraced_lock); + ftraced_stop = 1; + mutex_unlock(&ftraced_lock); + + ftrace_force_update(); +} + +void ftrace_enable_daemon(void) +{ + mutex_lock(&ftraced_lock); + ftraced_stop = 0; + mutex_unlock(&ftraced_lock); + + ftrace_force_update(); +} + +static ftrace_func_t saved_ftrace_func; + +static void ftrace_startup(void) +{ + int command = 0; + + if (unlikely(ftrace_disabled)) + return; + + mutex_lock(&ftraced_lock); + ftraced_suspend++; + if (ftraced_suspend == 1) + command |= FTRACE_ENABLE_CALLS; + + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) + goto out; + + ftrace_run_update_code(command); + out: + mutex_unlock(&ftraced_lock); +} + +static void 
ftrace_shutdown(void) +{ + int command = 0; + + if (unlikely(ftrace_disabled)) + return; + + mutex_lock(&ftraced_lock); + ftraced_suspend--; + if (!ftraced_suspend) + command |= FTRACE_DISABLE_CALLS; + + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) + goto out; + + ftrace_run_update_code(command); + out: + mutex_unlock(&ftraced_lock); +} + +static void ftrace_startup_sysctl(void) +{ + int command = FTRACE_ENABLE_MCOUNT; + + if (unlikely(ftrace_disabled)) + return; + + mutex_lock(&ftraced_lock); + /* Force update next time */ + saved_ftrace_func = NULL; + /* ftraced_suspend is true if we want ftrace running */ + if (ftraced_suspend) + command |= FTRACE_ENABLE_CALLS; + + ftrace_run_update_code(command); + mutex_unlock(&ftraced_lock); +} + +static void ftrace_shutdown_sysctl(void) +{ + int command = FTRACE_DISABLE_MCOUNT; + + if (unlikely(ftrace_disabled)) + return; + + mutex_lock(&ftraced_lock); + /* ftraced_suspend is true if ftrace is running */ + if (ftraced_suspend) + command |= FTRACE_DISABLE_CALLS; + + ftrace_run_update_code(command); + mutex_unlock(&ftraced_lock); +} + +static cycle_t ftrace_update_time; +static unsigned long ftrace_update_cnt; +unsigned long ftrace_update_tot_cnt; + +static int __ftrace_update_code(void *ignore) +{ + struct dyn_ftrace *p; + struct hlist_head head; + struct hlist_node *t; + int save_ftrace_enabled; + cycle_t start, stop; + int i; + + /* Don't be recording funcs now */ + ftrace_record_suspend++; + save_ftrace_enabled = ftrace_enabled; + ftrace_enabled = 0; + + start = ftrace_now(raw_smp_processor_id()); + ftrace_update_cnt = 0; + + /* No locks needed, the machine is stopped! */ + for (i = 0; i < FTRACE_HASHSIZE; i++) { + if (hlist_empty(&ftrace_hash[i])) + continue; + + head = ftrace_hash[i]; + INIT_HLIST_HEAD(&ftrace_hash[i]); + + /* all CPUS are stopped, we are safe to modify code */ + hlist_for_each_entry(p, t, &head, node) { + if (ftrace_code_disable(p)) + ftrace_update_cnt++; + } + + } + + stop = ftrace_now(raw_smp_processor_id()); + ftrace_update_time = stop - start; + ftrace_update_tot_cnt += ftrace_update_cnt; + ftraced_trigger = 0; + + ftrace_enabled = save_ftrace_enabled; + ftrace_record_suspend--; + + return 0; +} + +static int ftrace_update_code(void) +{ + if (unlikely(ftrace_disabled) || + !ftrace_enabled || !ftraced_trigger) + return 0; + + stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); + + return 1; +} + +static int ftraced(void *ignore) +{ + unsigned long usecs; + + while (!kthread_should_stop()) { + + set_current_state(TASK_INTERRUPTIBLE); + + /* check once a second */ + schedule_timeout(HZ); + + if (unlikely(ftrace_disabled)) + continue; + + mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftraced_lock); + if (!ftraced_suspend && !ftraced_stop && + ftrace_update_code()) { + usecs = nsecs_to_usecs(ftrace_update_time); + if (ftrace_update_tot_cnt > 100000) { + ftrace_update_tot_cnt = 0; + pr_info("hm, dftrace overflow: %lu change%s" + " (%lu total) in %lu usec%s\n", + ftrace_update_cnt, + ftrace_update_cnt != 1 ? "s" : "", + ftrace_update_tot_cnt, + usecs, usecs != 1 ? 
"s" : ""); + ftrace_disabled = 1; + WARN_ON_ONCE(1); + } + } + mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); + + ftrace_shutdown_replenish(); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int __init ftrace_dyn_table_alloc(void) +{ + struct ftrace_page *pg; + int cnt; + int i; + + /* allocate a few pages */ + ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages_start) + return -1; + + /* + * Allocate a few more pages. + * + * TODO: have some parser search vmlinux before + * final linking to find all calls to ftrace. + * Then we can: + * a) know how many pages to allocate. + * and/or + * b) set up the table then. + * + * The dynamic code is still necessary for + * modules. + */ + + pg = ftrace_pages = ftrace_pages_start; + + cnt = NR_TO_INIT / ENTRIES_PER_PAGE; + + for (i = 0; i < cnt; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + + /* If we fail, we'll try later anyway */ + if (!pg->next) + break; + + pg = pg->next; + } + + return 0; +} + +enum { + FTRACE_ITER_FILTER = (1 << 0), + FTRACE_ITER_CONT = (1 << 1), + FTRACE_ITER_NOTRACE = (1 << 2), +}; + +#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ + +struct ftrace_iterator { + loff_t pos; + struct ftrace_page *pg; + unsigned idx; + unsigned flags; + unsigned char buffer[FTRACE_BUFF_MAX+1]; + unsigned buffer_idx; + unsigned filtered; +}; + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct dyn_ftrace *rec = NULL; + + (*pos)++; + + retry: + if (iter->idx >= iter->pg->index) { + if (iter->pg->next) { + iter->pg = iter->pg->next; + iter->idx = 0; + goto retry; + } + } else { + rec = &iter->pg->records[iter->idx++]; + if ((rec->flags & FTRACE_FL_FAILED) || + ((iter->flags & FTRACE_ITER_FILTER) && + !(rec->flags & FTRACE_FL_FILTER)) || + ((iter->flags & FTRACE_ITER_NOTRACE) && + !(rec->flags & FTRACE_FL_NOTRACE))) { + rec = NULL; + goto retry; + } + } + + iter->pos = *pos; + + return rec; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + loff_t l = -1; + + if (*pos != iter->pos) { + for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l)) + ; + } else { + l = *pos; + p = t_next(m, p, &l); + } + + return p; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static int t_show(struct seq_file *m, void *v) +{ + struct dyn_ftrace *rec = v; + char str[KSYM_SYMBOL_LEN]; + + if (!rec) + return 0; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + + seq_printf(m, "%s\n", str); + + return 0; +} + +static struct seq_operations show_ftrace_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int +ftrace_avail_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->pos = -1; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + + m->private = iter; + } else { + kfree(iter); + } + + return ret; +} + +int ftrace_avail_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_iterator *iter = m->private; + + seq_release(inode, file); + kfree(iter); + + return 0; +} + +static void ftrace_filter_reset(int enable) +{ + struct 
ftrace_page *pg; + struct dyn_ftrace *rec; + unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; + unsigned i; + + /* keep kstop machine from running */ + preempt_disable(); + if (enable) + ftrace_filtered = 0; + pg = ftrace_pages_start; + while (pg) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + if (rec->flags & FTRACE_FL_FAILED) + continue; + rec->flags &= ~type; + } + pg = pg->next; + } + preempt_enable(); +} + +static int +ftrace_regex_open(struct inode *inode, struct file *file, int enable) +{ + struct ftrace_iterator *iter; + int ret = 0; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + mutex_lock(&ftrace_regex_lock); + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) + ftrace_filter_reset(enable); + + if (file->f_mode & FMODE_READ) { + iter->pg = ftrace_pages_start; + iter->pos = -1; + iter->flags = enable ? FTRACE_ITER_FILTER : + FTRACE_ITER_NOTRACE; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = iter; + } else + kfree(iter); + } else + file->private_data = iter; + mutex_unlock(&ftrace_regex_lock); + + return ret; +} + +static int +ftrace_filter_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(inode, file, 1); +} + +static int +ftrace_notrace_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(inode, file, 0); +} + +static ssize_t +ftrace_regex_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + if (file->f_mode & FMODE_READ) + return seq_read(file, ubuf, cnt, ppos); + else + return -EPERM; +} + +static loff_t +ftrace_regex_lseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + + if (file->f_mode & FMODE_READ) + ret = seq_lseek(file, offset, origin); + else + file->f_pos = ret = 1; + + return ret; +} + +enum { + MATCH_FULL, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, +}; + +static void +ftrace_match(unsigned char *buff, int len, int enable) +{ + char str[KSYM_SYMBOL_LEN]; + char *search = NULL; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int type = MATCH_FULL; + unsigned long flag = enable ? 
FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; + unsigned i, match = 0, search_len = 0; + + for (i = 0; i < len; i++) { + if (buff[i] == '*') { + if (!i) { + search = buff + i + 1; + type = MATCH_END_ONLY; + search_len = len - (i + 1); + } else { + if (type == MATCH_END_ONLY) { + type = MATCH_MIDDLE_ONLY; + } else { + match = i; + type = MATCH_FRONT_ONLY; + } + buff[i] = 0; + break; + } + } + } + + /* keep kstop machine from running */ + preempt_disable(); + if (enable) + ftrace_filtered = 1; + pg = ftrace_pages_start; + while (pg) { + for (i = 0; i < pg->index; i++) { + int matched = 0; + char *ptr; + + rec = &pg->records[i]; + if (rec->flags & FTRACE_FL_FAILED) + continue; + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + switch (type) { + case MATCH_FULL: + if (strcmp(str, buff) == 0) + matched = 1; + break; + case MATCH_FRONT_ONLY: + if (memcmp(str, buff, match) == 0) + matched = 1; + break; + case MATCH_MIDDLE_ONLY: + if (strstr(str, search)) + matched = 1; + break; + case MATCH_END_ONLY: + ptr = strstr(str, search); + if (ptr && (ptr[search_len] == 0)) + matched = 1; + break; + } + if (matched) + rec->flags |= flag; + } + pg = pg->next; + } + preempt_enable(); +} + +static ssize_t +ftrace_regex_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos, int enable) +{ + struct ftrace_iterator *iter; + char ch; + size_t read = 0; + ssize_t ret; + + if (!cnt || cnt < 0) + return 0; + + mutex_lock(&ftrace_regex_lock); + + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + iter = m->private; + } else + iter = file->private_data; + + if (!*ppos) { + iter->flags &= ~FTRACE_ITER_CONT; + iter->buffer_idx = 0; + } + + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + + if (!(iter->flags & ~FTRACE_ITER_CONT)) { + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + if (isspace(ch)) { + file->f_pos += read; + ret = read; + goto out; + } + + iter->buffer_idx = 0; + } + + while (cnt && !isspace(ch)) { + if (iter->buffer_idx < FTRACE_BUFF_MAX) + iter->buffer[iter->buffer_idx++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + if (isspace(ch)) { + iter->filtered++; + iter->buffer[iter->buffer_idx] = 0; + ftrace_match(iter->buffer, iter->buffer_idx, enable); + iter->buffer_idx = 0; + } else + iter->flags |= FTRACE_ITER_CONT; + + + file->f_pos += read; + + ret = read; + out: + mutex_unlock(&ftrace_regex_lock); + + return ret; +} + +static ssize_t +ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 1); +} + +static ssize_t +ftrace_notrace_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 0); +} + +static void +ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) +{ + if (unlikely(ftrace_disabled)) + return; + + mutex_lock(&ftrace_regex_lock); + if (reset) + ftrace_filter_reset(enable); + if (buf) + ftrace_match(buf, len, enable); + mutex_unlock(&ftrace_regex_lock); +} + +/** + * ftrace_set_filter - set a function to filter on in ftrace + * @buf - the string that holds the function filter text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. 
+ * If @buf is NULL and reset is set, all functions will be enabled for tracing. + */ +void ftrace_set_filter(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(buf, len, reset, 1); +} + +/** + * ftrace_set_notrace - set a function to not trace in ftrace + * @buf - the string that holds the function notrace text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Notrace Filters denote which functions should not be enabled when tracing + * is enabled. If @buf is NULL and reset is set, all functions will be enabled + * for tracing. + */ +void ftrace_set_notrace(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(buf, len, reset, 0); +} + +static int +ftrace_regex_release(struct inode *inode, struct file *file, int enable) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_iterator *iter; + + mutex_lock(&ftrace_regex_lock); + if (file->f_mode & FMODE_READ) { + iter = m->private; + + seq_release(inode, file); + } else + iter = file->private_data; + + if (iter->buffer_idx) { + iter->filtered++; + iter->buffer[iter->buffer_idx] = 0; + ftrace_match(iter->buffer, iter->buffer_idx, enable); + } + + mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftraced_lock); + if (iter->filtered && ftraced_suspend && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); + + kfree(iter); + mutex_unlock(&ftrace_regex_lock); + return 0; +} + +static int +ftrace_filter_release(struct inode *inode, struct file *file) +{ + return ftrace_regex_release(inode, file, 1); +} + +static int +ftrace_notrace_release(struct inode *inode, struct file *file) +{ + return ftrace_regex_release(inode, file, 0); +} + +static ssize_t +ftraced_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + /* don't worry about races */ + char *buf = ftraced_stop ? 
"disabled\n" : "enabled\n"; + int r = strlen(buf); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +ftraced_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + if (strncmp(buf, "enable", 6) == 0) + val = 1; + else if (strncmp(buf, "disable", 7) == 0) + val = 0; + else { + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + } + + if (val) + ftrace_enable_daemon(); + else + ftrace_disable_daemon(); + + filp->f_pos += cnt; + + return cnt; +} + +static struct file_operations ftrace_avail_fops = { + .open = ftrace_avail_open, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_avail_release, +}; + +static struct file_operations ftrace_filter_fops = { + .open = ftrace_filter_open, + .read = ftrace_regex_read, + .write = ftrace_filter_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_filter_release, +}; + +static struct file_operations ftrace_notrace_fops = { + .open = ftrace_notrace_open, + .read = ftrace_regex_read, + .write = ftrace_notrace_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_notrace_release, +}; + +static struct file_operations ftraced_fops = { + .open = tracing_open_generic, + .read = ftraced_read, + .write = ftraced_write, +}; + +/** + * ftrace_force_update - force an update to all recording ftrace functions + */ +int ftrace_force_update(void) +{ + int ret = 0; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftraced_lock); + + /* + * If ftraced_trigger is not set, then there is nothing + * to update. 
+ */ + if (ftraced_trigger && !ftrace_update_code()) + ret = -EBUSY; + + mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); + + return ret; +} + +static void ftrace_force_shutdown(void) +{ + struct task_struct *task; + int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC; + + mutex_lock(&ftraced_lock); + task = ftraced_task; + ftraced_task = NULL; + ftraced_suspend = -1; + ftrace_run_update_code(command); + mutex_unlock(&ftraced_lock); + + if (task) + kthread_stop(task); +} + +static __init int ftrace_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("available_filter_functions", 0444, + d_tracer, NULL, &ftrace_avail_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'available_filter_functions' entry\n"); + + entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, + NULL, &ftrace_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_filter' entry\n"); + + entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, + NULL, &ftrace_notrace_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_notrace' entry\n"); + + entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer, + NULL, &ftraced_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'ftraced_enabled' entry\n"); + return 0; +} + +fs_initcall(ftrace_init_debugfs); + +static int __init ftrace_dynamic_init(void) +{ + struct task_struct *p; + unsigned long addr; + int ret; + + addr = (unsigned long)ftrace_record_ip; + + stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); + + /* ftrace_dyn_arch_init places the return code in addr */ + if (addr) { + ret = (int)addr; + goto failed; + } + + ret = ftrace_dyn_table_alloc(); + if (ret) + goto failed; + + p = kthread_run(ftraced, NULL, "ftraced"); + if (IS_ERR(p)) { + ret = -1; + goto failed; + } + + last_ftrace_enabled = ftrace_enabled = 1; + ftraced_task = p; + + return 0; + + failed: + ftrace_disabled = 1; + return ret; +} + +core_initcall(ftrace_dynamic_init); +#else +# define ftrace_startup() do { } while (0) +# define ftrace_shutdown() do { } while (0) +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) +# define ftrace_force_shutdown() do { } while (0) +#endif /* CONFIG_DYNAMIC_FTRACE */ + +/** + * ftrace_kill - totally shutdown ftrace + * + * This is a safety measure. If something was detected that seems + * wrong, calling this function will keep ftrace from doing + * any more modifications or updates. It is used when something + * has gone wrong. + */ +void ftrace_kill(void) +{ + mutex_lock(&ftrace_sysctl_lock); + ftrace_disabled = 1; + ftrace_enabled = 0; + + clear_ftrace_function(); + mutex_unlock(&ftrace_sysctl_lock); + + /* Try to totally disable ftrace */ + ftrace_force_shutdown(); +} + +/** + * __ftrace_kill - shutdown ftrace in a mean fashion + * + * In case of system failure we want to stop ftrace as soon as + * possible. This is like ftrace_kill but does not grab the + * mutexes nor does it call the kstop machine. + * + * This one is safe to use in atomic context. + */ +void __ftrace_kill(void) +{ + ftrace_disabled = 1; + ftrace_enabled = 0; + + clear_ftrace_function(); +} + +/** + * register_ftrace_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. + * + * Register a function to be called by all functions in the + * kernel. 
+ * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. + */ +int register_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + if (unlikely(ftrace_disabled)) + return -1; + + mutex_lock(&ftrace_sysctl_lock); + ret = __register_ftrace_function(ops); + ftrace_startup(); + mutex_unlock(&ftrace_sysctl_lock); + + return ret; +} + +/** + * unregister_ftrace_function - unregister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. + */ +int unregister_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + mutex_lock(&ftrace_sysctl_lock); + ret = __unregister_ftrace_function(ops); + ftrace_shutdown(); + mutex_unlock(&ftrace_sysctl_lock); + + return ret; +} + +int +ftrace_enable_sysctl(struct ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + mutex_lock(&ftrace_sysctl_lock); + + ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + + if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) + goto out; + + last_ftrace_enabled = ftrace_enabled; + + if (ftrace_enabled) { + + ftrace_startup_sysctl(); + + /* we are starting ftrace again */ + if (ftrace_list != &ftrace_list_end) { + if (ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + else + ftrace_trace_function = ftrace_list_func; + } + + } else { + /* stopping ftrace calls (just send to ftrace_stub) */ + ftrace_trace_function = ftrace_stub; + + ftrace_shutdown_sysctl(); + } + + out: + mutex_unlock(&ftrace_sysctl_lock); + return ret; +} Index: linux-2.6.25.8-rt7/kernel/trace/trace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/trace/trace.c 2008-06-23 19:14:38.000000000 -0400 @@ -0,0 +1,3517 @@ +/* + * ring buffer based function tracer + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * Originally taken from the RT patch by: + * Arnaldo Carvalho de Melo + * + * Based on code from the latency_tracer, that is: + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "trace.h" + +unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; +unsigned long __read_mostly tracing_thresh; + +static unsigned long __read_mostly tracing_nr_buffers; +static cpumask_t __read_mostly tracing_buffer_mask; + +#define for_each_cpu_mask_nr(cpu, mask) for_each_cpu_mask(cpu, mask) +#define for_each_tracing_cpu(cpu) \ + for_each_cpu_mask_nr(cpu, tracing_buffer_mask) + +/* dummy trace to disable tracing */ +static struct tracer no_tracer __read_mostly = { + .name = "none", +}; + +static int trace_alloc_page(void); +static int trace_free_page(void); + +static int tracing_disabled = 1; + +static unsigned long tracing_pages_allocated; + +long +ns2usecs(cycle_t nsec) +{ + nsec += 500; + do_div(nsec, 1000); + return nsec; +} + +cycle_t ftrace_now(int cpu) +{ +// return cpu_clock(cpu); + return sched_clock(); +} + +/* + * The global_trace is the descriptor that holds the tracing + * buffers for the live tracing. 
For each CPU, it contains + * a linked list of pages that will store trace entries. The + * page descriptor of the pages in the memory is used to hold + * the linked list by linking the lru item in the page descriptor + * to each of the pages in the buffer per CPU. + * + * For each active CPU there is a data field that holds the + * pages for the buffer for that CPU. Each CPU has the same number + * of pages allocated for its buffer. + */ +static struct trace_array global_trace; + +static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); + +/* + * The max_tr is used to snapshot the global_trace when a maximum + * latency is reached. Some tracers will use this to store a maximum + * trace while it continues examining live traces. + * + * The buffers for the max_tr are set up the same as the global_trace. + * When a snapshot is taken, the linked list of the max_tr is swapped + * with the linked list of the global_trace and the buffers are reset for + * the global_trace so the tracing can continue. + */ +static struct trace_array max_tr; + +static DEFINE_PER_CPU(struct trace_array_cpu, max_data); + +/* tracer_enabled is used to toggle activation of a tracer */ +static int tracer_enabled = 1; + +/* + * trace_nr_entries is the number of entries that is allocated + * for a buffer. Note, the number of entries is always rounded + * to ENTRIES_PER_PAGE. + */ +static unsigned long trace_nr_entries = 65536UL; + +/* trace_types holds a linked list of available tracers. */ +static struct tracer *trace_types __read_mostly; + +/* current_trace points to the tracer that is currently active */ +static struct tracer *current_trace __read_mostly; + +/* + * max_tracer_type_len is used to simplify the allocating of + * buffers to read userspace tracer names. We keep track of + * the longest tracer name registered. + */ +static int max_tracer_type_len; + +/* + * trace_types_lock is used to protect the trace_types list. + * This lock is also used to keep user access serialized. + * Accesses from userspace will grab this lock while userspace + * activities happen inside the kernel. + */ +static DEFINE_MUTEX(trace_types_lock); + +/* trace_wait is a waitqueue for tasks blocked on trace_poll */ +static DECLARE_WAIT_QUEUE_HEAD(trace_wait); + +/* trace_flags holds iter_ctrl options */ +unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; + +/** + * trace_wake_up - wake up tasks waiting for trace input + * + * Simply wakes up any task that is blocked on the trace_wait + * queue. This is used with trace_poll for tasks polling the trace. + */ +void trace_wake_up(void) +{ + /* + * The runqueue_is_locked() can fail, but this is the best we + * have for now: + */ + if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) + wake_up(&trace_wait); +} + +#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) + +static int __init set_nr_entries(char *str) +{ + unsigned long nr_entries; + int ret; + + if (!str) + return 0; + ret = strict_strtoul(str, 0, &nr_entries); + /* nr_entries can not be zero */ + if (ret < 0 || nr_entries == 0) + return 0; + trace_nr_entries = nr_entries; + return 1; +} +__setup("trace_entries=", set_nr_entries); + +unsigned long nsecs_to_usecs(unsigned long nsecs) +{ + return nsecs / 1000; +} + +/* + * trace_flag_type is an enumeration that holds different + * states when a trace occurs. 
These are: + * IRQS_OFF - interrupts were disabled + * NEED_RESCHED - reschedule is requested + * HARDIRQ - inside an interrupt handler + * SOFTIRQ - inside a softirq handler + */ +enum trace_flag_type { + TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_NEED_RESCHED = 0x02, + TRACE_FLAG_HARDIRQ = 0x04, + TRACE_FLAG_SOFTIRQ = 0x08, +}; + +/* + * TRACE_ITER_SYM_MASK masks the options in trace_flags that + * control the output of kernel symbols. + */ +#define TRACE_ITER_SYM_MASK \ + (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) + +/* These must match the bit positions in trace_iterator_flags */ +static const char *trace_options[] = { + "print-parent", + "sym-offset", + "sym-addr", + "verbose", + "raw", + "hex", + "bin", + "block", + "stacktrace", + "sched-tree", + NULL +}; + +/* + * ftrace_max_lock is used to protect the swapping of buffers + * when taking a max snapshot. The buffers themselves are + * protected by per_cpu spinlocks. But the action of the swap + * needs its own lock. + * + * This is defined as a raw_spinlock_t in order to help + * with performance when lockdep debugging is enabled. + */ +static __raw_spinlock_t ftrace_max_lock = + (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. (this way the maximum trace is permanently saved, + * for later retrieval via /debugfs/tracing/latency_trace) + */ +static void +__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data = tr->data[cpu]; + + max_tr.cpu = cpu; + max_tr.time_start = data->preempt_timestamp; + + data = max_tr.data[cpu]; + data->saved_latency = tracing_max_latency; + + memcpy(data->comm, tsk->comm, TASK_COMM_LEN); + data->pid = tsk->pid; + data->uid = tsk->uid; + data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; + data->policy = tsk->policy; + data->rt_priority = tsk->rt_priority; + + /* record this task's comm */ + tracing_record_cmdline(current); +} + +#define CHECK_COND(cond) \ + if (unlikely(cond)) { \ + tracing_disabled = 1; \ + WARN_ON(1); \ + return -1; \ + } + +/** + * check_pages - integrity check of trace buffers + * + * As a safety measure we check to make sure the data pages have not + * been corrupted. + */ +int check_pages(struct trace_array_cpu *data) +{ + struct page *page, *tmp; + + CHECK_COND(data->trace_pages.next->prev != &data->trace_pages); + CHECK_COND(data->trace_pages.prev->next != &data->trace_pages); + + list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { + CHECK_COND(page->lru.next->prev != &page->lru); + CHECK_COND(page->lru.prev->next != &page->lru); + } + + return 0; +} + +/** + * head_page - page address of the first page in per_cpu buffer. + * + * head_page returns the page address of the first page in + * a per_cpu buffer. This also performs various consistency + * checks to make sure the buffer has not been corrupted. + */ +void *head_page(struct trace_array_cpu *data) +{ + struct page *page; + + if (list_empty(&data->trace_pages)) + return NULL; + + page = list_entry(data->trace_pages.next, struct page, lru); + BUG_ON(&page->lru == &data->trace_pages); + + return page_address(page); +} + +/** + * trace_seq_printf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formatting of a trace, + * trace_seq_printf is used to store strings into a special + * buffer (@s). 
Then the output may be either used by + * the sequencer or pulled into another buffer. + */ +int +trace_seq_printf(struct trace_seq *s, const char *fmt, ...) +{ + int len = (PAGE_SIZE - 1) - s->len; + va_list ap; + int ret; + + if (!len) + return 0; + + va_start(ap, fmt); + ret = vsnprintf(s->buffer + s->len, len, fmt, ap); + va_end(ap); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) + return 0; + + s->len += ret; + + return len; +} + +/** + * trace_seq_puts - trace sequence printing of simple string + * @s: trace sequence descriptor + * @str: simple string to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple string + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + */ +static int +trace_seq_puts(struct trace_seq *s, const char *str) +{ + int len = strlen(str); + + if (len > ((PAGE_SIZE - 1) - s->len)) + return 0; + + memcpy(s->buffer + s->len, str, len); + s->len += len; + + return len; +} + +static int +trace_seq_putc(struct trace_seq *s, unsigned char c) +{ + if (s->len >= (PAGE_SIZE - 1)) + return 0; + + s->buffer[s->len++] = c; + + return 1; +} + +static int +trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) +{ + if (len > ((PAGE_SIZE - 1) - s->len)) + return 0; + + memcpy(s->buffer + s->len, mem, len); + s->len += len; + + return len; +} + +#define HEX_CHARS 17 +static const char hex2asc[] = "0123456789abcdef"; + +static int +trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) +{ + unsigned char hex[HEX_CHARS]; + unsigned char *data = mem; + unsigned char byte; + int i, j; + + BUG_ON(len >= HEX_CHARS); + +#ifdef __BIG_ENDIAN + for (i = 0, j = 0; i < len; i++) { +#else + for (i = len-1, j = 0; i >= 0; i--) { +#endif + byte = data[i]; + + hex[j++] = hex2asc[byte & 0x0f]; + hex[j++] = hex2asc[byte >> 4]; + } + hex[j++] = ' '; + + return trace_seq_putmem(s, hex, j); +} + +static void +trace_seq_reset(struct trace_seq *s) +{ + s->len = 0; + s->readpos = 0; +} + +ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) +{ + int len; + int ret; + + if (s->len <= s->readpos) + return -EBUSY; + + len = s->len - s->readpos; + if (cnt > len) + cnt = len; + ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); + if (ret) + return -EFAULT; + + s->readpos += len; + return cnt; +} + +static void +trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ + int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; + + s->buffer[len] = 0; + seq_puts(m, s->buffer); + + trace_seq_reset(s); +} + +/* + * flip the trace buffers between two trace descriptors. + * This usually is the buffers between the global_trace and + * the max_tr to record a snapshot of a current trace. + * + * The ftrace_max_lock must be held. 
+ */ +static void +flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) +{ + struct list_head flip_pages; + + INIT_LIST_HEAD(&flip_pages); + + memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx, + sizeof(struct trace_array_cpu) - + offsetof(struct trace_array_cpu, trace_head_idx)); + + check_pages(tr1); + check_pages(tr2); + list_splice_init(&tr1->trace_pages, &flip_pages); + list_splice_init(&tr2->trace_pages, &tr1->trace_pages); + list_splice_init(&flip_pages, &tr2->trace_pages); + BUG_ON(!list_empty(&flip_pages)); + check_pages(tr1); + check_pages(tr2); +} + +/** + * update_max_tr - snapshot all trace buffers from global_trace to max_tr + * @tr: tracer + * @tsk: the task with the latency + * @cpu: The cpu that initiated the trace. + * + * Flip the buffers between the @tr and the max_tr and record information + * about which task was the cause of this latency. + */ +void +update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data; + int i; + + WARN_ON_ONCE(!irqs_disabled()); + __raw_spin_lock(&ftrace_max_lock); + /* clear out all the previous traces */ + for_each_tracing_cpu(i) { + data = tr->data[i]; + flip_trace(max_tr.data[i], data); + tracing_reset(data); + } + + __update_max_tr(tr, tsk, cpu); + __raw_spin_unlock(&ftrace_max_lock); +} + +/** + * update_max_tr_single - only copy one trace over, and reset the rest + * @tr - tracer + * @tsk - task with the latency + * @cpu - the cpu of the buffer to copy. + * + * Flip the trace of a single CPU buffer between the @tr and the max_tr. + */ +void +update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data = tr->data[cpu]; + int i; + + WARN_ON_ONCE(!irqs_disabled()); + __raw_spin_lock(&ftrace_max_lock); + for_each_tracing_cpu(i) + tracing_reset(max_tr.data[i]); + + flip_trace(max_tr.data[cpu], data); + tracing_reset(data); + + __update_max_tr(tr, tsk, cpu); + __raw_spin_unlock(&ftrace_max_lock); +} + +/** + * register_tracer - register a tracer with the ftrace system. + * @type - the plugin for the tracer + * + * Register a new plugin tracer. + */ +int register_tracer(struct tracer *type) +{ + struct tracer *t; + int len; + int ret = 0; + + if (!type->name) { + pr_info("Tracer must have a name\n"); + return -1; + } + + mutex_lock(&trace_types_lock); + for (t = trace_types; t; t = t->next) { + if (strcmp(type->name, t->name) == 0) { + /* already found */ + pr_info("Trace %s already registered\n", + type->name); + ret = -1; + goto out; + } + } + +#ifdef CONFIG_FTRACE_STARTUP_TEST + if (type->selftest) { + struct tracer *saved_tracer = current_trace; + struct trace_array_cpu *data; + struct trace_array *tr = &global_trace; + int saved_ctrl = tr->ctrl; + int i; + /* + * Run a selftest on this tracer. + * Here we reset the trace buffer, and set the current + * tracer to be this tracer. The tracer can then run some + * internal tracing to verify that everything is in order. + * If we fail, we do not register this tracer. 
+ */ + for_each_tracing_cpu(i) { + data = tr->data[i]; + if (!head_page(data)) + continue; + tracing_reset(data); + } + current_trace = type; + tr->ctrl = 0; + /* the test is responsible for initializing and enabling */ + pr_info("Testing tracer %s: ", type->name); + ret = type->selftest(type, tr); + /* the test is responsible for resetting too */ + current_trace = saved_tracer; + tr->ctrl = saved_ctrl; + if (ret) { + printk(KERN_CONT "FAILED!\n"); + goto out; + } + /* Only reset on passing, to avoid touching corrupted buffers */ + for_each_tracing_cpu(i) { + data = tr->data[i]; + if (!head_page(data)) + continue; + tracing_reset(data); + } + printk(KERN_CONT "PASSED\n"); + } +#endif + + type->next = trace_types; + trace_types = type; + len = strlen(type->name); + if (len > max_tracer_type_len) + max_tracer_type_len = len; + + out: + mutex_unlock(&trace_types_lock); + + return ret; +} + +void unregister_tracer(struct tracer *type) +{ + struct tracer **t; + int len; + + mutex_lock(&trace_types_lock); + for (t = &trace_types; *t; t = &(*t)->next) { + if (*t == type) + goto found; + } + pr_info("Trace %s not registered\n", type->name); + goto out; + + found: + *t = (*t)->next; + if (strlen(type->name) != max_tracer_type_len) + goto out; + + max_tracer_type_len = 0; + for (t = &trace_types; *t; t = &(*t)->next) { + len = strlen((*t)->name); + if (len > max_tracer_type_len) + max_tracer_type_len = len; + } + out: + mutex_unlock(&trace_types_lock); +} + +void tracing_reset(struct trace_array_cpu *data) +{ + data->trace_idx = 0; + data->overrun = 0; + data->trace_head = data->trace_tail = head_page(data); + data->trace_head_idx = 0; + data->trace_tail_idx = 0; +} + +#define SAVED_CMDLINES 128 +static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; +static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; +static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; +static int cmdline_idx; +static DEFINE_RAW_SPINLOCK(trace_cmdline_lock); + +/* temporary disable recording */ +atomic_t trace_record_cmdline_disabled __read_mostly; + +static void trace_init_cmdlines(void) +{ + memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline)); + memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid)); + cmdline_idx = 0; +} + +void trace_stop_cmdline_recording(void); + +static void trace_save_cmdline(struct task_struct *tsk) +{ + unsigned map; + unsigned idx; + + if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) + return; + + /* + * It's not the end of the world if we don't get + * the lock, but we also don't want to spin + * nor do we want to disable interrupts, + * so if we miss here, then better luck next time. 
+ */ + if (!spin_trylock(&trace_cmdline_lock)) + return; + + /* from the pid, find the index of the cmdline array */ + idx = map_pid_to_cmdline[tsk->pid]; + + if (idx >= SAVED_CMDLINES) { + /* this is new */ + idx = (cmdline_idx + 1) % SAVED_CMDLINES; + + /* check the reverse map and reset it if needed */ + map = map_cmdline_to_pid[idx]; + if (map <= PID_MAX_DEFAULT) + map_pid_to_cmdline[map] = (unsigned)-1; + + map_cmdline_to_pid[idx] = tsk->pid; + map_pid_to_cmdline[tsk->pid] = idx; + + cmdline_idx = idx; + } + + memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); + + spin_unlock(&trace_cmdline_lock); +} + +static char *trace_find_cmdline(int pid) +{ + char *cmdline = "<...>"; + unsigned map; + + if (!pid) + return ""; + + if (pid > PID_MAX_DEFAULT) + goto out; + + map = map_pid_to_cmdline[pid]; + if (map >= SAVED_CMDLINES) + goto out; + + if (map_cmdline_to_pid[map] != pid) + goto out; + + cmdline = saved_cmdlines[map]; + + out: + return cmdline; +} + +void tracing_record_cmdline(struct task_struct *tsk) +{ + if (atomic_read(&trace_record_cmdline_disabled)) + return; + + trace_save_cmdline(tsk); +} + +static inline struct list_head * +trace_next_list(struct trace_array_cpu *data, struct list_head *next) +{ + /* + * Roundrobin - but skip the head (which is not a real page): + */ + next = next->next; + if (unlikely(next == &data->trace_pages)) + next = next->next; + BUG_ON(next == &data->trace_pages); + + return next; +} + +static inline void * +trace_next_page(struct trace_array_cpu *data, void *addr) +{ + struct list_head *next; + struct page *page; + + page = virt_to_page(addr); + + next = trace_next_list(data, &page->lru); + page = list_entry(next, struct page, lru); + + return page_address(page); +} + +static inline struct trace_entry * +tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) +{ + unsigned long idx, idx_next; + struct trace_entry *entry; + + data->trace_idx++; + idx = data->trace_head_idx; + idx_next = idx + 1; + + BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE); + + entry = data->trace_head + idx * TRACE_ENTRY_SIZE; + + if (unlikely(idx_next >= ENTRIES_PER_PAGE)) { + data->trace_head = trace_next_page(data, data->trace_head); + idx_next = 0; + } + + if (data->trace_head == data->trace_tail && + idx_next == data->trace_tail_idx) { + /* overrun */ + data->overrun++; + data->trace_tail_idx++; + if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { + data->trace_tail = + trace_next_page(data, data->trace_tail); + data->trace_tail_idx = 0; + } + } + + data->trace_head_idx = idx_next; + + return entry; +} + +static inline void +tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) +{ + struct task_struct *tsk = current; + unsigned long pc; + + pc = preempt_count(); + + entry->preempt_count = pc & 0xff; + entry->pid = (tsk) ? tsk->pid : 0; + entry->t = ftrace_now(raw_smp_processor_id()); + entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | + ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | + ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + (need_resched() ? 
TRACE_FLAG_NEED_RESCHED : 0); +} + +void +trace_function(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long ip, unsigned long parent_ip, unsigned long flags) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_FN; + entry->fn.ip = ip; + entry->fn.parent_ip = parent_ip; + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); +} + +void +ftrace(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long ip, unsigned long parent_ip, unsigned long flags) +{ + if (likely(!atomic_read(&data->disabled))) + trace_function(tr, data, ip, parent_ip, flags); +} + +#ifdef CONFIG_MMIOTRACE +void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, + struct mmiotrace_rw *rw) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_MMIO_RW; + entry->mmiorw = *rw; + + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); + + trace_wake_up(); +} + +void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, + struct mmiotrace_map *map) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_MMIO_MAP; + entry->mmiomap = *map; + + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); + + trace_wake_up(); +} +#endif + +void __trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip) +{ + struct trace_entry *entry; + struct stack_trace trace; + + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_STACK; + + memset(&entry->stack, 0, sizeof(entry->stack)); + + trace.nr_entries = 0; + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.skip = skip; + trace.entries = entry->stack.caller; + + save_stack_trace(&trace); +} + +void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + struct trace_array_cpu *data = __data; + struct trace_array *tr = __tr; + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_SPECIAL; + entry->special.arg1 = arg1; + entry->special.arg2 = arg2; + entry->special.arg3 = arg3; + __trace_stack(tr, data, irq_flags, 4); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); + + trace_wake_up(); +} + +void +tracing_sched_switch_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_CTX; + entry->ctx.prev_pid = prev->pid; + entry->ctx.prev_prio = prev->prio; + entry->ctx.prev_state = prev->state; + entry->ctx.next_pid = 
next->pid; + entry->ctx.next_prio = next->prio; + entry->ctx.next_state = next->state; + __trace_stack(tr, data, flags, 5); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); +} + +void +tracing_sched_wakeup_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *wakee, + struct task_struct *curr, + unsigned long flags) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_WAKE; + entry->ctx.prev_pid = curr->pid; + entry->ctx.prev_prio = curr->prio; + entry->ctx.prev_state = curr->state; + entry->ctx.next_pid = wakee->pid; + entry->ctx.next_prio = wakee->prio; + entry->ctx.next_state = wakee->state; + __trace_stack(tr, data, flags, 6); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); + + trace_wake_up(); +} + +void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + __trace_special(tr, data, arg1, arg2, arg3); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +void tracing_event_irq(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + int irq, int usermode, + unsigned long retip) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_IRQ; + entry->irq.ip = ip; + entry->irq.irq = irq; + entry->irq.ret_ip = retip; + entry->irq.usermode = usermode; +} + +void tracing_event_fault(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + unsigned long retip, + unsigned long error_code, + unsigned long address) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_FAULT; + entry->fault.ip = ip; + entry->fault.ret_ip = retip; + entry->fault.errorcode = error_code; + entry->fault.address = address; +} + +void tracing_event_timer_set(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + ktime_t *expires, void *timer) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_TIMER_SET; + entry->timer.ip = ip; + entry->timer.expire = *expires; + entry->timer.timer = timer; +} + +void tracing_event_program_event(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + ktime_t *expires, int64_t *delta) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_PROGRAM_EVENT; + entry->program.ip = ip; + entry->program.expire = *expires; + entry->program.delta = *delta; +} + +void tracing_event_timer_triggered(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + ktime_t *expired, void *timer) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); 
+ tracing_generic_entry_update(entry, flags); + entry->type = TRACE_TIMER_TRIG; + entry->timer.ip = ip; + entry->timer.expire = *expired; + entry->timer.timer = timer; +} + +void tracing_event_timestamp(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + ktime_t *now) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_TIMESTAMP; + entry->timestamp.ip = ip; + entry->timestamp.now = *now; +} + +void tracing_event_task_activate(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + struct task_struct *p, + int task_cpu) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_TASK_ACT; + entry->task.ip = ip; + entry->task.pid = p->pid; + entry->task.prio = p->prio; + entry->task.cpu = task_cpu; +} + +void tracing_event_task_deactivate(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + struct task_struct *p, + int task_cpu) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_TASK_DEACT; + entry->task.ip = ip; + entry->task.pid = p->pid; + entry->task.prio = p->prio; + entry->task.cpu = task_cpu; +} + +void tracing_event_syscall(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + unsigned long nr, + unsigned long p1, + unsigned long p2, + unsigned long p3) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_SYSCALL; + entry->syscall.ip = ip; + entry->syscall.nr = nr; + entry->syscall.p1 = p1; + entry->syscall.p2 = p2; + entry->syscall.p3 = p3; +} + +void tracing_event_sysret(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + unsigned long ret) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_SYSRET; + entry->sysret.ip = ip; + entry->sysret.ret = ret; +} + +#ifdef CONFIG_FTRACE +static void +function_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu, resched; + + if (unlikely(!tracer_enabled)) + return; + + resched = need_resched(); + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + local_save_flags(flags); + trace_function(tr, data, ip, parent_ip, flags); + } + + atomic_dec(&data->disabled); + + /* + * To prevent recursion with schedule(), we look at the + * resched flag before disabling preemption. If it is already + * set, then we may be just calling schedule, and we don't + * want to call schedule again. But if it was not set, then + * it is fine to call schedule() if we need to. 
+ */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = function_trace_call, +}; + +void tracing_start_function_trace(void) +{ + register_ftrace_function(&trace_ops); + tracing_record_cmdline(current); +} + +void tracing_stop_function_trace(void) +{ + tracing_record_cmdline(current); + unregister_ftrace_function(&trace_ops); +} +#endif + +enum trace_file_type { + TRACE_FILE_LAT_FMT = 1, +}; + +static struct trace_entry * +trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, + struct trace_iterator *iter, int cpu) +{ + struct page *page; + struct trace_entry *array; + + if (iter->next_idx[cpu] >= tr->entries || + iter->next_idx[cpu] >= data->trace_idx || + (data->trace_head == data->trace_tail && + data->trace_head_idx == data->trace_tail_idx)) + return NULL; + + if (!iter->next_page[cpu]) { + /* Initialize the iterator for this cpu trace buffer */ + WARN_ON(!data->trace_tail); + page = virt_to_page(data->trace_tail); + iter->next_page[cpu] = &page->lru; + iter->next_page_idx[cpu] = data->trace_tail_idx; + } + + page = list_entry(iter->next_page[cpu], struct page, lru); + BUG_ON(&data->trace_pages == &page->lru); + + array = page_address(page); + + WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE); + return &array[iter->next_page_idx[cpu]]; +} + +static struct trace_entry * +find_next_entry(struct trace_iterator *iter, int *ent_cpu) +{ + struct trace_array *tr = iter->tr; + struct trace_entry *ent, *next = NULL; + int next_cpu = -1; + int cpu; + + for_each_tracing_cpu(cpu) { + if (!head_page(tr->data[cpu])) + continue; + ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); + /* + * Pick the entry with the smallest timestamp: + */ + if (ent && (!next || ent->t < next->t)) { + next = ent; + next_cpu = cpu; + } + } + + if (ent_cpu) + *ent_cpu = next_cpu; + + return next; +} + +static void trace_iterator_increment(struct trace_iterator *iter) +{ + iter->idx++; + iter->next_idx[iter->cpu]++; + iter->next_page_idx[iter->cpu]++; + + if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) { + struct trace_array_cpu *data = iter->tr->data[iter->cpu]; + + iter->next_page_idx[iter->cpu] = 0; + iter->next_page[iter->cpu] = + trace_next_list(data, iter->next_page[iter->cpu]); + } +} + +static void trace_consume(struct trace_iterator *iter) +{ + struct trace_array_cpu *data = iter->tr->data[iter->cpu]; + + data->trace_tail_idx++; + if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { + data->trace_tail = trace_next_page(data, data->trace_tail); + data->trace_tail_idx = 0; + } + + /* Check if we empty it, then reset the index */ + if (data->trace_head == data->trace_tail && + data->trace_head_idx == data->trace_tail_idx) + data->trace_idx = 0; +} + +static void *find_next_entry_inc(struct trace_iterator *iter) +{ + struct trace_entry *next; + int next_cpu = -1; + + next = find_next_entry(iter, &next_cpu); + + iter->prev_ent = iter->ent; + iter->prev_cpu = iter->cpu; + + iter->ent = next; + iter->cpu = next_cpu; + + if (next) + trace_iterator_increment(iter); + + return next ? 
iter : NULL; +} + +static void *s_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + void *last_ent = iter->ent; + int i = (int)*pos; + void *ent; + + (*pos)++; + + /* can't go backwards */ + if (iter->idx > i) + return NULL; + + if (iter->idx < 0) + ent = find_next_entry_inc(iter); + else + ent = iter; + + while (ent && iter->idx < i) + ent = find_next_entry_inc(iter); + + iter->pos = *pos; + + if (last_ent && !ent) + seq_puts(m, "\n\nvim:ft=help\n"); + + return ent; +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + void *p = NULL; + loff_t l = 0; + int i; + + mutex_lock(&trace_types_lock); + + if (!current_trace || current_trace != iter->trace) { + mutex_unlock(&trace_types_lock); + return NULL; + } + + atomic_inc(&trace_record_cmdline_disabled); + + /* let the tracer grab locks here if needed */ + if (current_trace->start) + current_trace->start(iter); + + if (*pos != iter->pos) { + iter->ent = NULL; + iter->cpu = 0; + iter->idx = -1; + iter->prev_ent = NULL; + iter->prev_cpu = -1; + + for_each_tracing_cpu(i) { + iter->next_idx[i] = 0; + iter->next_page[i] = NULL; + } + + for (p = iter; p && l < *pos; p = s_next(m, p, &l)) + ; + + } else { + l = *pos - 1; + p = s_next(m, p, &l); + } + + return p; +} + +static void s_stop(struct seq_file *m, void *p) +{ + struct trace_iterator *iter = m->private; + + atomic_dec(&trace_record_cmdline_disabled); + + /* let the tracer release locks here if needed */ + if (current_trace && current_trace == iter->trace && iter->trace->stop) + iter->trace->stop(iter); + + mutex_unlock(&trace_types_lock); +} + +static int +seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + + kallsyms_lookup(address, NULL, NULL, NULL, str); + + return trace_seq_printf(s, fmt, str); +#endif + return 1; +} + +static int +seq_print_sym_offset(struct trace_seq *s, const char *fmt, + unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + + sprint_symbol(str, address); + return trace_seq_printf(s, fmt, str); +#endif + return 1; +} + +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + +static int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) +{ + int ret; + + if (!ip) + return trace_seq_printf(s, "0"); + + if (sym_flags & TRACE_ITER_SYM_OFFSET) + ret = seq_print_sym_offset(s, "%s", ip); + else + ret = seq_print_sym_short(s, "%s", ip); + + if (!ret) + return 0; + + if (sym_flags & TRACE_ITER_SYM_ADDR) + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + return ret; +} + +static void print_lat_help_header(struct seq_file *m) +{ + seq_puts(m, "# _------=> CPU# \n"); + seq_puts(m, "# / _-----=> irqs-off \n"); + seq_puts(m, "# | / _----=> need-resched \n"); + seq_puts(m, "# || / _---=> hardirq/softirq \n"); + seq_puts(m, "# ||| / _--=> preempt-depth \n"); + seq_puts(m, "# |||| / \n"); + seq_puts(m, "# ||||| delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); +} + +static void print_func_help_header(struct seq_file *m) +{ + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | | |\n"); +} + + +static void +print_trace_header(struct seq_file *m, struct trace_iterator *iter) +{ + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_array *tr = iter->tr; + struct trace_array_cpu *data = tr->data[tr->cpu]; + 
struct tracer *type = current_trace; + unsigned long total = 0; + unsigned long entries = 0; + int cpu; + const char *name = "preemption"; + + if (type) + name = type->name; + + for_each_tracing_cpu(cpu) { + if (head_page(tr->data[cpu])) { + total += tr->data[cpu]->trace_idx; + if (tr->data[cpu]->trace_idx > tr->entries) + entries += tr->entries; + else + entries += tr->data[cpu]->trace_idx; + } + } + + seq_printf(m, "%s latency trace v1.1.5 on %s\n", + name, UTS_RELEASE); + seq_puts(m, "-----------------------------------" + "---------------------------------\n"); + seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |" + " (M:%s VP:%d, KP:%d, SP:%d HP:%d", + nsecs_to_usecs(data->saved_latency), + entries, + total, + tr->cpu, +#if defined(CONFIG_PREEMPT_NONE) + "server", +#elif defined(CONFIG_PREEMPT_VOLUNTARY) + "desktop", +#elif defined(CONFIG_PREEMPT_DESKTOP) + "preempt", +#else + "unknown", +#endif + /* These are reserved for later use */ + 0, 0, 0, 0); +#ifdef CONFIG_SMP + seq_printf(m, " #P:%d)\n", num_online_cpus()); +#else + seq_puts(m, ")\n"); +#endif + seq_puts(m, " -----------------\n"); + seq_printf(m, " | task: %.16s-%d " + "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", + data->comm, data->pid, data->uid, data->nice, + data->policy, data->rt_priority); + seq_puts(m, " -----------------\n"); + + if (data->critical_start) { + seq_puts(m, " => started at: "); + seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); + trace_print_seq(m, &iter->seq); + seq_puts(m, "\n => ended at: "); + seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); + trace_print_seq(m, &iter->seq); + seq_puts(m, "\n"); + } + + seq_puts(m, "\n"); +} + +static void +lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +{ + int hardirq, softirq; + char *comm; + + comm = trace_find_cmdline(entry->pid); + + trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); + trace_seq_printf(s, "%d", cpu); + trace_seq_printf(s, "%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 
'N' : '.')); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) { + trace_seq_putc(s, 'H'); + } else { + if (hardirq) { + trace_seq_putc(s, 'h'); + } else { + if (softirq) + trace_seq_putc(s, 's'); + else + trace_seq_putc(s, '.'); + } + } + + if (entry->preempt_count) + trace_seq_printf(s, "%x", entry->preempt_count); + else + trace_seq_puts(s, "."); +} + +unsigned long preempt_mark_thresh = 100; + +static void +lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, + unsigned long rel_usecs) +{ + trace_seq_printf(s, " %4lldus", abs_usecs); + if (rel_usecs > preempt_mark_thresh) + trace_seq_puts(s, "!: "); + else if (rel_usecs > 1) + trace_seq_puts(s, "+: "); + else + trace_seq_puts(s, " : "); +} + +static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; + +extern unsigned long sys_call_table[NR_syscalls]; + +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86) +extern unsigned long ia32_sys_call_table[], ia32_syscall_end[]; +# define IA32_NR_syscalls (ia32_syscall_end - ia32_sys_call_table) +#endif + +static int +print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) +{ + struct trace_seq *s = &iter->seq; + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_entry *next_entry = find_next_entry(iter, NULL); + unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); + struct trace_entry *entry = iter->ent; + unsigned long abs_usecs; + unsigned long rel_usecs; + unsigned long nr; + char *comm; + int S, T; + int i; + unsigned state; + + if (!next_entry) + next_entry = entry; + rel_usecs = ns2usecs(next_entry->t - entry->t); + abs_usecs = ns2usecs(entry->t - iter->tr->time_start); + + if (verbose) { + comm = trace_find_cmdline(entry->pid); + trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]" + " %ld.%03ldms (+%ld.%03ldms): ", + comm, + entry->pid, cpu, entry->flags, + entry->preempt_count, trace_idx, + ns2usecs(entry->t), + abs_usecs/1000, + abs_usecs % 1000, rel_usecs/1000, + rel_usecs % 1000); + } else { + lat_print_generic(s, entry, cpu); + lat_print_timestamp(s, abs_usecs, rel_usecs); + } + switch (entry->type) { + case TRACE_FN: + seq_print_ip_sym(s, entry->fn.ip, sym_flags); + trace_seq_puts(s, " ("); + seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags); + trace_seq_puts(s, ")\n"); + break; + case TRACE_CTX: + case TRACE_WAKE: + T = entry->ctx.next_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.next_state] : 'X'; + + state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0; + S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; + comm = trace_find_cmdline(entry->ctx.next_pid); + trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, entry->type == TRACE_CTX ? 
"==>" : " +", + entry->ctx.next_pid, + entry->ctx.next_prio, + T, comm); + break; + case TRACE_SPECIAL: + trace_seq_printf(s, "# %ld %ld %ld\n", + entry->special.arg1, + entry->special.arg2, + entry->special.arg3); + break; + case TRACE_STACK: + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (i) + trace_seq_puts(s, " <= "); + seq_print_ip_sym(s, entry->stack.caller[i], sym_flags); + } + trace_seq_puts(s, "\n"); + break; + case TRACE_IRQ: + seq_print_ip_sym(s, entry->irq.ip, sym_flags); + if (entry->irq.irq >= 0) + trace_seq_printf(s, " %d ", entry->irq.irq); + if (entry->irq.usermode) + trace_seq_puts(s, " (usermode)\n "); + else { + trace_seq_puts(s, " ("); + seq_print_ip_sym(s, entry->irq.ret_ip, sym_flags); + trace_seq_puts(s, ")\n"); + } + break; + case TRACE_FAULT: + seq_print_ip_sym(s, entry->fault.ip, sym_flags); + trace_seq_printf(s, " %lx ", entry->fault.errorcode); + trace_seq_puts(s, " ("); + seq_print_ip_sym(s, entry->fault.ret_ip, sym_flags); + trace_seq_puts(s, ")"); + trace_seq_printf(s, " [%lx]\n", entry->fault.address); + break; + case TRACE_TIMER_SET: + seq_print_ip_sym(s, entry->timer.ip, sym_flags); + trace_seq_printf(s, " (%Ld) (%p)\n", + entry->timer.expire, entry->timer.timer); + break; + case TRACE_TIMER_TRIG: + seq_print_ip_sym(s, entry->timer.ip, sym_flags); + trace_seq_printf(s, " (%Ld) (%p)\n", + entry->timer.expire, entry->timer.timer); + break; + case TRACE_TIMESTAMP: + seq_print_ip_sym(s, entry->timestamp.ip, sym_flags); + trace_seq_printf(s, " (%Ld)\n", + entry->timestamp.now.tv64); + break; + case TRACE_PROGRAM_EVENT: + seq_print_ip_sym(s, entry->program.ip, sym_flags); + trace_seq_printf(s, " (%Ld) (%Ld)\n", + entry->program.expire, entry->program.delta); + break; + case TRACE_TASK_ACT: + seq_print_ip_sym(s, entry->task.ip, sym_flags); + comm = trace_find_cmdline(entry->task.pid); + trace_seq_printf(s, " %s %d %d [%d]\n", + comm, entry->task.pid, + entry->task.prio, entry->task.cpu); + break; + case TRACE_TASK_DEACT: + seq_print_ip_sym(s, entry->task.ip, sym_flags); + comm = trace_find_cmdline(entry->task.pid); + trace_seq_printf(s, " %s %d %d [%d]\n", + comm, entry->task.pid, + entry->task.prio, entry->task.cpu); + break; + case TRACE_SYSCALL: + seq_print_ip_sym(s, entry->syscall.ip, sym_flags); + nr = entry->syscall.nr; + trace_seq_putc(s, ' '); +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86) + if (nr & 0x80000000) { + nr &= ~0x80000000; + if (nr < IA32_NR_syscalls) + seq_print_ip_sym(s, ia32_sys_call_table[nr], 0); + else + trace_seq_printf(s, "", nr); + } else +#endif + if (nr < NR_syscalls) + seq_print_ip_sym(s, sys_call_table[nr], 0); + else + trace_seq_printf(s, "", nr); + + trace_seq_printf(s, " (%lx %lx %lx)\n", + entry->syscall.p1, + entry->syscall.p2, + entry->syscall.p3); + break; + case TRACE_SYSRET: + seq_print_ip_sym(s, entry->sysret.ip, sym_flags); + trace_seq_printf(s, " < (%ld)\n", + entry->sysret.ret); + break; + default: + trace_seq_printf(s, "Unknown type %d\n", entry->type); + } + return 1; +} + +static int print_trace_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_entry *entry; + unsigned long usec_rem; + unsigned long long t; + unsigned long secs; + long nr; + char *comm; + int ret; + int S, T; + int i; + + entry = iter->ent; + + comm = trace_find_cmdline(iter->ent->pid); + + t = ns2usecs(entry->t); + usec_rem = do_div(t, 1000000ULL); + secs = (unsigned long)t; + + ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); 
+ if (!ret) + return 0; + ret = trace_seq_printf(s, "[%02d] ", iter->cpu); + if (!ret) + return 0; + ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); + if (!ret) + return 0; + + switch (entry->type) { + case TRACE_FN: + ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags); + if (!ret) + return 0; + if ((sym_flags & TRACE_ITER_PRINT_PARENT) && + entry->fn.parent_ip) { + ret = trace_seq_printf(s, " <-"); + if (!ret) + return 0; + ret = seq_print_ip_sym(s, entry->fn.parent_ip, + sym_flags); + if (!ret) + return 0; + } + ret = trace_seq_printf(s, "\n"); + if (!ret) + return 0; + break; + case TRACE_CTX: + case TRACE_WAKE: + S = entry->ctx.prev_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.prev_state] : 'X'; + T = entry->ctx.next_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.next_state] : 'X'; + ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, + entry->type == TRACE_CTX ? "==>" : " +", + entry->ctx.next_pid, + entry->ctx.next_prio, + T); + if (!ret) + return 0; + break; + case TRACE_SPECIAL: + ret = trace_seq_printf(s, "# %ld %ld %ld\n", + entry->special.arg1, + entry->special.arg2, + entry->special.arg3); + if (!ret) + return 0; + break; + case TRACE_STACK: + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (i) { + ret = trace_seq_puts(s, " <= "); + if (!ret) + return 0; + } + ret = seq_print_ip_sym(s, entry->stack.caller[i], + sym_flags); + if (!ret) + return 0; + } + ret = trace_seq_puts(s, "\n"); + if (!ret) + return 0; + break; + case TRACE_IRQ: + seq_print_ip_sym(s, entry->irq.ip, sym_flags); + if (entry->irq.irq >= 0) + trace_seq_printf(s, " %d ", entry->irq.irq); + if (entry->irq.usermode) + trace_seq_puts(s, " (usermode)\n "); + else { + trace_seq_puts(s, " ("); + seq_print_ip_sym(s, entry->irq.ret_ip, sym_flags); + trace_seq_puts(s, ")\n"); + } + break; + case TRACE_FAULT: + seq_print_ip_sym(s, entry->fault.ip, sym_flags); + trace_seq_printf(s, " %lx ", entry->fault.errorcode); + trace_seq_puts(s, " ("); + seq_print_ip_sym(s, entry->fault.ret_ip, sym_flags); + trace_seq_puts(s, ")"); + trace_seq_printf(s, " [%lx]\n", entry->fault.address); + break; + case TRACE_TIMER_SET: + seq_print_ip_sym(s, entry->timer.ip, sym_flags); + trace_seq_printf(s, " (%Ld) (%p)\n", + entry->timer.expire, entry->timer.timer); + break; + case TRACE_TIMER_TRIG: + seq_print_ip_sym(s, entry->timer.ip, sym_flags); + trace_seq_printf(s, " (%Ld) (%p)\n", + entry->timer.expire, entry->timer.timer); + break; + case TRACE_TIMESTAMP: + seq_print_ip_sym(s, entry->timestamp.ip, sym_flags); + trace_seq_printf(s, " (%Ld)\n", + entry->timestamp.now.tv64); + break; + case TRACE_PROGRAM_EVENT: + seq_print_ip_sym(s, entry->program.ip, sym_flags); + trace_seq_printf(s, " (%Ld) (%Ld)\n", + entry->program.expire, entry->program.delta); + break; + case TRACE_TASK_ACT: + seq_print_ip_sym(s, entry->task.ip, sym_flags); + comm = trace_find_cmdline(entry->task.pid); + trace_seq_printf(s, " %s %d %d [%d]\n", + comm, entry->task.pid, + entry->task.prio, entry->task.cpu); + break; + case TRACE_TASK_DEACT: + seq_print_ip_sym(s, entry->task.ip, sym_flags); + comm = trace_find_cmdline(entry->task.pid); + trace_seq_printf(s, " %s %d %d [%d]\n", + comm, entry->task.pid, + entry->task.prio, entry->task.cpu); + break; + case TRACE_SYSCALL: + seq_print_ip_sym(s, entry->syscall.ip, sym_flags); + nr = entry->syscall.nr; + trace_seq_putc(s, ' '); +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86) + if (nr & 0x80000000) { + nr &= ~0x80000000; + if (nr < 
IA32_NR_syscalls) + seq_print_ip_sym(s, ia32_sys_call_table[nr], 0); + else + trace_seq_printf(s, "", nr); + } else +#endif + if (nr < NR_syscalls) + seq_print_ip_sym(s, sys_call_table[nr], 0); + else + trace_seq_printf(s, "", nr); + + trace_seq_printf(s, " (%lx %lx %lx)\n", + entry->syscall.p1, + entry->syscall.p2, + entry->syscall.p3); + break; + case TRACE_SYSRET: + seq_print_ip_sym(s, entry->sysret.ip, sym_flags); + trace_seq_printf(s, "< (%ld)\n", + entry->sysret.ret); + break; + default: + trace_seq_printf(s, "Unknown type %d\n", entry->type); + } + return 1; +} + +static int print_raw_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry; + int ret; + int S, T; + + entry = iter->ent; + + ret = trace_seq_printf(s, "%d %d %llu ", + entry->pid, iter->cpu, entry->t); + if (!ret) + return 0; + + switch (entry->type) { + case TRACE_FN: + ret = trace_seq_printf(s, "%x %x\n", + entry->fn.ip, entry->fn.parent_ip); + if (!ret) + return 0; + break; + case TRACE_CTX: + case TRACE_WAKE: + S = entry->ctx.prev_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.prev_state] : 'X'; + T = entry->ctx.next_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.next_state] : 'X'; + if (entry->type == TRACE_WAKE) + S = '+'; + ret = trace_seq_printf(s, "%d %d %c %d %d %c\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, + entry->ctx.next_pid, + entry->ctx.next_prio, + T); + if (!ret) + return 0; + break; + case TRACE_SPECIAL: + case TRACE_STACK: + ret = trace_seq_printf(s, "# %ld %ld %ld\n", + entry->special.arg1, + entry->special.arg2, + entry->special.arg3); + if (!ret) + return 0; + break; + } + return 1; +} + +#define SEQ_PUT_FIELD_RET(s, x) \ +do { \ + if (!trace_seq_putmem(s, &(x), sizeof(x))) \ + return 0; \ +} while (0) + +#define SEQ_PUT_HEX_FIELD_RET(s, x) \ +do { \ + if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ + return 0; \ +} while (0) + +static int print_hex_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + unsigned char newline = '\n'; + struct trace_entry *entry; + int S, T; + + entry = iter->ent; + + SEQ_PUT_HEX_FIELD_RET(s, entry->pid); + SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); + SEQ_PUT_HEX_FIELD_RET(s, entry->t); + + switch (entry->type) { + case TRACE_FN: + SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip); + SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); + break; + case TRACE_CTX: + case TRACE_WAKE: + S = entry->ctx.prev_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.prev_state] : 'X'; + T = entry->ctx.next_state < sizeof(state_to_char) ? 
+ state_to_char[entry->ctx.next_state] : 'X'; + if (entry->type == TRACE_WAKE) + S = '+'; + SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid); + SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio); + SEQ_PUT_HEX_FIELD_RET(s, S); + SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid); + SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio); + SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); + SEQ_PUT_HEX_FIELD_RET(s, T); + break; + case TRACE_SPECIAL: + case TRACE_STACK: + SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1); + SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2); + SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3); + break; + } + SEQ_PUT_FIELD_RET(s, newline); + + return 1; +} + +static int print_bin_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry; + + entry = iter->ent; + + SEQ_PUT_FIELD_RET(s, entry->pid); + SEQ_PUT_FIELD_RET(s, entry->cpu); + SEQ_PUT_FIELD_RET(s, entry->t); + + switch (entry->type) { + case TRACE_FN: + SEQ_PUT_FIELD_RET(s, entry->fn.ip); + SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip); + break; + case TRACE_CTX: + SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid); + SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio); + SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state); + SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid); + SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); + SEQ_PUT_FIELD_RET(s, entry->ctx.next_state); + break; + case TRACE_SPECIAL: + case TRACE_STACK: + SEQ_PUT_FIELD_RET(s, entry->special.arg1); + SEQ_PUT_FIELD_RET(s, entry->special.arg2); + SEQ_PUT_FIELD_RET(s, entry->special.arg3); + break; + } + return 1; +} + +static int trace_empty(struct trace_iterator *iter) +{ + struct trace_array_cpu *data; + int cpu; + + for_each_tracing_cpu(cpu) { + data = iter->tr->data[cpu]; + + if (head_page(data) && data->trace_idx && + (data->trace_tail != data->trace_head || + data->trace_tail_idx != data->trace_head_idx)) + return 0; + } + return 1; +} + +static int print_trace_line(struct trace_iterator *iter) +{ + if (iter->trace && iter->trace->print_line) + return iter->trace->print_line(iter); + + if (trace_flags & TRACE_ITER_BIN) + return print_bin_fmt(iter); + + if (trace_flags & TRACE_ITER_HEX) + return print_hex_fmt(iter); + + if (trace_flags & TRACE_ITER_RAW) + return print_raw_fmt(iter); + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + return print_lat_fmt(iter, iter->idx, iter->cpu); + + return print_trace_fmt(iter); +} + +static int s_show(struct seq_file *m, void *v) +{ + struct trace_iterator *iter = v; + + if (iter->ent == NULL) { + if (iter->tr) { + seq_printf(m, "# tracer: %s\n", iter->trace->name); + seq_puts(m, "#\n"); + } + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return 0; + print_trace_header(m, iter); + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); + } else { + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_func_help_header(m); + } + } else { + print_trace_line(iter); + trace_print_seq(m, &iter->seq); + } + + return 0; +} + +static struct seq_operations tracer_seq_ops = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static struct trace_iterator * +__tracing_open(struct inode *inode, struct file *file, int *ret) +{ + struct trace_iterator *iter; + + if (tracing_disabled) { + *ret = -ENODEV; + return NULL; + } + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + *ret = -ENOMEM; + goto out; + } + + mutex_lock(&trace_types_lock); + if (current_trace && current_trace->print_max) + iter->tr = &max_tr; + else + 
iter->tr = inode->i_private; + iter->trace = current_trace; + iter->pos = -1; + + /* TODO stop tracer */ + *ret = seq_open(file, &tracer_seq_ops); + if (!*ret) { + struct seq_file *m = file->private_data; + m->private = iter; + + /* stop the trace while dumping */ + if (iter->tr->ctrl) + tracer_enabled = 0; + + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + } else { + kfree(iter); + iter = NULL; + } + mutex_unlock(&trace_types_lock); + + out: + return iter; +} + +int tracing_open_generic(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + filp->private_data = inode->i_private; + return 0; +} + +int tracing_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct trace_iterator *iter = m->private; + + mutex_lock(&trace_types_lock); + if (iter->trace && iter->trace->close) + iter->trace->close(iter); + + /* reenable tracing if it was previously enabled */ + if (iter->tr->ctrl) + tracer_enabled = 1; + mutex_unlock(&trace_types_lock); + + seq_release(inode, file); + kfree(iter); + return 0; +} + +static int tracing_open(struct inode *inode, struct file *file) +{ + int ret; + + __tracing_open(inode, file, &ret); + + return ret; +} + +static int tracing_lt_open(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter; + int ret; + + iter = __tracing_open(inode, file, &ret); + + if (!ret) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + + return ret; +} + + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct tracer *t = m->private; + + (*pos)++; + + if (t) + t = t->next; + + m->private = t; + + return t; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct tracer *t = m->private; + loff_t l = 0; + + mutex_lock(&trace_types_lock); + for (; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void t_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&trace_types_lock); +} + +static int t_show(struct seq_file *m, void *v) +{ + struct tracer *t = v; + + if (!t) + return 0; + + seq_printf(m, "%s", t->name); + if (t->next) + seq_putc(m, ' '); + else + seq_putc(m, '\n'); + + return 0; +} + +static struct seq_operations show_traces_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int show_traces_open(struct inode *inode, struct file *file) +{ + int ret; + + if (tracing_disabled) + return -ENODEV; + + ret = seq_open(file, &show_traces_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = trace_types; + } + + return ret; +} + +static struct file_operations tracing_fops = { + .open = tracing_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_release, +}; + +static struct file_operations tracing_lt_fops = { + .open = tracing_lt_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_release, +}; + +static struct file_operations show_traces_fops = { + .open = show_traces_open, + .read = seq_read, + .release = seq_release, +}; + +/* + * Only trace on a CPU if the bitmask is set: + */ +static cpumask_t tracing_cpumask = CPU_MASK_ALL; + +/* + * When tracing/tracing_cpu_mask is modified then this holds + * the new bitmask we are about to install: + */ +static cpumask_t tracing_cpumask_new; + +/* + * The tracer itself will not take this lock, but still we want + * to provide a consistent cpumask to user-space: + */ +static DEFINE_MUTEX(tracing_cpumask_update_lock); + +/* + * Temporary storage for the character 
representation of the + * CPU bitmask (and one more byte for the newline): + */ +static char mask_str[NR_CPUS + 1]; + +static ssize_t +tracing_cpumask_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + int len; + + mutex_lock(&tracing_cpumask_update_lock); + + len = cpumask_scnprintf(mask_str, count, tracing_cpumask); + if (count - len < 2) { + count = -EINVAL; + goto out_err; + } + len += sprintf(mask_str + len, "\n"); + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); + +out_err: + mutex_unlock(&tracing_cpumask_update_lock); + + return count; +} + +static ssize_t +tracing_cpumask_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + int err, cpu; + + mutex_lock(&tracing_cpumask_update_lock); + err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); + if (err) + goto err_unlock; + + raw_local_irq_disable(); + __raw_spin_lock(&ftrace_max_lock); + for_each_tracing_cpu(cpu) { + /* + * Increase/decrease the disabled counter if we are + * about to flip a bit in the cpumask: + */ + if (cpu_isset(cpu, tracing_cpumask) && + !cpu_isset(cpu, tracing_cpumask_new)) { + atomic_inc(&global_trace.data[cpu]->disabled); + } + if (!cpu_isset(cpu, tracing_cpumask) && + cpu_isset(cpu, tracing_cpumask_new)) { + atomic_dec(&global_trace.data[cpu]->disabled); + } + } + __raw_spin_unlock(&ftrace_max_lock); + raw_local_irq_enable(); + + tracing_cpumask = tracing_cpumask_new; + + mutex_unlock(&tracing_cpumask_update_lock); + + return count; + +err_unlock: + mutex_unlock(&tracing_cpumask_update_lock); + + return err; +} + +static struct file_operations tracing_cpumask_fops = { + .open = tracing_open_generic, + .read = tracing_cpumask_read, + .write = tracing_cpumask_write, +}; + +static ssize_t +tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf; + int r = 0; + int len = 0; + int i; + + /* calulate max size */ + for (i = 0; trace_options[i]; i++) { + len += strlen(trace_options[i]); + len += 3; /* "no" and space */ + } + + /* +2 for \n and \0 */ + buf = kmalloc(len + 2, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + for (i = 0; trace_options[i]; i++) { + if (trace_flags & (1 << i)) + r += sprintf(buf + r, "%s ", trace_options[i]); + else + r += sprintf(buf + r, "no%s ", trace_options[i]); + } + + r += sprintf(buf + r, "\n"); + WARN_ON(r >= len + 2); + + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + + kfree(buf); + + return r; +} + +static ssize_t +tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp = buf; + int neg = 0; + int i; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "no", 2) == 0) { + neg = 1; + cmp += 2; + } + + for (i = 0; trace_options[i]; i++) { + int len = strlen(trace_options[i]); + + if (strncmp(cmp, trace_options[i], len) == 0) { + if (neg) + trace_flags &= ~(1 << i); + else + trace_flags |= (1 << i); + break; + } + } + /* + * If no option could be set, return an error: + */ + if (!trace_options[i]) + return -EINVAL; + + filp->f_pos += cnt; + + return cnt; +} + +static struct file_operations tracing_iter_fops = { + .open = tracing_open_generic, + .read = tracing_iter_ctrl_read, + .write = tracing_iter_ctrl_write, +}; + +static const char readme_msg[] = + "tracing mini-HOWTO:\n\n" + "# mkdir /debug\n" + "# mount -t debugfs nodev /debug\n\n" + "# cat /debug/tracing/available_tracers\n" 
+ "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n" + "# cat /debug/tracing/current_tracer\n" + "none\n" + "# echo sched_switch > /debug/tracing/current_tracer\n" + "# cat /debug/tracing/current_tracer\n" + "sched_switch\n" + "# cat /debug/tracing/iter_ctrl\n" + "noprint-parent nosym-offset nosym-addr noverbose\n" + "# echo print-parent > /debug/tracing/iter_ctrl\n" + "# echo 1 > /debug/tracing/tracing_enabled\n" + "# cat /debug/tracing/trace > /tmp/trace.txt\n" + "echo 0 > /debug/tracing/tracing_enabled\n" +; + +static ssize_t +tracing_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static struct file_operations tracing_readme_fops = { + .open = tracing_open_generic, + .read = tracing_readme_read, +}; + +static ssize_t +tracing_ctrl_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = sprintf(buf, "%ld\n", tr->ctrl); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_ctrl_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + + mutex_lock(&trace_types_lock); + if (tr->ctrl ^ val) { + if (val) + tracer_enabled = 1; + else + tracer_enabled = 0; + + tr->ctrl = val; + + if (current_trace && current_trace->ctrl_update) + current_trace->ctrl_update(tr); + } + mutex_unlock(&trace_types_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static ssize_t +tracing_set_trace_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[max_tracer_type_len+2]; + int r; + + mutex_lock(&trace_types_lock); + if (current_trace) + r = sprintf(buf, "%s\n", current_trace->name); + else + r = sprintf(buf, "\n"); + mutex_unlock(&trace_types_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_set_trace_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = &global_trace; + struct tracer *t; + char buf[max_tracer_type_len+1]; + int i; + + if (cnt > max_tracer_type_len) + cnt = max_tracer_type_len; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* strip ending whitespace. */ + for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) + buf[i] = 0; + + mutex_lock(&trace_types_lock); + for (t = trace_types; t; t = t->next) { + if (strcmp(t->name, buf) == 0) + break; + } + if (!t || t == current_trace) + goto out; + + if (current_trace && current_trace->reset) + current_trace->reset(tr); + + current_trace = t; + if (t->init) + t->init(tr); + + out: + mutex_unlock(&trace_types_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static ssize_t +tracing_max_lat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long *ptr = filp->private_data; + char buf[64]; + int r; + + r = snprintf(buf, sizeof(buf), "%ld\n", + *ptr == (unsigned long)-1 ? 
-1 : nsecs_to_usecs(*ptr)); + if (r > sizeof(buf)) + r = sizeof(buf); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_max_lat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + long *ptr = filp->private_data; + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + *ptr = val * 1000; + + return cnt; +} + +static atomic_t tracing_reader; + +static int tracing_open_pipe(struct inode *inode, struct file *filp) +{ + struct trace_iterator *iter; + + if (tracing_disabled) + return -ENODEV; + + /* We only allow for reader of the pipe */ + if (atomic_inc_return(&tracing_reader) != 1) { + atomic_dec(&tracing_reader); + return -EBUSY; + } + + /* create a buffer to store the information to pass to userspace */ + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + mutex_lock(&trace_types_lock); + iter->tr = &global_trace; + iter->trace = current_trace; + filp->private_data = iter; + + if (iter->trace->pipe_open) + iter->trace->pipe_open(iter); + mutex_unlock(&trace_types_lock); + + return 0; +} + +static int tracing_release_pipe(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter = file->private_data; + + kfree(iter); + atomic_dec(&tracing_reader); + + return 0; +} + +static unsigned int +tracing_poll_pipe(struct file *filp, poll_table *poll_table) +{ + struct trace_iterator *iter = filp->private_data; + + if (trace_flags & TRACE_ITER_BLOCK) { + /* + * Always select as readable when in blocking mode + */ + return POLLIN | POLLRDNORM; + } else { + if (!trace_empty(iter)) + return POLLIN | POLLRDNORM; + poll_wait(filp, &trace_wait, poll_table); + if (!trace_empty(iter)) + return POLLIN | POLLRDNORM; + + return 0; + } +} + +/* + * Consumer reader. + */ +static ssize_t +tracing_read_pipe(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_iterator *iter = filp->private_data; + struct trace_array_cpu *data; + static cpumask_t mask; + unsigned long flags; +#ifdef CONFIG_FTRACE + int ftrace_save; +#endif + int cpu; + ssize_t sret; + + /* return any leftover data */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (sret != -EBUSY) + return sret; + sret = 0; + + trace_seq_reset(&iter->seq); + + mutex_lock(&trace_types_lock); + if (iter->trace->read) { + sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); + if (sret) + goto out; + } + + while (trace_empty(iter)) { + + if ((filp->f_flags & O_NONBLOCK)) { + sret = -EAGAIN; + goto out; + } + + /* + * This is a make-shift waitqueue. The reason we don't use + * an actual wait queue is because: + * 1) we only ever have one waiter + * 2) the tracing, traces all functions, we don't want + * the overhead of calling wake_up and friends + * (and tracing them too) + * Anyway, this is really very primitive wakeup. + */ + set_current_state(TASK_INTERRUPTIBLE); + iter->tr->waiter = current; + + mutex_unlock(&trace_types_lock); + + /* sleep for 100 msecs, and try again. */ + schedule_timeout(HZ/10); + + mutex_lock(&trace_types_lock); + + iter->tr->waiter = NULL; + + if (signal_pending(current)) { + sret = -EINTR; + goto out; + } + + if (iter->trace != current_trace) + goto out; + + /* + * We block until we read something and tracing is disabled. + * We still block if tracing is disabled, but we have never + * read anything. 
This allows a user to cat this file, and + * then enable tracing. But after we have read something, + * we give an EOF when tracing is again disabled. + * + * iter->pos will be 0 if we haven't read anything. + */ + if (!tracer_enabled && iter->pos) + break; + + continue; + } + + /* stop when tracing is finished */ + if (trace_empty(iter)) + goto out; + + if (cnt >= PAGE_SIZE) + cnt = PAGE_SIZE - 1; + + /* reset all but tr, trace, and overruns */ + memset(&iter->seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + iter->pos = -1; + + /* + * We need to stop all tracing on all CPUS to read the + * the next buffer. This is a bit expensive, but is + * not done often. We fill all what we can read, + * and then release the locks again. + */ + + cpus_clear(mask); + local_irq_save(flags); +#ifdef CONFIG_FTRACE + ftrace_save = ftrace_enabled; + ftrace_enabled = 0; +#endif + smp_wmb(); + for_each_tracing_cpu(cpu) { + data = iter->tr->data[cpu]; + + if (!head_page(data) || !data->trace_idx) + continue; + + atomic_inc(&data->disabled); + cpu_set(cpu, mask); + } + + for_each_cpu_mask_nr(cpu, mask) { + data = iter->tr->data[cpu]; + __raw_spin_lock(&data->lock); + + if (data->overrun > iter->last_overrun[cpu]) + iter->overrun[cpu] += + data->overrun - iter->last_overrun[cpu]; + iter->last_overrun[cpu] = data->overrun; + } + + while (find_next_entry_inc(iter) != NULL) { + int ret; + int len = iter->seq.len; + + ret = print_trace_line(iter); + if (!ret) { + /* don't print partial lines */ + iter->seq.len = len; + break; + } + + trace_consume(iter); + + if (iter->seq.len >= cnt) + break; + } + + for_each_cpu_mask_nr(cpu, mask) { + data = iter->tr->data[cpu]; + __raw_spin_unlock(&data->lock); + } + + for_each_cpu_mask_nr(cpu, mask) { + data = iter->tr->data[cpu]; + atomic_dec(&data->disabled); + } +#ifdef CONFIG_FTRACE + ftrace_enabled = ftrace_save; +#endif + local_irq_restore(flags); + + /* Now copy what we have to the user */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (iter->seq.readpos >= iter->seq.len) + trace_seq_reset(&iter->seq); + if (sret == -EBUSY) + sret = 0; + +out: + mutex_unlock(&trace_types_lock); + + return sret; +} + +static ssize_t +tracing_entries_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = sprintf(buf, "%lu\n", tr->entries); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_entries_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + char buf[64]; + int i, ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + /* must have at least 1 entry */ + if (!val) + return -EINVAL; + + mutex_lock(&trace_types_lock); + + if (current_trace != &no_tracer) { + cnt = -EBUSY; + pr_info("ftrace: set current_tracer to none" + " before modifying buffer size\n"); + goto out; + } + + if (val > global_trace.entries) { + long pages_requested; + unsigned long freeable_pages; + + /* make sure we have enough memory before mapping */ + pages_requested = + (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE; + + /* account for each buffer (and max_tr) */ + pages_requested *= tracing_nr_buffers * 2; + + /* Check for overflow */ + if (pages_requested < 0) { + cnt = -ENOMEM; + goto out; + } + + freeable_pages = 
determine_dirtyable_memory(); + + /* we only allow to request 1/4 of useable memory */ + if (pages_requested > + ((freeable_pages + tracing_pages_allocated) / 4)) { + cnt = -ENOMEM; + goto out; + } + + while (global_trace.entries < val) { + if (trace_alloc_page()) { + cnt = -ENOMEM; + goto out; + } + /* double check that we don't go over the known pages */ + if (tracing_pages_allocated > pages_requested) + break; + } + + } else { + /* include the number of entries in val (inc of page entries) */ + while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1)) + trace_free_page(); + } + + /* check integrity */ + for_each_tracing_cpu(i) + check_pages(global_trace.data[i]); + + filp->f_pos += cnt; + + /* If check pages failed, return ENOMEM */ + if (tracing_disabled) + cnt = -ENOMEM; + out: + max_tr.entries = global_trace.entries; + mutex_unlock(&trace_types_lock); + + return cnt; +} + +static struct file_operations tracing_max_lat_fops = { + .open = tracing_open_generic, + .read = tracing_max_lat_read, + .write = tracing_max_lat_write, +}; + +static struct file_operations tracing_ctrl_fops = { + .open = tracing_open_generic, + .read = tracing_ctrl_read, + .write = tracing_ctrl_write, +}; + +static struct file_operations set_tracer_fops = { + .open = tracing_open_generic, + .read = tracing_set_trace_read, + .write = tracing_set_trace_write, +}; + +static struct file_operations tracing_pipe_fops = { + .open = tracing_open_pipe, + .poll = tracing_poll_pipe, + .read = tracing_read_pipe, + .release = tracing_release_pipe, +}; + +static struct file_operations tracing_entries_fops = { + .open = tracing_open_generic, + .read = tracing_entries_read, + .write = tracing_entries_write, +}; + +#ifdef CONFIG_DYNAMIC_FTRACE + +static ssize_t +tracing_read_long(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long *p = filp->private_data; + char buf[64]; + int r; + + r = sprintf(buf, "%ld\n", *p); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static struct file_operations tracing_read_long_fops = { + .open = tracing_open_generic, + .read = tracing_read_long, +}; +#endif + +static struct dentry *d_tracer; + +struct dentry *tracing_init_dentry(void) +{ + static int once; + + if (d_tracer) + return d_tracer; + + d_tracer = debugfs_create_dir("tracing", NULL); + + if (!d_tracer && !once) { + once = 1; + pr_warning("Could not create debugfs directory 'tracing'\n"); + return NULL; + } + + return d_tracer; +} + +#ifdef CONFIG_FTRACE_SELFTEST +/* Let selftest have access to static functions in this file */ +#include "trace_selftest.c" +#endif + +static __init void tracer_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("tracing_enabled", 0644, d_tracer, + &global_trace, &tracing_ctrl_fops); + if (!entry) + pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); + + entry = debugfs_create_file("iter_ctrl", 0644, d_tracer, + NULL, &tracing_iter_fops); + if (!entry) + pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); + + entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, + NULL, &tracing_cpumask_fops); + if (!entry) + pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); + + entry = debugfs_create_file("latency_trace", 0444, d_tracer, + &global_trace, &tracing_lt_fops); + if (!entry) + pr_warning("Could not create debugfs 'latency_trace' entry\n"); + + entry = debugfs_create_file("trace", 0444, d_tracer, + &global_trace, 
&tracing_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'trace' entry\n");
+
+	entry = debugfs_create_file("available_tracers", 0444, d_tracer,
+				    &global_trace, &show_traces_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'available_tracers' entry\n");
+
+	entry = debugfs_create_file("current_tracer", 0444, d_tracer,
+				    &global_trace, &set_tracer_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'current_tracer' entry\n");
+
+	entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
+				    &tracing_max_latency,
+				    &tracing_max_lat_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_max_latency' entry\n");
+
+	entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
+				    &tracing_thresh, &tracing_max_lat_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_thresh' entry\n");
+
+	entry = debugfs_create_file("README", 0644, d_tracer,
+				    NULL, &tracing_readme_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'README' entry\n");
+
+	entry = debugfs_create_file("trace_pipe", 0644, d_tracer,
+				    NULL, &tracing_pipe_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'trace_pipe' entry\n");
+
+	entry = debugfs_create_file("trace_entries", 0644, d_tracer,
+				    &global_trace, &tracing_entries_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'trace_entries' entry\n");
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+				    &ftrace_update_tot_cnt,
+				    &tracing_read_long_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'dyn_ftrace_total_info' entry\n");
+#endif
+#ifdef CONFIG_SYSPROF_TRACER
+	init_tracer_sysprof_debugfs(d_tracer);
+#endif
+}
+
+/**
+ * ftrace_stop - called when we need to drastically disable the tracer.
+ */
+void ftrace_stop(void)
+{
+	struct tracer *saved_tracer = current_trace;
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	int i;
+
+	__ftrace_kill();
+	for_each_tracing_cpu(i) {
+		data = tr->data[i];
+		atomic_inc(&data->disabled);
+	}
+	tracer_enabled = 0;
+
+	/*
+	 * TODO: make a safe method to ctrl_update.
+	 * ctrl_update may schedule, but currently only
+	 * does so when ftrace is enabled.
+	 */
+	if (tr->ctrl) {
+		tr->ctrl = 0;
+		if (saved_tracer && saved_tracer->ctrl_update)
+			saved_tracer->ctrl_update(tr);
+	}
+}
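/*
 * A small standalone sketch (not part of the patch itself) of the sizing
 * arithmetic that tracing_entries_write() performs before growing the
 * buffers one page at a time via trace_alloc_page() below.  page_budget()
 * is a hypothetical helper; it assumes ENTRIES_PER_PAGE works out to
 * PAGE_SIZE divided by the trace entry size (the real macro lives earlier
 * in trace.c and is not shown here), and doubles the count because every
 * CPU buffer has a max_tr shadow.  The write handler additionally rejects
 * requests beyond roughly a quarter of the memory that
 * determine_dirtyable_memory() reports as freeable.
 */
#include <stdio.h>

#define SKETCH_PAGE_SIZE	4096UL
#define SKETCH_ENTRY_SIZE	64UL	/* illustrative stand-in for TRACE_ENTRY_SIZE */
#define SKETCH_ENTRIES_PER_PAGE	(SKETCH_PAGE_SIZE / SKETCH_ENTRY_SIZE)

static long page_budget(unsigned long requested_entries, int nr_cpu_buffers)
{
	long pages;

	/* round the request up to whole pages, as tracing_entries_write() does */
	pages = (requested_entries + SKETCH_ENTRIES_PER_PAGE - 1) /
		SKETCH_ENTRIES_PER_PAGE;

	/* one page per CPU buffer, doubled for the max-latency copy */
	return pages * nr_cpu_buffers * 2;
}

int main(void)
{
	unsigned long want = 65536;	/* entries requested through trace_entries */
	int cpus = 4;

	printf("%lu entries on %d CPUs -> %ld pages (max_tr included)\n",
	       want, cpus, page_budget(want, cpus));
	return 0;
}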
+
+static int trace_alloc_page(void)
+{
+	struct trace_array_cpu *data;
+	struct page *page, *tmp;
+	LIST_HEAD(pages);
+	void *array;
+	unsigned pages_allocated = 0;
+	int i;
+
+	/* first allocate a page for each CPU */
+	for_each_tracing_cpu(i) {
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page "
+			       "for trace buffer!\n");
+			goto free_pages;
+		}
+
+		pages_allocated++;
+		page = virt_to_page(array);
+		list_add(&page->lru, &pages);
+
+/* Only allocate if we are actually using the max trace */
+#ifdef CONFIG_TRACER_MAX_TRACE
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page "
+			       "for trace buffer!\n");
+			goto free_pages;
+		}
+		pages_allocated++;
+		page = virt_to_page(array);
+		list_add(&page->lru, &pages);
+#endif
+	}
+
+	/* Now that we successfully allocate a page per CPU, add them */
+	for_each_tracing_cpu(i) {
+		data = global_trace.data[i];
+		page = list_entry(pages.next, struct page, lru);
+		list_del_init(&page->lru);
+		list_add_tail(&page->lru, &data->trace_pages);
+		ClearPageLRU(page);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		page = list_entry(pages.next, struct page, lru);
+		list_del_init(&page->lru);
+		list_add_tail(&page->lru, &data->trace_pages);
+		SetPageLRU(page);
+#endif
+	}
+	tracing_pages_allocated += pages_allocated;
+	global_trace.entries += ENTRIES_PER_PAGE;
+
+	return 0;
+
+ free_pages:
+	list_for_each_entry_safe(page, tmp, &pages, lru) {
+		list_del_init(&page->lru);
+		__free_page(page);
+	}
+	return -ENOMEM;
+}
+
+static int trace_free_page(void)
+{
+	struct trace_array_cpu *data;
+	struct page *page;
+	struct list_head *p;
+	int i;
+	int ret = 0;
+
+	/* free one page from each buffer */
+	for_each_tracing_cpu(i) {
+		data = global_trace.data[i];
+		p = data->trace_pages.next;
+		if (p == &data->trace_pages) {
+			/* should never happen */
+			WARN_ON(1);
+			tracing_disabled = 1;
+			ret = -1;
+			break;
+		}
+		page = list_entry(p, struct page, lru);
+		ClearPageLRU(page);
+		list_del(&page->lru);
+		tracing_pages_allocated--;
+		tracing_pages_allocated--;
+		__free_page(page);
+
+		tracing_reset(data);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		p = data->trace_pages.next;
+		if (p == &data->trace_pages) {
+			/* should never happen */
+			WARN_ON(1);
+			tracing_disabled = 1;
+			ret = -1;
+			break;
+		}
+		page = list_entry(p, struct page, lru);
+		ClearPageLRU(page);
+		list_del(&page->lru);
+		__free_page(page);
+
+		tracing_reset(data);
+#endif
+	}
+	global_trace.entries -= ENTRIES_PER_PAGE;
+
+	return ret;
+}
+
+__init static int tracer_alloc_buffers(void)
+{
+	struct trace_array_cpu *data;
+	void *array;
+	struct page *page;
+	int pages = 0;
+	int ret = -ENOMEM;
+	int i;
+
+	/* TODO: make the number of buffers hot pluggable with CPUS */
+	tracing_nr_buffers = num_possible_cpus();
+	tracing_buffer_mask = cpu_possible_map;
+
+	/* Allocate the first page for all buffers */
+	for_each_tracing_cpu(i) {
+		data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
+		max_tr.data[i] = &per_cpu(max_data, i);
+
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page "
+			       "for trace buffer!\n");
+			goto free_buffers;
+		}
+
+		/* set the array to the list */
+		INIT_LIST_HEAD(&data->trace_pages);
+		page = virt_to_page(array);
+		list_add(&page->lru,
&data->trace_pages); + /* use the LRU flag to differentiate the two buffers */ + ClearPageLRU(page); + + data->lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + max_tr.data[i]->lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +/* Only allocate if we are actually using the max trace */ +#ifdef CONFIG_TRACER_MAX_TRACE + array = (void *)__get_free_page(GFP_KERNEL); + if (array == NULL) { + printk(KERN_ERR "tracer: failed to allocate page" + "for trace buffer!\n"); + goto free_buffers; + } + + INIT_LIST_HEAD(&max_tr.data[i]->trace_pages); + page = virt_to_page(array); + list_add(&page->lru, &max_tr.data[i]->trace_pages); + SetPageLRU(page); +#endif + } + + /* + * Since we allocate by orders of pages, we may be able to + * round up a bit. + */ + global_trace.entries = ENTRIES_PER_PAGE; + pages++; + + while (global_trace.entries < trace_nr_entries) { + if (trace_alloc_page()) + break; + pages++; + } + max_tr.entries = global_trace.entries; + + pr_info("tracer: %d pages allocated for %ld", + pages, trace_nr_entries); + pr_info(" entries of %ld bytes\n", (long)TRACE_ENTRY_SIZE); + pr_info(" actual entries %ld\n", global_trace.entries); + + tracer_init_debugfs(); + + trace_init_cmdlines(); + + register_tracer(&no_tracer); + current_trace = &no_tracer; + + /* All seems OK, enable tracing */ + global_trace.ctrl = tracer_enabled; + tracing_disabled = 0; + + return 0; + + free_buffers: + for (i-- ; i >= 0; i--) { + struct page *page, *tmp; + struct trace_array_cpu *data = global_trace.data[i]; + + if (data) { + list_for_each_entry_safe(page, tmp, + &data->trace_pages, lru) { + list_del_init(&page->lru); + __free_page(page); + } + } + +#ifdef CONFIG_TRACER_MAX_TRACE + data = max_tr.data[i]; + if (data) { + list_for_each_entry_safe(page, tmp, + &data->trace_pages, lru) { + list_del_init(&page->lru); + __free_page(page); + } + } +#endif + } + return ret; +} +fs_initcall(tracer_alloc_buffers); Index: linux-2.6.25.8-rt7/kernel/trace/trace.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/trace/trace.h 2008-06-23 19:14:35.000000000 -0400 @@ -0,0 +1,480 @@ +#ifndef _LINUX_KERNEL_TRACE_H +#define _LINUX_KERNEL_TRACE_H + +#include +#include +#include +#include +#include + +enum trace_type { + __TRACE_FIRST_TYPE = 0, + + TRACE_FN, + TRACE_CTX, + TRACE_WAKE, + TRACE_STACK, + TRACE_SPECIAL, + TRACE_MMIO_RW, + TRACE_MMIO_MAP, + TRACE_IRQ, + TRACE_FAULT, + TRACE_TIMER_SET, + TRACE_TIMER_TRIG, + TRACE_TIMESTAMP, + TRACE_PROGRAM_EVENT, + TRACE_TASK_ACT, + TRACE_TASK_DEACT, + TRACE_SYSCALL, + TRACE_SYSRET, + + __TRACE_LAST_TYPE +}; + +/* + * Function trace entry - function address and parent function addres: + */ +struct ftrace_entry { + unsigned long ip; + unsigned long parent_ip; +}; + +/* + * Context switch trace entry - which task (and prio) we switched from/to: + */ +struct ctx_switch_entry { + unsigned int prev_pid; + unsigned char prev_prio; + unsigned char prev_state; + unsigned int next_pid; + unsigned char next_prio; + unsigned char next_state; +}; + +/* + * Special (free-form) trace entry: + */ +struct special_entry { + unsigned long arg1; + unsigned long arg2; + unsigned long arg3; +}; + +struct irq_entry { + unsigned long ip; + unsigned long ret_ip; + unsigned irq; + unsigned usermode; +}; + +struct fault_entry { + unsigned long ip; + unsigned long ret_ip; + unsigned long errorcode; + unsigned long address; +}; + +struct timer_entry { + unsigned long ip; + ktime_t expire; + void *timer; +}; + +struct 
program_entry { + unsigned long ip; + ktime_t expire; + int64_t delta; +}; + +struct timestamp_entry { + unsigned long ip; + ktime_t now; +}; + +struct task_entry { + unsigned long ip; + pid_t pid; + unsigned prio; + int cpu; +}; + +struct wakeup_entry { + unsigned long ip; + pid_t pid; + unsigned prio; + unsigned curr_prio; +}; + +struct syscall_entry { + unsigned long ip; + unsigned long nr; + unsigned long p1; + unsigned long p2; + unsigned long p3; +}; + +struct sysret_entry { + unsigned long ip; + unsigned long ret; +}; + +/* + * Stack-trace entry: + */ + +#define FTRACE_STACK_ENTRIES 8 + +struct stack_entry { + unsigned long caller[FTRACE_STACK_ENTRIES]; +}; + +/* + * The trace entry - the most basic unit of tracing. This is what + * is printed in the end as a single line in the trace output, such as: + * + * bash-15816 [01] 235.197585: idle_cpu <- irq_enter + */ +struct trace_entry { + char type; + char cpu; + char flags; + char preempt_count; + int pid; + cycle_t t; + union { + struct ftrace_entry fn; + struct ctx_switch_entry ctx; + struct special_entry special; + struct stack_entry stack; + struct mmiotrace_rw mmiorw; + struct mmiotrace_map mmiomap; + struct irq_entry irq; + struct fault_entry fault; + struct timer_entry timer; + struct timestamp_entry timestamp; + struct program_entry program; + struct task_entry task; + struct wakeup_entry wakeup; + struct syscall_entry syscall; + struct sysret_entry sysret; + }; +}; + +#define TRACE_ENTRY_SIZE sizeof(struct trace_entry) + +/* + * The CPU trace array - it consists of thousands of trace entries + * plus some other descriptor data: (for example which task started + * the trace, etc.) + */ +struct trace_array_cpu { + struct list_head trace_pages; + atomic_t disabled; + __raw_spinlock_t lock; + struct lock_class_key lock_key; + + /* these fields get copied into max-trace: */ + unsigned trace_head_idx; + unsigned trace_tail_idx; + void *trace_head; /* producer */ + void *trace_tail; /* consumer */ + unsigned long trace_idx; + unsigned long overrun; + unsigned long saved_latency; + unsigned long critical_start; + unsigned long critical_end; + unsigned long critical_sequence; + unsigned long nice; + unsigned long policy; + unsigned long rt_priority; + cycle_t preempt_timestamp; + pid_t pid; + uid_t uid; + char comm[TASK_COMM_LEN]; +}; + +struct trace_iterator; + +/* + * The trace array - an array of per-CPU trace arrays. This is the + * highest level data structure that individual tracers deal with. 
+ * They have on/off state as well: + */ +struct trace_array { + unsigned long entries; + long ctrl; + int cpu; + cycle_t time_start; + struct task_struct *waiter; + struct trace_array_cpu *data[NR_CPUS]; +}; + +/* + * A specific tracer, represented by methods that operate on a trace array: + */ +struct tracer { + const char *name; + void (*init)(struct trace_array *tr); + void (*reset)(struct trace_array *tr); + void (*open)(struct trace_iterator *iter); + void (*pipe_open)(struct trace_iterator *iter); + void (*close)(struct trace_iterator *iter); + void (*start)(struct trace_iterator *iter); + void (*stop)(struct trace_iterator *iter); + ssize_t (*read)(struct trace_iterator *iter, + struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); + void (*ctrl_update)(struct trace_array *tr); +#ifdef CONFIG_FTRACE_STARTUP_TEST + int (*selftest)(struct tracer *trace, + struct trace_array *tr); +#endif + int (*print_line)(struct trace_iterator *iter); + struct tracer *next; + int print_max; +}; + +struct trace_seq { + unsigned char buffer[PAGE_SIZE]; + unsigned int len; + unsigned int readpos; +}; + +/* + * Trace iterator - used by printout routines who present trace + * results to users and which routines might sleep, etc: + */ +struct trace_iterator { + struct trace_array *tr; + struct tracer *trace; + void *private; + long last_overrun[NR_CPUS]; + long overrun[NR_CPUS]; + + /* The below is zeroed out in pipe_read */ + struct trace_seq seq; + struct trace_entry *ent; + int cpu; + + struct trace_entry *prev_ent; + int prev_cpu; + + unsigned long iter_flags; + loff_t pos; + unsigned long next_idx[NR_CPUS]; + struct list_head *next_page[NR_CPUS]; + unsigned next_page_idx[NR_CPUS]; + long idx; +}; + +void tracing_reset(struct trace_array_cpu *data); +int tracing_open_generic(struct inode *inode, struct file *filp); +struct dentry *tracing_init_dentry(void); +void init_tracer_sysprof_debugfs(struct dentry *d_tracer); + +void ftrace(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags); +void tracing_sched_switch_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags); +void tracing_record_cmdline(struct task_struct *tsk); + +void tracing_sched_wakeup_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *wakee, + struct task_struct *cur, + unsigned long flags); +void trace_special(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3); +void trace_function(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags); +void tracing_event_irq(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + int irq, int usermode, + unsigned long retip); +void tracing_event_fault(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + unsigned long retip, + unsigned long error_code, + unsigned long address); +void tracing_event_timer_set(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + ktime_t *expires, void *timer); +void tracing_event_timer_triggered(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + ktime_t *expired, void *timer); +void tracing_event_timestamp(struct trace_array *tr, + struct 
trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + ktime_t *now); +void tracing_event_task_activate(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + struct task_struct *p, + int cpu); +void tracing_event_task_deactivate(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + struct task_struct *p, + int cpu); +void tracing_event_program_event(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + ktime_t *expires, int64_t *delta); +void tracing_event_wakeup(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + pid_t pid, int prio, + int curr_prio); +void tracing_event_syscall(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + unsigned long nr, + unsigned long p1, + unsigned long p2, + unsigned long p3); +void tracing_event_sysret(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + unsigned long ret); + +void tracing_start_cmdline_record(void); +void tracing_stop_cmdline_record(void); +int register_tracer(struct tracer *type); +void unregister_tracer(struct tracer *type); + +extern unsigned long nsecs_to_usecs(unsigned long nsecs); + +extern unsigned long tracing_max_latency; +extern unsigned long tracing_thresh; + +void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); +void update_max_tr_single(struct trace_array *tr, + struct task_struct *tsk, int cpu); + +extern cycle_t ftrace_now(int cpu); + +#ifdef CONFIG_FTRACE +void tracing_start_function_trace(void); +void tracing_stop_function_trace(void); +#else +# define tracing_start_function_trace() do { } while (0) +# define tracing_stop_function_trace() do { } while (0) +#endif + +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +typedef void +(*tracer_switch_func_t)(void *private, + void *__rq, + struct task_struct *prev, + struct task_struct *next); + +struct tracer_switch_ops { + tracer_switch_func_t func; + void *private; + struct tracer_switch_ops *next; +}; + +#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ + +#ifdef CONFIG_DYNAMIC_FTRACE +extern unsigned long ftrace_update_tot_cnt; +#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func +extern int DYN_FTRACE_TEST_NAME(void); +#endif + +#ifdef CONFIG_MMIOTRACE +extern void __trace_mmiotrace_rw(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_rw *rw); +extern void __trace_mmiotrace_map(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_map *map); +#endif + +#ifdef CONFIG_FTRACE_STARTUP_TEST +#ifdef CONFIG_FTRACE +extern int trace_selftest_startup_function(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_IRQSOFF_TRACER +extern int trace_selftest_startup_irqsoff(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_PREEMPT_TRACER +extern int trace_selftest_startup_preemptoff(struct tracer *trace, + struct trace_array *tr); +#endif +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) +extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_SCHED_TRACER +extern int trace_selftest_startup_wakeup(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +extern int trace_selftest_startup_sched_switch(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef 
CONFIG_SYSPROF_TRACER +extern int trace_selftest_startup_sysprof(struct tracer *trace, + struct trace_array *tr); +#endif +#endif /* CONFIG_FTRACE_STARTUP_TEST */ + +extern void *head_page(struct trace_array_cpu *data); +extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, + size_t cnt); +extern long ns2usecs(cycle_t nsec); + +extern unsigned long trace_flags; + +/* + * trace_iterator_flags is an enumeration that defines bit + * positions into trace_flags that controls the output. + * + * NOTE: These bits must match the trace_options array in + * trace.c. + */ +enum trace_iterator_flags { + TRACE_ITER_PRINT_PARENT = 0x01, + TRACE_ITER_SYM_OFFSET = 0x02, + TRACE_ITER_SYM_ADDR = 0x04, + TRACE_ITER_VERBOSE = 0x08, + TRACE_ITER_RAW = 0x10, + TRACE_ITER_HEX = 0x20, + TRACE_ITER_BIN = 0x40, + TRACE_ITER_BLOCK = 0x80, + TRACE_ITER_STACKTRACE = 0x100, + TRACE_ITER_SCHED_TREE = 0x200, +}; + +#endif /* _LINUX_KERNEL_TRACE_H */ Index: linux-2.6.25.8-rt7/kernel/trace/trace_functions.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/trace/trace_functions.c 2008-06-23 19:13:04.000000000 -0400 @@ -0,0 +1,78 @@ +/* + * ring buffer based function tracer + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include + +#include "trace.h" + +static void function_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} + +static void start_function_trace(struct trace_array *tr) +{ + function_reset(tr); + tracing_start_cmdline_record(); + tracing_start_function_trace(); +} + +static void stop_function_trace(struct trace_array *tr) +{ + tracing_stop_function_trace(); + tracing_stop_cmdline_record(); +} + +static void function_trace_init(struct trace_array *tr) +{ + if (tr->ctrl) + start_function_trace(tr); +} + +static void function_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_function_trace(tr); +} + +static void function_trace_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_function_trace(tr); + else + stop_function_trace(tr); +} + +static struct tracer function_trace __read_mostly = +{ + .name = "ftrace", + .init = function_trace_init, + .reset = function_trace_reset, + .ctrl_update = function_trace_ctrl_update, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_function, +#endif +}; + +static __init int init_function_trace(void) +{ + return register_tracer(&function_trace); +} + +device_initcall(init_function_trace); Index: linux-2.6.25.8-rt7/kernel/trace/trace_irqsoff.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/trace/trace_irqsoff.c 2008-06-23 19:14:36.000000000 -0400 @@ -0,0 +1,508 @@ +/* + * trace irqs off criticall timings + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * From code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include + +#include "trace.h" +#include "trace_hist.h" + +static struct 
trace_array *irqsoff_trace __read_mostly; +static int tracer_enabled __read_mostly; + +static DEFINE_PER_CPU(int, tracing_cpu); + +static DEFINE_RAW_SPINLOCK(max_trace_lock); + +enum { + TRACER_IRQS_OFF = (1 << 1), + TRACER_PREEMPT_OFF = (1 << 2), +}; + +static int trace_type __read_mostly; + +#ifdef CONFIG_PREEMPT_TRACER +static inline int +preempt_trace(void) +{ + return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count()); +} +#else +# define preempt_trace() (0) +#endif + +#ifdef CONFIG_IRQSOFF_TRACER +static inline int +irq_trace(void) +{ + return ((trace_type & TRACER_IRQS_OFF) && + irqs_disabled()); +} +#else +# define irq_trace() (0) +#endif + +/* + * Sequence count - we record it when starting a measurement and + * skip the latency if the sequence has changed - some other section + * did a maximum and could disturb our measurement with serial console + * printouts, etc. Truly coinciding maximum latencies should be rare + * and what happens together happens separately as well, so this doesnt + * decrease the validity of the maximum found: + */ +static __cacheline_aligned_in_smp unsigned long max_sequence; + +#ifdef CONFIG_FTRACE +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (unlikely(!ftrace_enabled)) + return; + + /* + * Does not matter if we preempt. We test the flags + * afterward, to see if irqs are disabled or not. + * If we preempt and get a false positive, the flags + * test will fail. + */ + cpu = raw_smp_processor_id(); + if (likely(!per_cpu(tracing_cpu, cpu))) + return; + + local_save_flags(flags); + /* slight chance to get a false positive on tracing_cpu */ + if (!irqs_disabled_flags(flags)) + return; + + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + trace_function(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = irqsoff_tracer_call, +}; +#endif /* CONFIG_FTRACE */ + +/* + * Should this new latency be reported/recorded? 
+ */ +static int report_latency(cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tracing_max_latency) + return 0; + } + return 1; +} + +static void +check_critical_timing(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long parent_ip, + int cpu) +{ + unsigned long latency, t0, t1; + cycle_t T0, T1, delta; + unsigned long flags; + + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = data->preempt_timestamp; + T1 = ftrace_now(cpu); + delta = T1-T0; + + local_save_flags(flags); + + if (!report_latency(delta)) + goto out; + + spin_lock_irqsave(&max_trace_lock, flags); + + /* check if we are still the max latency */ + if (!report_latency(delta)) + goto out_unlock; + + trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); + + latency = nsecs_to_usecs(delta); + + if (data->critical_sequence != max_sequence) + goto out_unlock; + + tracing_max_latency = delta; + t0 = nsecs_to_usecs(T0); + t1 = nsecs_to_usecs(T1); + + data->critical_end = parent_ip; + + update_max_tr_single(tr, current, cpu); + + max_sequence++; + +out_unlock: + spin_unlock_irqrestore(&max_trace_lock, flags); + +out: + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); + tracing_reset(data); + trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); +} + +static inline void +start_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (likely(!tracer_enabled)) + return; + + cpu = raw_smp_processor_id(); + + if (per_cpu(tracing_cpu, cpu)) + return; + + data = tr->data[cpu]; + + if (unlikely(!data) || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); + data->critical_start = parent_ip ? : ip; + tracing_reset(data); + + local_save_flags(flags); + + trace_function(tr, data, ip, parent_ip, flags); + + per_cpu(tracing_cpu, cpu) = 1; + + atomic_dec(&data->disabled); +} + +static inline void +stop_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + cpu = raw_smp_processor_id(); + /* Always clear the tracing cpu on stopping the trace */ + if (unlikely(per_cpu(tracing_cpu, cpu))) + per_cpu(tracing_cpu, cpu) = 0; + else + return; + + if (!tracer_enabled) + return; + + data = tr->data[cpu]; + + if (unlikely(!data) || unlikely(!head_page(data)) || + !data->critical_start || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + + local_save_flags(flags); + trace_function(tr, data, ip, parent_ip, flags); + check_critical_timing(tr, data, parent_ip ? 
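/*
 * Condensed view of one measurement made by the two helpers above
 * (a sketch for orientation only, not additional code in this patch):
 *
 *     irqs disabled   -> start_critical_timing()
 *                          records data->preempt_timestamp,
 *                          sets per-cpu tracing_cpu = 1
 *     ... every traced function is logged by irqsoff_tracer_call() ...
 *     irqs re-enabled -> stop_critical_timing()
 *                          clears tracing_cpu, then calls
 *                          check_critical_timing(): delta = now - T0,
 *                          kept only if report_latency(delta) says so
 *
 * The per-cpu tracing_cpu flag keeps at most one measurement in flight
 * per CPU: a start while one is already running is ignored, and the
 * first matching stop ends it.
 */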
: ip, cpu); + data->critical_start = 0; + atomic_dec(&data->disabled); +} + +/* start and stop critical timings used to for stoppage (in idle) */ +void start_critical_timings(void) +{ + if (preempt_trace() || irq_trace()) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + + tracing_hist_preempt_start(); +} + +void stop_critical_timings(void) +{ + tracing_hist_preempt_stop(TRACE_STOP); + + if (preempt_trace() || irq_trace()) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +#ifdef CONFIG_IRQSOFF_TRACER +#ifdef CONFIG_PROVE_LOCKING +void time_hardirqs_on(unsigned long a0, unsigned long a1) +{ + tracing_hist_preempt_stop(1); + + if (!preempt_trace() && irq_trace()) + stop_critical_timing(a0, a1); +} + +void time_hardirqs_off(unsigned long a0, unsigned long a1) +{ + if (!preempt_trace() && irq_trace()) + start_critical_timing(a0, a1); + + tracing_hist_preempt_start(); +} + +#else /* !CONFIG_PROVE_LOCKING */ + +/* + * Stubs: + */ + +void early_boot_irqs_off(void) +{ +} + +void early_boot_irqs_on(void) +{ +} + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} + +/* + * We are only interested in hardirq on/off events: + */ +void trace_hardirqs_on(void) +{ + tracing_hist_preempt_stop(1); + + if (!preempt_trace() && irq_trace()) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void trace_hardirqs_off(void) +{ + if (!preempt_trace() && irq_trace()) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + + tracing_hist_preempt_start(); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +void trace_hardirqs_on_caller(unsigned long caller_addr) +{ + tracing_hist_preempt_stop(1); + + if (!preempt_trace() && irq_trace()) + stop_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +void trace_hardirqs_off_caller(unsigned long caller_addr) +{ + if (!preempt_trace() && irq_trace()) + start_critical_timing(CALLER_ADDR0, caller_addr); + + tracing_hist_preempt_start(); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); + +#endif /* CONFIG_PROVE_LOCKING */ +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +void trace_preempt_on(unsigned long a0, unsigned long a1) +{ + tracing_hist_preempt_stop(0); + stop_critical_timing(a0, a1); +} + +void trace_preempt_off(unsigned long a0, unsigned long a1) +{ + start_critical_timing(a0, a1); + tracing_hist_preempt_start(); +} +#endif /* CONFIG_PREEMPT_TRACER */ + +static void start_irqsoff_tracer(struct trace_array *tr) +{ + register_ftrace_function(&trace_ops); + tracer_enabled = 1; +} + +static void stop_irqsoff_tracer(struct trace_array *tr) +{ + tracer_enabled = 0; + unregister_ftrace_function(&trace_ops); +} + +static void __irqsoff_tracer_init(struct trace_array *tr) +{ + irqsoff_trace = tr; + /* make sure that the tracer is visible */ + smp_wmb(); + + if (tr->ctrl) + start_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_irqsoff_tracer(tr); + else + stop_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_open(struct trace_iterator *iter) +{ + /* stop the trace while dumping */ + if (iter->tr->ctrl) + stop_irqsoff_tracer(iter->tr); +} + +static void irqsoff_tracer_close(struct trace_iterator *iter) +{ + if (iter->tr->ctrl) + start_irqsoff_tracer(iter->tr); +} + +#ifdef 
CONFIG_IRQSOFF_TRACER +static void irqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF; + + __irqsoff_tracer_init(tr); +} +static struct tracer irqsoff_tracer __read_mostly = +{ + .name = "irqsoff", + .init = irqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_irqsoff, +#endif +}; +# define register_irqsoff(trace) register_tracer(&trace) +#else +# define register_irqsoff(trace) do { } while (0) +#endif + +#ifdef CONFIG_PREEMPT_TRACER +static void preemptoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptoff_tracer __read_mostly = +{ + .name = "preemptoff", + .init = preemptoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_preemptoff, +#endif +}; +# define register_preemptoff(trace) register_tracer(&trace) +#else +# define register_preemptoff(trace) do { } while (0) +#endif + +#if defined(CONFIG_IRQSOFF_TRACER) && \ + defined(CONFIG_PREEMPT_TRACER) + +static void preemptirqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptirqsoff_tracer __read_mostly = +{ + .name = "preemptirqsoff", + .init = preemptirqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_preemptirqsoff, +#endif +}; + +# define register_preemptirqsoff(trace) register_tracer(&trace) +#else +# define register_preemptirqsoff(trace) do { } while (0) +#endif + +__init static int init_irqsoff_tracer(void) +{ + register_irqsoff(irqsoff_tracer); + register_preemptoff(preemptoff_tracer); + register_preemptirqsoff(preemptirqsoff_tracer); + + return 0; +} +device_initcall(init_irqsoff_tracer); Index: linux-2.6.25.8-rt7/kernel/trace/trace_mmiotrace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/trace/trace_mmiotrace.c 2008-06-23 19:13:04.000000000 -0400 @@ -0,0 +1,295 @@ +/* + * Memory mapped I/O tracing + * + * Copyright (C) 2008 Pekka Paalanen + */ + +#define DEBUG 1 + +#include +#include +#include + +#include "trace.h" + +struct header_iter { + struct pci_dev *dev; +}; + +static struct trace_array *mmio_trace_array; +static bool overrun_detected; + +static void mmio_reset_data(struct trace_array *tr) +{ + int cpu; + + overrun_detected = false; + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} + +static void mmio_trace_init(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + mmio_trace_array = tr; + if (tr->ctrl) { + mmio_reset_data(tr); + enable_mmiotrace(); + } +} + +static void mmio_trace_reset(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + if (tr->ctrl) + disable_mmiotrace(); + mmio_reset_data(tr); + mmio_trace_array = NULL; +} + +static void mmio_trace_ctrl_update(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + if (tr->ctrl) { + 
mmio_reset_data(tr); + enable_mmiotrace(); + } else { + disable_mmiotrace(); + } +} + +static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) +{ + int ret = 0; + int i; + resource_size_t start, end; + const struct pci_driver *drv = pci_dev_driver(dev); + + /* XXX: incomplete checks for trace_seq_printf() return value */ + ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", + dev->bus->number, dev->devfn, + dev->vendor, dev->device, dev->irq); + /* + * XXX: is pci_resource_to_user() appropriate, since we are + * supposed to interpret the __ioremap() phys_addr argument based on + * these printed values? + */ + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + (unsigned long long)(start | + (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); + } + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + dev->resource[i].start < dev->resource[i].end ? + (unsigned long long)(end - start) + 1 : 0); + } + if (drv) + ret += trace_seq_printf(s, " %s\n", drv->name); + else + ret += trace_seq_printf(s, " \n"); + return ret; +} + +static void destroy_header_iter(struct header_iter *hiter) +{ + if (!hiter) + return; + pci_dev_put(hiter->dev); + kfree(hiter); +} + +static void mmio_pipe_open(struct trace_iterator *iter) +{ + struct header_iter *hiter; + struct trace_seq *s = &iter->seq; + + trace_seq_printf(s, "VERSION 20070824\n"); + + hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); + if (!hiter) + return; + + hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL); + iter->private = hiter; +} + +/* XXX: This is not called when the pipe is closed! */ +static void mmio_close(struct trace_iterator *iter) +{ + struct header_iter *hiter = iter->private; + destroy_header_iter(hiter); + iter->private = NULL; +} + +static unsigned long count_overruns(struct trace_iterator *iter) +{ + int cpu; + unsigned long cnt = 0; + for_each_online_cpu(cpu) { + cnt += iter->overrun[cpu]; + iter->overrun[cpu] = 0; + } + return cnt; +} + +static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, + char __user *ubuf, size_t cnt, loff_t *ppos) +{ + ssize_t ret; + struct header_iter *hiter = iter->private; + struct trace_seq *s = &iter->seq; + unsigned long n; + + n = count_overruns(iter); + if (n) { + /* XXX: This is later than where events were lost. */ + trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n); + if (!overrun_detected) + pr_warning("mmiotrace has lost events.\n"); + overrun_detected = true; + goto print_out; + } + + if (!hiter) + return 0; + + mmio_print_pcidev(s, hiter->dev); + hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev); + + if (!hiter->dev) { + destroy_header_iter(hiter); + iter->private = NULL; + } + +print_out: + ret = trace_seq_to_user(s, ubuf, cnt); + return (ret == -EBUSY) ? 
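/*
 * For reference, the text stream assembled by mmio_pipe_open() and
 * mmio_read() above, plus the record printers below, looks roughly
 * like this (all field values here are illustrative only):
 *
 *     VERSION 20070824
 *     PCIDEV 0300 10de0191 b fd000000 ... 1000000 ... nvidia
 *     MARK 0.000000 Lost 42 events.
 *     MAP 1.000001 1 0xfd000000 0xf8a00000 0x1000 0x0 0
 *     W 4 1.000123 1 0xfd000100 0x1 0xc01234ab 0
 *     R 4 1.000150 1 0xfd000100 0x1 0xc01234cd 0
 *     UNMAP 2.000000 1 0x0 0
 */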
0 : ret; +} + +static int mmio_print_rw(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct mmiotrace_rw *rw = &entry->mmiorw; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(entry->t); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret = 1; + + switch (entry->mmiorw.opcode) { + case MMIO_READ: + ret = trace_seq_printf(s, + "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + rw->value, rw->pc, 0); + break; + case MMIO_WRITE: + ret = trace_seq_printf(s, + "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + rw->value, rw->pc, 0); + break; + case MMIO_UNKNOWN_OP: + ret = trace_seq_printf(s, + "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n", + secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, + (rw->value >> 0) & 0xff, rw->pc, 0); + break; + default: + ret = trace_seq_printf(s, "rw what?\n"); + break; + } + if (ret) + return 1; + return 0; +} + +static int mmio_print_map(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct mmiotrace_map *m = &entry->mmiomap; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(entry->t); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret = 1; + + switch (entry->mmiorw.opcode) { + case MMIO_PROBE: + ret = trace_seq_printf(s, + "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", + secs, usec_rem, m->map_id, + (unsigned long long)m->phys, m->virt, m->len, + 0UL, 0); + break; + case MMIO_UNPROBE: + ret = trace_seq_printf(s, + "UNMAP %lu.%06lu %d 0x%lx %d\n", + secs, usec_rem, m->map_id, 0UL, 0); + break; + default: + ret = trace_seq_printf(s, "map what?\n"); + break; + } + if (ret) + return 1; + return 0; +} + +/* return 0 to abort printing without consuming current entry in pipe mode */ +static int mmio_print_line(struct trace_iterator *iter) +{ + switch (iter->ent->type) { + case TRACE_MMIO_RW: + return mmio_print_rw(iter); + case TRACE_MMIO_MAP: + return mmio_print_map(iter); + default: + return 1; /* ignore unknown entries */ + } +} + +static struct tracer mmio_tracer __read_mostly = +{ + .name = "mmiotrace", + .init = mmio_trace_init, + .reset = mmio_trace_reset, + .pipe_open = mmio_pipe_open, + .close = mmio_close, + .read = mmio_read, + .ctrl_update = mmio_trace_ctrl_update, + .print_line = mmio_print_line, +}; + +__init static int init_mmio_trace(void) +{ + return register_tracer(&mmio_tracer); +} +device_initcall(init_mmio_trace); + +void mmio_trace_rw(struct mmiotrace_rw *rw) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data = tr->data[smp_processor_id()]; + __trace_mmiotrace_rw(tr, data, rw); +} + +void mmio_trace_mapping(struct mmiotrace_map *map) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data; + + preempt_disable(); + data = tr->data[smp_processor_id()]; + __trace_mmiotrace_map(tr, data, map); + preempt_enable(); +} Index: linux-2.6.25.8-rt7/kernel/trace/trace_sched_switch.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/trace/trace_sched_switch.c 2008-06-23 19:13:04.000000000 -0400 @@ -0,0 +1,196 @@ +/* + * trace context switch + * + * Copyright (C) 2007 Steven Rostedt + * + */ +#include +#include 
+#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *ctx_trace; +static int __read_mostly tracer_enabled; +static atomic_t sched_ref; + +static void +sched_switch_func(void *private, void *__rq, struct task_struct *prev, + struct task_struct *next) +{ + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + tracing_record_cmdline(prev); + tracing_record_cmdline(next); + + if (!tracer_enabled) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + tracing_sched_switch_trace(tr, data, prev, next, flags); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +static notrace void +sched_switch_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *prev; + struct task_struct *next; + struct rq *__rq; + + if (!atomic_read(&sched_ref)) + return; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, int); + (void)va_arg(*args, long); + __rq = va_arg(*args, typeof(__rq)); + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + /* + * If tracer_switch_func only points to the local + * switch func, it still needs the ptr passed to it. + */ + sched_switch_func(probe_data, __rq, prev, next); +} + +static void sched_switch_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} + +static int tracing_sched_register(void) +{ + int ret; + + ret = marker_probe_register("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + sched_switch_callback, + &ctx_trace); + if (ret) + pr_info("sched trace: Couldn't add marker" + " probe to kernel_sched_schedule\n"); + + return ret; +} + +static void tracing_sched_unregister(void) +{ + marker_probe_unregister("kernel_sched_schedule", + sched_switch_callback, + &ctx_trace); +} + +void tracing_start_sched_switch(void) +{ + long ref; + + ref = atomic_inc_return(&sched_ref); + if (ref == 1) + tracing_sched_register(); +} + +void tracing_stop_sched_switch(void) +{ + long ref; + + ref = atomic_dec_and_test(&sched_ref); + if (ref) + tracing_sched_unregister(); +} + +void tracing_start_cmdline_record(void) +{ + tracing_start_sched_switch(); +} + +void tracing_stop_cmdline_record(void) +{ + tracing_stop_sched_switch(); +} + +static void start_sched_trace(struct trace_array *tr) +{ + sched_switch_reset(tr); + tracer_enabled = 1; + tracing_start_cmdline_record(); +} + +static void stop_sched_trace(struct trace_array *tr) +{ + tracing_stop_cmdline_record(); + tracer_enabled = 0; +} + +static void sched_switch_trace_init(struct trace_array *tr) +{ + ctx_trace = tr; + + if (tr->ctrl) + start_sched_trace(tr); +} + +static void sched_switch_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_sched_trace(tr); +} + +static void sched_switch_trace_ctrl_update(struct trace_array *tr) +{ + /* When starting a new trace, reset the buffers */ + if (tr->ctrl) + start_sched_trace(tr); + else + stop_sched_trace(tr); +} + +static struct tracer sched_switch_trace __read_mostly = +{ + .name = "sched_switch", + .init = sched_switch_trace_init, + .reset = sched_switch_trace_reset, + .ctrl_update = sched_switch_trace_ctrl_update, +#ifdef 
CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_sched_switch, +#endif +}; + +__init static int init_sched_switch_trace(void) +{ + int ret = 0; + + if (atomic_read(&sched_ref)) + ret = tracing_sched_register(); + if (ret) { + pr_info("error registering scheduler trace\n"); + return ret; + } + return register_tracer(&sched_switch_trace); +} +device_initcall(init_sched_switch_trace); Index: linux-2.6.25.8-rt7/kernel/trace/trace_sched_wakeup.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/trace/trace_sched_wakeup.c 2008-06-23 19:14:36.000000000 -0400 @@ -0,0 +1,452 @@ +/* + * trace task wakeup timings + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *wakeup_trace; +static int __read_mostly tracer_enabled; + +static struct task_struct *wakeup_task; +static int wakeup_cpu; +static unsigned wakeup_prio = -1; + +static __raw_spinlock_t wakeup_lock = + (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +static void __wakeup_reset(struct trace_array *tr); + +#ifdef CONFIG_FTRACE +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int resched; + int cpu; + + if (likely(!wakeup_task) || !ftrace_enabled) + return; + + resched = need_resched(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (unlikely(disabled != 1)) + goto out; + + raw_local_irq_save(flags); + __raw_spin_lock(&wakeup_lock); + + if (unlikely(!wakeup_task)) + goto unlock; + + /* + * The task can't disappear because it needs to + * wake up first, and we have the wakeup_lock. + */ + if (task_cpu(wakeup_task) != cpu) + goto unlock; + + trace_function(tr, data, ip, parent_ip, flags); + + unlock: + __raw_spin_unlock(&wakeup_lock); + raw_local_irq_restore(flags); + + out: + atomic_dec(&data->disabled); + + /* + * To prevent recursion from the scheduler, if the + * resched flag was set before we entered, then + * don't reschedule. + */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = wakeup_tracer_call, +}; +#endif /* CONFIG_FTRACE */ + +/* + * Should this new latency be reported/recorded? + */ +static int report_latency(cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tracing_max_latency) + return 0; + } + return 1; +} + +static void notrace +wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, + struct task_struct *next) +{ + unsigned long latency = 0, t0 = 0, t1 = 0; + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; + struct trace_array_cpu *data; + cycle_t T0, T1, delta; + unsigned long flags; + long disabled; + int cpu; + + if (unlikely(!tracer_enabled)) + return; + + /* + * When we start a new trace, we set wakeup_task to NULL + * and then set tracer_enabled = 1. 
We want to make sure + * that another CPU does not see the tracer_enabled = 1 + * and the wakeup_task with an older task, that might + * actually be the same as next. + */ + smp_rmb(); + + if (next != wakeup_task) + return; + + /* The task we are waiting for is waking up */ + data = tr->data[wakeup_cpu]; + + /* disable local data, not wakeup_cpu data */ + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&tr->data[cpu]->disabled); + if (likely(disabled != 1)) + goto out; + + local_irq_save(flags); + __raw_spin_lock(&wakeup_lock); + + /* We could race with grabbing wakeup_lock */ + if (unlikely(!tracer_enabled || next != wakeup_task)) + goto out_unlock; + + trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags); + + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = data->preempt_timestamp; + T1 = ftrace_now(cpu); + delta = T1-T0; + + if (!report_latency(delta)) + goto out_unlock; + + latency = nsecs_to_usecs(delta); + + tracing_max_latency = delta; + t0 = nsecs_to_usecs(T0); + t1 = nsecs_to_usecs(T1); + + update_max_tr(tr, wakeup_task, wakeup_cpu); + +out_unlock: + __wakeup_reset(tr); + __raw_spin_unlock(&wakeup_lock); + local_irq_restore(flags); +out: + atomic_dec(&tr->data[cpu]->disabled); +} + +static notrace void +sched_switch_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *prev; + struct task_struct *next; + struct rq *__rq; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, int); + (void)va_arg(*args, long); + __rq = va_arg(*args, typeof(__rq)); + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + tracing_record_cmdline(prev); + + /* + * If tracer_switch_func only points to the local + * switch func, it still needs the ptr passed to it. + */ + wakeup_sched_switch(probe_data, __rq, prev, next); +} + +static void __wakeup_reset(struct trace_array *tr) +{ + struct trace_array_cpu *data; + int cpu; + + for_each_possible_cpu(cpu) { + data = tr->data[cpu]; + tracing_reset(data); + } + + wakeup_cpu = -1; + wakeup_prio = -1; + + if (wakeup_task) + put_task_struct(wakeup_task); + + wakeup_task = NULL; +} + +static void wakeup_reset(struct trace_array *tr) +{ + unsigned long flags; + + local_irq_save(flags); + __raw_spin_lock(&wakeup_lock); + __wakeup_reset(tr); + __raw_spin_unlock(&wakeup_lock); + local_irq_restore(flags); +} + +static void +wakeup_check_start(struct trace_array *tr, struct task_struct *p, + struct task_struct *curr) +{ + int cpu = smp_processor_id(); + unsigned long flags; + long disabled; + + if (likely(!rt_task(p)) || + p->prio >= wakeup_prio || + p->prio >= curr->prio) + return; + + disabled = atomic_inc_return(&tr->data[cpu]->disabled); + if (unlikely(disabled != 1)) + goto out; + + /* interrupts should be off from try_to_wake_up */ + __raw_spin_lock(&wakeup_lock); + + /* check for races. 
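 * (The marker plumbing used by the callbacks in this file and in
 * trace_sched_switch.c follows one pattern: register a probe against a
 * named marker with the exact format string the marker was declared
 * with, then pull the arguments back out of the va_list in that same
 * order.  A minimal sketch -- my_probe and its empty body are only an
 * illustration, not part of this patch:
 *
 *     static notrace void my_probe(void *probe_data, void *call_data,
 *                                  const char *format, va_list *args)
 *     {
 *             int pid = va_arg(*args, int);
 *             // further va_arg() calls must match the format string
 *     }
 *
 *     ret = marker_probe_register("kernel_sched_wakeup",
 *                     "pid %d state %ld ## rq %p task %p rq->curr %p",
 *                     my_probe, NULL);
 * )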
*/ + if (!tracer_enabled || p->prio >= wakeup_prio) + goto out_locked; + + /* reset the trace */ + __wakeup_reset(tr); + + wakeup_cpu = task_cpu(p); + wakeup_prio = p->prio; + + wakeup_task = p; + get_task_struct(wakeup_task); + + local_save_flags(flags); + + tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); + trace_function(tr, tr->data[wakeup_cpu], + CALLER_ADDR1, CALLER_ADDR2, flags); + +out_locked: + __raw_spin_unlock(&wakeup_lock); +out: + atomic_dec(&tr->data[cpu]->disabled); +} + +static notrace void +wake_up_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array **ptr = probe_data; + struct trace_array *tr = *ptr; + struct task_struct *curr; + struct task_struct *task; + struct rq *__rq; + + if (likely(!tracer_enabled)) + return; + + /* Skip pid %d state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, long); + /* now get the meat: "rq %p task %p rq->curr %p" */ + __rq = va_arg(*args, typeof(__rq)); + task = va_arg(*args, typeof(task)); + curr = va_arg(*args, typeof(curr)); + + tracing_record_cmdline(task); + tracing_record_cmdline(curr); + + wakeup_check_start(tr, task, curr); +} + +static void start_wakeup_tracer(struct trace_array *tr) +{ + int ret; + + ret = marker_probe_register("kernel_sched_wakeup", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &wakeup_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup\n"); + return; + } + + ret = marker_probe_register("kernel_sched_wakeup_new", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &wakeup_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = marker_probe_register("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + sched_switch_callback, + &wakeup_trace); + if (ret) { + pr_info("sched trace: Couldn't add marker" + " probe to kernel_sched_schedule\n"); + goto fail_deprobe_wake_new; + } + + wakeup_reset(tr); + + /* + * Don't let the tracer_enabled = 1 show up before + * the wakeup_task is reset. This may be overkill since + * wakeup_reset does a spin_unlock after setting the + * wakeup_task to NULL, but I want to be safe. + * This is a slow path anyway. 
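 * (The matching read side is the smp_rmb() in wakeup_sched_switch()
 * above: a CPU that observes tracer_enabled == 1 must also observe the
 * reset wakeup_task, otherwise it could match against a stale task
 * pointer.)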
+ */ + smp_wmb(); + + tracer_enabled = 1; + register_ftrace_function(&trace_ops); + + return; +fail_deprobe_wake_new: + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &wakeup_trace); +fail_deprobe: + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &wakeup_trace); +} + +static void stop_wakeup_tracer(struct trace_array *tr) +{ + tracer_enabled = 0; + unregister_ftrace_function(&trace_ops); + marker_probe_unregister("kernel_sched_schedule", + sched_switch_callback, + &wakeup_trace); + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &wakeup_trace); + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &wakeup_trace); +} + +static void wakeup_tracer_init(struct trace_array *tr) +{ + wakeup_trace = tr; + + if (tr->ctrl) + start_wakeup_tracer(tr); +} + +static void wakeup_tracer_reset(struct trace_array *tr) +{ + if (tr->ctrl) { + stop_wakeup_tracer(tr); + /* make sure we put back any tasks we are tracing */ + wakeup_reset(tr); + } +} + +static void wakeup_tracer_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_wakeup_tracer(tr); + else + stop_wakeup_tracer(tr); +} + +static void wakeup_tracer_open(struct trace_iterator *iter) +{ + /* stop the trace while dumping */ + if (iter->tr->ctrl) + stop_wakeup_tracer(iter->tr); +} + +static void wakeup_tracer_close(struct trace_iterator *iter) +{ + /* forget about any processes we were recording */ + if (iter->tr->ctrl) + start_wakeup_tracer(iter->tr); +} + +static struct tracer wakeup_tracer __read_mostly = +{ + .name = "wakeup", + .init = wakeup_tracer_init, + .reset = wakeup_tracer_reset, + .open = wakeup_tracer_open, + .close = wakeup_tracer_close, + .ctrl_update = wakeup_tracer_ctrl_update, + .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif +}; + +__init static int init_wakeup_tracer(void) +{ + int ret; + + ret = register_tracer(&wakeup_tracer); + if (ret) + return ret; + + return 0; +} +device_initcall(init_wakeup_tracer); Index: linux-2.6.25.8-rt7/kernel/trace/trace_selftest.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/trace/trace_selftest.c 2008-06-23 19:13:09.000000000 -0400 @@ -0,0 +1,572 @@ +/* Include in trace.c */ + +#include +#include + +static inline int trace_valid_entry(struct trace_entry *entry) +{ + switch (entry->type) { + case TRACE_FN: + case TRACE_CTX: + case TRACE_WAKE: + case TRACE_STACK: + case TRACE_SPECIAL: + case TRACE_IRQ: + case TRACE_FAULT: + case TRACE_TIMER_SET: + case TRACE_TIMER_TRIG: + case TRACE_TIMESTAMP: + case TRACE_TASK_ACT: + case TRACE_TASK_DEACT: + case TRACE_SYSCALL: + case TRACE_SYSRET: + return 1; + } + return 0; +} + +static int +trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) +{ + struct trace_entry *entries; + struct page *page; + int idx = 0; + int i; + + BUG_ON(list_empty(&data->trace_pages)); + page = list_entry(data->trace_pages.next, struct page, lru); + entries = page_address(page); + + check_pages(data); + if (head_page(data) != entries) + goto failed; + + /* + * The starting trace buffer always has valid elements, + * if any element exists. + */ + entries = head_page(data); + + for (i = 0; i < tr->entries; i++) { + + if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) { + printk(KERN_CONT ".. 
invalid entry %d ", + entries[idx].type); + goto failed; + } + + idx++; + if (idx >= ENTRIES_PER_PAGE) { + page = virt_to_page(entries); + if (page->lru.next == &data->trace_pages) { + if (i != tr->entries - 1) { + printk(KERN_CONT ".. entries buffer mismatch"); + goto failed; + } + } else { + page = list_entry(page->lru.next, struct page, lru); + entries = page_address(page); + } + idx = 0; + } + } + + page = virt_to_page(entries); + if (page->lru.next != &data->trace_pages) { + printk(KERN_CONT ".. too many entries"); + goto failed; + } + + return 0; + + failed: + /* disable tracing */ + tracing_disabled = 1; + printk(KERN_CONT ".. corrupted trace buffer .. "); + return -1; +} + +/* + * Test the trace buffer to see if all the elements + * are still sane. + */ +static int trace_test_buffer(struct trace_array *tr, unsigned long *count) +{ + unsigned long flags, cnt = 0; + int cpu, ret = 0; + + /* Don't allow flipping of max traces now */ + raw_local_irq_save(flags); + __raw_spin_lock(&ftrace_max_lock); + for_each_possible_cpu(cpu) { + if (!head_page(tr->data[cpu])) + continue; + + cnt += tr->data[cpu]->trace_idx; + + ret = trace_test_buffer_cpu(tr, tr->data[cpu]); + if (ret) + break; + } + __raw_spin_unlock(&ftrace_max_lock); + raw_local_irq_restore(flags); + + if (count) + *count = cnt; + + return ret; +} + +#ifdef CONFIG_FTRACE + +#ifdef CONFIG_DYNAMIC_FTRACE + +#define __STR(x) #x +#define STR(x) __STR(x) + +/* Test dynamic code modification and ftrace filters */ +int trace_selftest_startup_dynamic_tracing(struct tracer *trace, + struct trace_array *tr, + int (*func)(void)) +{ + unsigned long count; + int ret; + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + char *func_name; + + /* The ftrace test PASSED */ + printk(KERN_CONT "PASSED\n"); + pr_info("Testing dynamic ftrace: "); + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + tracer_enabled = 1; + + /* passed in by parameter to fool gcc from optimizing */ + func(); + + /* update the records */ + ret = ftrace_force_update(); + if (ret) { + printk(KERN_CONT ".. ftraced failed .. "); + return ret; + } + + /* + * Some archs *cough*PowerPC*cough* add charachters to the + * start of the function names. We simply put a '*' to + * accomodate them. + */ + func_name = "*" STR(DYN_FTRACE_TEST_NAME); + + /* filter only on our function */ + ftrace_set_filter(func_name, strlen(func_name), 1); + + /* enable tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + + /* we should have nothing in the buffer */ + ret = trace_test_buffer(tr, &count); + if (ret) + goto out; + + if (count) { + ret = -1; + printk(KERN_CONT ".. filter did not filter .. "); + goto out; + } + + /* call our function again */ + func(); + + /* sleep again */ + msleep(100); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + ftrace_enabled = 0; + + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + /* we should only have one item */ + if (!ret && count != 1) { + printk(KERN_CONT ".. filter failed count=%ld ..", count); + ret = -1; + goto out; + } + out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + /* Enable tracing on all functions again */ + ftrace_set_filter(NULL, 0, 1); + + return ret; +} +#else +# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) +#endif /* CONFIG_DYNAMIC_FTRACE */ +/* + * Simple verification test of ftrace function tracer. 
+ * Enable ftrace, sleep 1/10 second, and then read the trace + * buffer to see if all is in order. + */ +int +trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + + /* make sure msleep has been recorded */ + msleep(1); + + /* force the recorded functions to be traced */ + ret = ftrace_force_update(); + if (ret) { + printk(KERN_CONT ".. ftraced failed .. "); + return ret; + } + + /* start the tracing */ + ftrace_enabled = 1; + tracer_enabled = 1; + + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + ftrace_enabled = 0; + + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + ret = trace_selftest_startup_dynamic_tracing(trace, tr, + DYN_FTRACE_TEST_NAME); + + out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + /* kill ftrace totally if we failed */ + if (ret) + ftrace_kill(); + + return ret; +} +#endif /* CONFIG_FTRACE */ + +#ifdef CONFIG_IRQSOFF_TRACER +int +trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* reset the max latency */ + tracing_max_latency = 0; + /* disable interrupts for a bit */ + local_irq_disable(); + udelay(100); + local_irq_enable(); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +int +trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* reset the max latency */ + tracing_max_latency = 0; + /* disable preemption for a bit */ + preempt_disable(); + udelay(100); + preempt_enable(); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_PREEMPT_TRACER */ + +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) +int +trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + + /* reset the max latency */ + tracing_max_latency = 0; + + /* disable preemption and interrupts for a bit */ + preempt_disable(); + local_irq_disable(); + udelay(100); + preempt_enable(); + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + + /* stop the tracing. 
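 * (All of the startup selftests in this file share the same skeleton,
 * sketched here for orientation only:
 *
 *     tr->ctrl = 1;
 *     trace->init(tr);                       // start the tracer under test
 *     ... provoke the traced event (sleep, irqs off, preempt off) ...
 *     tr->ctrl = 0;
 *     trace->ctrl_update(tr);                // stop it again
 *     ret = trace_test_buffer(tr, &count);   // sanity-check every entry
 *     trace->reset(tr);
 *     ... fail if ret != 0 or if nothing was recorded ...
 * )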
*/ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (ret) + goto out; + + ret = trace_test_buffer(&max_tr, &count); + if (ret) + goto out; + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + /* do the test by disabling interrupts first this time */ + tracing_max_latency = 0; + tr->ctrl = 1; + trace->ctrl_update(tr); + preempt_disable(); + local_irq_disable(); + udelay(100); + preempt_enable(); + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (ret) + goto out; + + ret = trace_test_buffer(&max_tr, &count); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + out: + trace->reset(tr); + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */ + +#ifdef CONFIG_SCHED_TRACER +static int trace_wakeup_test_thread(void *data) +{ + /* Make this a RT thread, doesn't need to be too high */ + struct sched_param param = { .sched_priority = 5 }; + struct completion *x = data; + + sched_setscheduler(current, SCHED_FIFO, ¶m); + + /* Make it know we have a new prio */ + complete(x); + + /* now go to sleep and let the test wake us up */ + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + + /* we are awake, now wait to disappear */ + while (!kthread_should_stop()) { + /* + * This is an RT task, do short sleeps to let + * others run. + */ + msleep(100); + } + + return 0; +} + +int +trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + struct task_struct *p; + struct completion isrt; + unsigned long count; + int ret; + + init_completion(&isrt); + + /* create a high prio thread */ + p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); + if (IS_ERR(p)) { + printk(KERN_CONT "Failed to create ftrace wakeup test thread "); + return -1; + } + + /* make sure the thread is running at an RT prio */ + wait_for_completion(&isrt); + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* reset the max latency */ + tracing_max_latency = 0; + + /* sleep to let the RT thread sleep too */ + msleep(100); + + /* + * Yes this is slightly racy. It is possible that for some + * strange reason that the RT thread we created, did not + * call schedule for 100ms after doing the completion, + * and we do a wakeup on a task that already is awake. + * But that is extremely unlikely, and the worst thing that + * happens in such a case, is that we disable tracing. + * Honestly, if this race does happen something is horrible + * wrong with the system. + */ + + wake_up_process(p); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + + + trace->reset(tr); + + tracing_max_latency = save_max; + + /* kill the thread */ + kthread_stop(p); + + if (!ret && !count) { + printk(KERN_CONT ".. 
no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_SCHED_TRACER */ + +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +int +trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ + +#ifdef CONFIG_SYSPROF_TRACER +int +trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + return ret; +} +#endif /* CONFIG_SYSPROF_TRACER */ Index: linux-2.6.25.8-rt7/kernel/trace/trace_selftest_dynamic.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/trace/trace_selftest_dynamic.c 2008-06-23 19:13:04.000000000 -0400 @@ -0,0 +1,7 @@ +#include "trace.h" + +int DYN_FTRACE_TEST_NAME(void) +{ + /* used to call mcount */ + return 0; +} Index: linux-2.6.25.8-rt7/lib/Kconfig.debug =================================================================== --- linux-2.6.25.8-rt7.orig/lib/Kconfig.debug 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/lib/Kconfig.debug 2008-06-23 19:14:35.000000000 -0400 @@ -237,6 +237,8 @@ config DEBUG_RT_MUTEXES help This allows rt mutex semantics violations and rt mutex related deadlocks (lockups) to be detected and reported automatically. + When realtime preemption is enabled this includes spinlocks, + rwlocks, mutexes and (rw)semaphores config DEBUG_PI_LIST bool @@ -260,7 +262,7 @@ config DEBUG_SPINLOCK config DEBUG_MUTEXES bool "Mutex debugging: basic checks" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && !PREEMPT_RT help This feature allows mutex semantics violations to be detected and reported. @@ -275,6 +277,17 @@ config DEBUG_SEMAPHORE verbose debugging messages. If you suspect a semaphore problem or a kernel hacker asks for this option then say Y. Otherwise say N. +config RWLOCK_TORTURE_TEST + tristate "torture tests for Priority Inheritance RW locks" + depends on DEBUG_KERNEL + depends on m + default n + help + This option provides a kernel modules that runs a torture test + of several threads that try to grab mutexes, rwlocks and rwsems. + + Say N if you are unsure. + config DEBUG_LOCK_ALLOC bool "Lock debugging: detect incorrect freeing of live locks" depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT @@ -621,4 +634,6 @@ config PROVIDE_OHCI1394_DMA_INIT See Documentation/debugging-via-ohci1394.txt for more information. 
+source kernel/trace/Kconfig + source "samples/Kconfig" Index: linux-2.6.25.8-rt7/lib/Makefile =================================================================== --- linux-2.6.25.8-rt7.orig/lib/Makefile 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/lib/Makefile 2008-06-23 19:13:53.000000000 -0400 @@ -3,11 +3,20 @@ # lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o dump_stack.o \ + rbtree.o radix-tree.o dump_stack.o lock_list.o \ idr.o int_sqrt.o extable.o prio_tree.o \ sha1.o irq_regs.o reciprocal_div.o argv_split.o \ proportions.o prio_heap.o +ifdef CONFIG_FTRACE +# Do not profile string.o, since it may be used in early boot or vdso +CFLAGS_REMOVE_string.o = -pg +# Also do not profile any debug utilities +CFLAGS_REMOVE_spinlock_debug.o = -pg +CFLAGS_REMOVE_list_debug.o = -pg +CFLAGS_REMOVE_debugobjects.o = -pg +endif + lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o @@ -27,7 +36,8 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o -lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o +obj-$(CONFIG_PREEMPT_RT) += plist.o +obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o Index: linux-2.6.25.8-rt7/mm/page-writeback.c =================================================================== --- linux-2.6.25.8-rt7.orig/mm/page-writeback.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/mm/page-writeback.c 2008-06-23 19:13:57.000000000 -0400 @@ -126,8 +126,6 @@ static void background_writeout(unsigned static struct prop_descriptor vm_completions; static struct prop_descriptor vm_dirties; -static unsigned long determine_dirtyable_memory(void); - /* * couple the period to the dirty_ratio: * @@ -286,7 +284,13 @@ static unsigned long highmem_dirtyable_m #endif } -static unsigned long determine_dirtyable_memory(void) +/** + * determine_dirtyable_memory - amount of memory that may be used + * + * Returns the numebr of pages that can currently be freed and used + * by the kernel for direct mappings. + */ +unsigned long determine_dirtyable_memory(void) { unsigned long x; @@ -1016,9 +1020,11 @@ int __set_page_dirty_nobuffers(struct pa if (!mapping) return 1; - write_lock_irq(&mapping->tree_lock); + lock_page_ref_irq(page); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? 
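 * (Throughout this file the conversion pattern is the same: the global
 * mapping->tree_lock is replaced by a per-page reference lock plus a
 * radix-tree context whose lock is held only around the tag update.
 * In rough outline -- lock_page_ref_irq() and the radix tree context
 * helpers come from other parts of the -rt series and are assumed
 * here rather than defined by this hunk:
 *
 *     DEFINE_RADIX_TREE_CONTEXT(ctx, &mapping->page_tree);
 *
 *     lock_page_ref_irq(page);
 *     radix_tree_lock(&ctx);
 *     radix_tree_tag_set(ctx.tree, page_index(page), PAGECACHE_TAG_DIRTY);
 *     radix_tree_unlock(&ctx);
 *     unlock_page_ref_irq(page);
 * )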
*/ + DEFINE_RADIX_TREE_CONTEXT(ctx, &mapping->page_tree); + BUG_ON(mapping2 != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); if (mapping_cap_account_dirty(mapping)) { @@ -1027,10 +1033,12 @@ int __set_page_dirty_nobuffers(struct pa BDI_RECLAIMABLE); task_io_account_write(PAGE_CACHE_SIZE); } - radix_tree_tag_set(&mapping->page_tree, + radix_tree_lock(&ctx); + radix_tree_tag_set(ctx.tree, page_index(page), PAGECACHE_TAG_DIRTY); + radix_tree_unlock(&ctx); } - write_unlock_irq(&mapping->tree_lock); + unlock_page_ref_irq(page); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -1186,18 +1194,21 @@ int test_clear_page_writeback(struct pag struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; - write_lock_irqsave(&mapping->tree_lock, flags); + lock_page_ref_irqsave(page, flags); ret = TestClearPageWriteback(page); if (ret) { - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + DEFINE_RADIX_TREE_CONTEXT(ctx, &mapping->page_tree); + + radix_tree_lock(&ctx); + radix_tree_tag_clear(ctx.tree, page_index(page), PAGECACHE_TAG_WRITEBACK); + radix_tree_unlock(&ctx); if (bdi_cap_writeback_dirty(bdi)) { __dec_bdi_stat(bdi, BDI_WRITEBACK); __bdi_writeout_inc(bdi); } } - write_unlock_irqrestore(&mapping->tree_lock, flags); + unlock_page_ref_irqrestore(page, flags); } else { ret = TestClearPageWriteback(page); } @@ -1214,21 +1225,25 @@ int test_set_page_writeback(struct page if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; + DEFINE_RADIX_TREE_CONTEXT(ctx, &mapping->page_tree); - write_lock_irqsave(&mapping->tree_lock, flags); + lock_page_ref_irqsave(page, flags); ret = TestSetPageWriteback(page); if (!ret) { - radix_tree_tag_set(&mapping->page_tree, - page_index(page), + radix_tree_lock(&ctx); + radix_tree_tag_set(ctx.tree, page_index(page), PAGECACHE_TAG_WRITEBACK); + radix_tree_unlock(&ctx); if (bdi_cap_writeback_dirty(bdi)) __inc_bdi_stat(bdi, BDI_WRITEBACK); } - if (!PageDirty(page)) - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + if (!PageDirty(page)) { + radix_tree_lock(&ctx); + radix_tree_tag_clear(ctx.tree, page_index(page), PAGECACHE_TAG_DIRTY); - write_unlock_irqrestore(&mapping->tree_lock, flags); + radix_tree_unlock(&ctx); + } + unlock_page_ref_irqrestore(page, flags); } else { ret = TestSetPageWriteback(page); } Index: linux-2.6.25.8-rt7/scripts/Makefile.lib =================================================================== --- linux-2.6.25.8-rt7.orig/scripts/Makefile.lib 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/scripts/Makefile.lib 2008-06-23 19:13:04.000000000 -0400 @@ -96,7 +96,8 @@ basename_flags = -D"KBUILD_BASENAME=KBUI modname_flags = $(if $(filter 1,$(words $(modname))),\ -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))") -_c_flags = $(KBUILD_CFLAGS) $(ccflags-y) $(CFLAGS_$(basetarget).o) +orig_c_flags = $(KBUILD_CFLAGS) $(ccflags-y) $(CFLAGS_$(basetarget).o) +_c_flags = $(filter-out $(CFLAGS_REMOVE_$(basetarget).o), $(orig_c_flags)) _a_flags = $(KBUILD_AFLAGS) $(asflags-y) $(AFLAGS_$(basetarget).o) _cpp_flags = $(KBUILD_CPPFLAGS) $(cppflags-y) $(CPPFLAGS_$(@F)) Index: linux-2.6.25.8-rt7/Makefile =================================================================== --- linux-2.6.25.8-rt7.orig/Makefile 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/Makefile 2008-06-23 19:14:39.000000000 -0400 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 25 -EXTRAVERSION = .8 
+EXTRAVERSION = .8-rt7 NAME = Funky Weasel is Jiggy wit it # *DOCUMENTATION* @@ -507,6 +507,10 @@ else KBUILD_CFLAGS += -O2 endif +ifdef CONFIG_FTRACE +KBUILD_CFLAGS += -pg +endif + # Force gcc to behave correct even for buggy distributions # Arch Makefiles may override this setting KBUILD_CFLAGS += $(call cc-option, -fno-stack-protector) Index: linux-2.6.25.8-rt7/arch/x86/kernel/i386_ksyms_32.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/i386_ksyms_32.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/i386_ksyms_32.c 2008-06-23 19:13:27.000000000 -0400 @@ -1,13 +1,21 @@ +#include #include #include #include #include #include -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_FTRACE +/* mcount is defined in assembly */ +EXPORT_SYMBOL(mcount); +#endif + +#ifdef CONFIG_ASM_SEMAPHORES +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); Index: linux-2.6.25.8-rt7/arch/x86/kernel/x8664_ksyms_64.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/x8664_ksyms_64.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/x8664_ksyms_64.c 2008-06-23 19:13:28.000000000 -0400 @@ -1,6 +1,7 @@ /* Exports for assembly files. All C exports should go in the respective C files. */ +#include #include #include @@ -10,12 +11,19 @@ #include #include +#ifdef CONFIG_FTRACE +/* mcount is defined in assembly */ +EXPORT_SYMBOL(mcount); +#endif + EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); Index: linux-2.6.25.8-rt7/arch/x86/kernel/tsc_32.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/tsc_32.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/tsc_32.c 2008-06-23 19:13:07.000000000 -0400 @@ -109,9 +109,10 @@ static void set_cyc2ns_scale(unsigned lo /* * Scheduler clock - returns current time in nanosec units. 
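 * (Context for the notrace annotation added below: with CONFIG_FTRACE
 * the top-level Makefile change above compiles the kernel with -pg, so
 * gcc emits a call to mcount() at the start of every function -- which
 * is also why the ksyms files above now export mcount for modules.
 * Functions the tracer itself depends on, like this clock, opt out via
 * notrace, which is expected to boil down to something like:
 *
 *     #define notrace __attribute__((no_instrument_function))
 *
 * the exact definition lives elsewhere in the tree and is only assumed
 * here.)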
*/ -unsigned long long native_sched_clock(void) +unsigned long long notrace native_sched_clock(void) { unsigned long long this_offset; + unsigned long long ret; /* * Fall back to jiffies if there's no TSC available: @@ -125,11 +126,17 @@ unsigned long long native_sched_clock(vo /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); + preempt_disable_notrace(); + /* read the Time Stamp Counter: */ rdtscll(this_offset); /* return the value in ns */ - return cycles_2_ns(this_offset); + ret = cycles_2_ns(this_offset); + + preempt_enable_notrace(); + + return ret; } /* We need to define a real function for sched_clock, to override the Index: linux-2.6.25.8-rt7/arch/x86/kernel/nmi_32.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/nmi_32.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/nmi_32.c 2008-06-23 19:14:32.000000000 -0400 @@ -25,6 +25,7 @@ #include #include +#include #include "mach_traps.h" @@ -42,7 +43,7 @@ static cpumask_t backtrace_mask = CPU_MA atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ unsigned int nmi_watchdog = NMI_DEFAULT; -static unsigned int nmi_hz = HZ; +static unsigned int nmi_hz = 1000; static DEFINE_PER_CPU(short, wd_enabled); @@ -55,7 +56,12 @@ static int endflag __initdata = 0; */ static __init void nmi_cpu_busy(void *data) { + /* + * avoid a warning, on PREEMPT_RT this wont run in hardirq context: + */ +#ifndef CONFIG_PREEMPT_RT local_irq_enable_in_hardirq(); +#endif /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the @@ -92,7 +98,7 @@ static int __init check_nmi_watchdog(voi for_each_possible_cpu(cpu) prev_nmi_count[cpu] = nmi_count(cpu); local_irq_enable(); - mdelay((20*1000)/nmi_hz); // wait 20 ticks + mdelay((100*1000)/nmi_hz); // wait 100 ticks for_each_possible_cpu(cpu) { #ifdef CONFIG_SMP @@ -317,7 +323,60 @@ EXPORT_SYMBOL(touch_nmi_watchdog); extern void die_nmi(struct pt_regs *, const char *msg); -__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +int nmi_show_regs[NR_CPUS]; + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + +notrace int irq_show_regs_callback(int cpu, struct pt_regs *regs) +{ + if (!nmi_show_regs[cpu]) + return 0; + + spin_lock(&nmi_print_lock); + printk(KERN_WARNING "NMI show regs on CPU#%d:\n", cpu); + printk(KERN_WARNING "apic_timer_irqs: %d\n", + per_cpu(irq_stat, cpu).apic_timer_irqs); + show_regs(regs); + spin_unlock(&nmi_print_lock); + nmi_show_regs[cpu] = 0; + return 1; +} + +void nmi_show_all_regs(void) +{ + struct pt_regs *regs; + int i, cpu; + + if (system_state == SYSTEM_BOOTING) + return; + + preempt_disable(); + + regs = get_irq_regs(); + cpu = smp_processor_id(); + + printk(KERN_WARNING "nmi_show_all_regs(): start on CPU#%d.\n", cpu); + dump_stack(); + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + + if (regs) + irq_show_regs_callback(cpu, regs); + else + nmi_show_regs[cpu] = 0; + + smp_send_nmi_allbutself(); + preempt_enable(); + + for_each_online_cpu(i) { + while (nmi_show_regs[i] == 1) + barrier(); + } +} + +notrace __kprobes int +nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { /* @@ -328,7 +387,10 @@ __kprobes int nmi_watchdog_tick(struct p unsigned int sum; int touched = 0; int cpu = smp_processor_id(); - int rc = 0; + int rc; + + rc = irq_show_regs_callback(cpu, regs); + __profile_tick(CPU_PROFILING, 
regs); /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) @@ -342,7 +404,7 @@ __kprobes int nmi_watchdog_tick(struct p spin_lock(&lock); printk("NMI backtrace for cpu %d\n", cpu); - dump_stack(); + show_regs(regs); spin_unlock(&lock); cpu_clear(cpu, backtrace_mask); } @@ -354,6 +416,7 @@ __kprobes int nmi_watchdog_tick(struct p sum = per_cpu(irq_stat, cpu).apic_timer_irqs + per_cpu(irq_stat, cpu).irq0_irqs; + /* if the apic timer isn't firing, this cpu isn't doing much */ /* if the none of the timers isn't firing, this cpu isn't doing much */ if (!touched && last_irq_sums[cpu] == sum) { /* @@ -361,11 +424,36 @@ __kprobes int nmi_watchdog_tick(struct p * wait a few IRQs (5 seconds) before doing the oops ... */ alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. - */ - die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP"); + if (alert_counter[cpu] && !(alert_counter[cpu] % (5*nmi_hz))) { + int i; + + spin_lock(&nmi_print_lock); + printk(KERN_WARNING "NMI watchdog detected lockup on " + "CPU#%d (%d/%d)\n", cpu, alert_counter[cpu], + 5*nmi_hz); + show_regs(regs); + spin_unlock(&nmi_print_lock); + + for_each_online_cpu(i) { + if (i == cpu) + continue; + nmi_show_regs[i] = 1; + } + + smp_send_nmi_allbutself(); + + for_each_online_cpu(i) { + if (i == cpu) + continue; + while (nmi_show_regs[i] == 1) + cpu_relax(); + } + printk(KERN_WARNING "NMI watchdog running again ...\n"); + for_each_online_cpu(i) + alert_counter[i] = 0; + + } + } else { last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; @@ -463,5 +551,17 @@ void __trigger_all_cpu_backtrace(void) } } +void smp_send_nmi_allbutself(void) +{ +#ifdef CONFIG_SMP + cpumask_t mask = cpu_online_map; + preempt_disable(); + cpu_clear(safe_smp_processor_id(), mask); + if (!cpus_empty(mask)) + send_IPI_mask(mask, NMI_VECTOR); + preempt_enable(); +#endif +} + EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); Index: linux-2.6.25.8-rt7/arch/x86/kernel/nmi_64.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/nmi_64.c 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/nmi_64.c 2008-06-23 19:14:32.000000000 -0400 @@ -20,11 +20,13 @@ #include #include #include +#include #include #include #include #include +#include int unknown_nmi_panic; int nmi_watchdog_enabled; @@ -42,12 +44,12 @@ atomic_t nmi_active = ATOMIC_INIT(0); / static int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; -static unsigned int nmi_hz = HZ; +static unsigned int nmi_hz = 1000; static DEFINE_PER_CPU(short, wd_enabled); /* Run after command line and cpu_init init, but before all other checks */ -void nmi_watchdog_default(void) +static void nmi_watchdog_default(void) { if (nmi_watchdog != NMI_DEFAULT) return; @@ -63,7 +65,9 @@ static int endflag __initdata = 0; */ static __init void nmi_cpu_busy(void *data) { +#ifndef CONFIG_PREEMPT_RT local_irq_enable_in_hardirq(); +#endif /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the @@ -297,7 +301,7 @@ void touch_nmi_watchdog(void) unsigned cpu; /* - * Tell other CPUs to reset their alert counters. We cannot + * Tell other CPUs to reset their alert counters. We cannot * do it ourselves because the alert count increase is not * atomic. 
*/ @@ -311,12 +315,66 @@ void touch_nmi_watchdog(void) } EXPORT_SYMBOL(touch_nmi_watchdog); -int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +int nmi_show_regs[NR_CPUS]; + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + +notrace int irq_show_regs_callback(int cpu, struct pt_regs *regs) +{ + if (!nmi_show_regs[cpu]) + return 0; + + spin_lock(&nmi_print_lock); + printk(KERN_WARNING "NMI show regs on CPU#%d:\n", cpu); + printk(KERN_WARNING "apic_timer_irqs: %d\n", read_pda(apic_timer_irqs)); + show_regs(regs); + spin_unlock(&nmi_print_lock); + nmi_show_regs[cpu] = 0; + return 1; +} + +void nmi_show_all_regs(void) +{ + struct pt_regs *regs; + int i, cpu; + + if (system_state == SYSTEM_BOOTING) + return; + + preempt_disable(); + + regs = get_irq_regs(); + cpu = smp_processor_id(); + + printk(KERN_WARNING "nmi_show_all_regs(): start on CPU#%d.\n", cpu); + dump_stack(); + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + + if (regs) + irq_show_regs_callback(cpu, regs); + else + nmi_show_regs[cpu] = 0; + + smp_send_nmi_allbutself(); + preempt_enable(); + + for_each_online_cpu(i) { + while (nmi_show_regs[i] == 1) + barrier(); + } +} + +notrace int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; int cpu = smp_processor_id(); - int rc = 0; + int rc; + + rc = irq_show_regs_callback(cpu, regs); + __profile_tick(CPU_PROFILING, regs); /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) @@ -325,7 +383,6 @@ int __kprobes nmi_watchdog_tick(struct p touched = 1; } - sum = read_pda(apic_timer_irqs) + read_pda(irq0_irqs); if (__get_cpu_var(nmi_touch)) { __get_cpu_var(nmi_touch) = 0; touched = 1; @@ -341,6 +398,12 @@ int __kprobes nmi_watchdog_tick(struct p cpu_clear(cpu, backtrace_mask); } + /* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ + sum = read_pda(apic_timer_irqs) + kstat_cpu(cpu).irqs[0]; + #ifdef CONFIG_X86_MCE /* Could check oops_in_progress here too, but it's safer not too */ @@ -354,9 +417,26 @@ int __kprobes nmi_watchdog_tick(struct p * wait a few IRQs (5 seconds) before doing the oops ... 
*/ local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) + if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { + int i; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + nmi_show_regs[i] = 1; + } + + smp_send_nmi_allbutself(); + + for_each_online_cpu(i) { + if (i == cpu) + continue; + while (nmi_show_regs[i] == 1) + cpu_relax(); + } die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, panic_on_timeout); + } } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); @@ -474,5 +554,14 @@ void __trigger_all_cpu_backtrace(void) } } +void smp_send_nmi_allbutself(void) +{ +#ifdef CONFIG_SMP + preempt_disable(); + send_IPI_allbutself(NMI_VECTOR); + preempt_enable(); +#endif +} + EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); Index: linux-2.6.25.8-rt7/arch/x86/lib/Makefile =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/lib/Makefile 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/lib/Makefile 2008-06-23 19:13:05.000000000 -0400 @@ -5,6 +5,7 @@ obj-$(CONFIG_SMP) := msr-on-cpu.o lib-y := delay_$(BITS).o +lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o lib-y += memcpy_$(BITS).o @@ -20,7 +21,7 @@ else CFLAGS_csum-partial_64.o := -funroll-loops lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o - lib-y += thunk_64.o clear_page_64.o copy_page_64.o + lib-y += clear_page_64.o copy_page_64.o lib-y += bitops_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o Index: linux-2.6.25.8-rt7/include/asm-x86/alternative.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-x86/alternative.h 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-x86/alternative.h 2008-06-23 19:13:05.000000000 -0400 @@ -72,6 +72,8 @@ static inline void alternatives_smp_modu static inline void alternatives_smp_switch(int smp) {} #endif /* CONFIG_SMP */ +const unsigned char *const *find_nop_table(void); + /* * Alternative instructions for different CPU types or capabilities. * Index: linux-2.6.25.8-rt7/include/asm-x86/irqflags.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-x86/irqflags.h 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-x86/irqflags.h 2008-06-23 19:13:05.000000000 -0400 @@ -212,8 +212,6 @@ static inline void trace_hardirqs_fixup( * have a reliable stack. x86_64 only. 
*/ #define SWAPGS_UNSAFE_STACK swapgs -#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk -#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk #define ARCH_LOCKDEP_SYS_EXIT_IRQ \ TRACE_IRQS_ON; \ @@ -225,24 +223,6 @@ static inline void trace_hardirqs_fixup( TRACE_IRQS_OFF; #else -#define ARCH_TRACE_IRQS_ON \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_on; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - -#define ARCH_TRACE_IRQS_OFF \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_off; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - #define ARCH_LOCKDEP_SYS_EXIT \ pushl %eax; \ pushl %ecx; \ @@ -256,8 +236,8 @@ static inline void trace_hardirqs_fixup( #endif #ifdef CONFIG_TRACE_IRQFLAGS -# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON -# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; #else # define TRACE_IRQS_ON # define TRACE_IRQS_OFF Index: linux-2.6.25.8-rt7/include/asm-x86/timer.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-x86/timer.h 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-x86/timer.h 2008-06-23 19:13:07.000000000 -0400 @@ -43,19 +43,16 @@ DECLARE_PER_CPU(unsigned long, cyc2ns); #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ -static inline unsigned long long __cycles_2_ns(unsigned long long cyc) +static inline notrace unsigned long long __cycles_2_ns(unsigned long long cyc) { return cyc * per_cpu(cyc2ns, smp_processor_id()) >> CYC2NS_SCALE_FACTOR; } -static inline unsigned long long cycles_2_ns(unsigned long long cyc) +static inline notrace unsigned long long cycles_2_ns(unsigned long long cyc) { unsigned long long ns; - unsigned long flags; - local_irq_save(flags); ns = __cycles_2_ns(cyc); - local_irq_restore(flags); return ns; } Index: linux-2.6.25.8-rt7/arch/x86/kernel/Makefile =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/Makefile 2008-06-23 19:13:00.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/Makefile 2008-06-23 19:13:05.000000000 -0400 @@ -7,6 +7,13 @@ extra-$(CONFIG_X86_64) += head64.o CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) +ifdef CONFIG_FTRACE +# Do not profile debug utilities +CFLAGS_REMOVE_tsc_64.o = -pg +CFLAGS_REMOVE_tsc_32.o = -pg +CFLAGS_REMOVE_rtc.o = -pg +endif + # # vsyscalls (which work on the user stack) should have # no stack-protector checks: @@ -55,6 +62,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse_$(B obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o Index: linux-2.6.25.8-rt7/init/main.c =================================================================== --- linux-2.6.25.8-rt7.orig/init/main.c 2008-06-23 19:12:59.000000000 -0400 +++ linux-2.6.25.8-rt7/init/main.c 2008-06-23 19:14:32.000000000 -0400 @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +60,7 @@ #include #include #include +#include #include #include @@ -97,6 +100,12 @@ static inline void 
acpi_early_init(void) #ifndef CONFIG_DEBUG_RODATA static inline void mark_rodata_ro(void) { } #endif +#ifdef CONFIG_ALLOC_RTSJ_MEM +extern void alloc_rtsj_mem_early_setup(void); +#else +static inline void alloc_rtsj_mem_early_setup(void) { } +#endif + #ifdef CONFIG_TC extern void tc_init(void); @@ -434,6 +443,8 @@ static void noinline __init_refok rest_i { int pid; + system_state = SYSTEM_BOOTING_SCHEDULER_OK; + kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); numa_default_policy(); pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); @@ -445,7 +456,7 @@ static void noinline __init_refok rest_i * at least once to get things moving: */ init_idle_bootup_task(current); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); schedule(); preempt_disable(); @@ -550,8 +561,10 @@ asmlinkage void __init start_kernel(void * fragile until we cpu_idle() for the first time. */ preempt_disable(); + build_all_zonelists(); page_alloc_init(); + early_init_hardirqs(); printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); parse_early_param(); parse_args("Booting kernel", static_command_line, __start___param, @@ -572,6 +585,7 @@ asmlinkage void __init start_kernel(void softirq_init(); timekeeping_init(); time_init(); + sched_clock_init(); profile_init(); if (!irqs_disabled()) printk("start_kernel(): bug: interrupts were enabled early\n"); @@ -606,6 +620,7 @@ asmlinkage void __init start_kernel(void #endif vfs_caches_init_early(); cpuset_init_early(); + alloc_rtsj_mem_early_setup(); mem_init(); enable_debug_pagealloc(); cpu_hotplug_init(); @@ -646,6 +661,9 @@ asmlinkage void __init start_kernel(void acpi_early_init(); /* before LAPIC and SMP init */ +#ifdef CONFIG_PREEMPT_RT + WARN_ON(irqs_disabled()); +#endif /* Do the rest non-__init'ed, we're now alive */ rest_init(); } @@ -750,11 +768,14 @@ __setup("nosoftlockup", nosoftlockup_set static void __init do_pre_smp_initcalls(void) { extern int spawn_ksoftirqd(void); + extern int spawn_desched_task(void); migration_init(); + posix_cpu_thread_init(); spawn_ksoftirqd(); if (!nosoftlockup) spawn_softlockup_task(); + spawn_desched_task(); } static void run_init_process(char *init_filename) @@ -780,11 +801,17 @@ static int noinline init_post(void) (void) sys_dup(0); (void) sys_dup(0); +#ifdef CONFIG_PREEMPT_RT + ftrace_disable_daemon(); +#endif if (ramdisk_execute_command) { run_init_process(ramdisk_execute_command); printk(KERN_WARNING "Failed to execute %s\n", ramdisk_execute_command); } +#ifdef CONFIG_PREEMPT_RT + WARN_ON(irqs_disabled()); +#endif /* * We try each of these until one succeeds. 
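/*
 * Illustrative sketch, not part of the patch: the CONFIG_PREEMPT_RT hunks
 * above all follow one pattern -- at well-defined boot milestones, assert
 * that no code path left hard interrupts disabled.  A helper like the one
 * below (the name rt_check_irqs_enabled is an assumption, not something the
 * patch adds) makes that intent explicit:
 */
#include <linux/irqflags.h>
#include <linux/kernel.h>

static inline void rt_check_irqs_enabled(const char *where)
{
#ifdef CONFIG_PREEMPT_RT
	/* same check the patch open-codes as WARN_ON(irqs_disabled()) */
	if (irqs_disabled()) {
		printk(KERN_WARNING "PREEMPT_RT: irqs disabled at %s\n", where);
		WARN_ON(1);
	}
#endif
}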
@@ -826,6 +853,8 @@ static int __init kernel_init(void * unu smp_prepare_cpus(setup_max_cpus); + init_hardirqs(); + do_pre_smp_initcalls(); smp_init(); @@ -847,12 +876,61 @@ static int __init kernel_init(void * unu ramdisk_execute_command = NULL; prepare_namespace(); } +#ifdef CONFIG_PREEMPT_RT + WARN_ON(irqs_disabled()); +#endif +#define DEBUG_COUNT (defined(CONFIG_DEBUG_RT_MUTEXES) + defined(CONFIG_IRQSOFF_TRACER) + defined(CONFIG_PREEMPT_TRACER) + defined(CONFIG_FTRACE) + defined(CONFIG_WAKEUP_LATENCY_HIST) + defined(CONFIG_DEBUG_SLAB) + defined(CONFIG_DEBUG_PAGEALLOC) + defined(CONFIG_LOCKDEP)) + +#if DEBUG_COUNT > 0 + printk(KERN_ERR "*****************************************************************************\n"); + printk(KERN_ERR "* *\n"); +#if DEBUG_COUNT == 1 + printk(KERN_ERR "* REMINDER, the following debugging option is turned on in your .config: *\n"); +#else + printk(KERN_ERR "* REMINDER, the following debugging options are turned on in your .config: *\n"); +#endif + printk(KERN_ERR "* *\n"); +#ifdef CONFIG_DEBUG_RT_MUTEXES + printk(KERN_ERR "* CONFIG_DEBUG_RT_MUTEXES *\n"); +#endif +#ifdef CONFIG_IRQSOFF_TRACER + printk(KERN_ERR "* CONFIG_IRQSOFF_TRACER *\n"); +#endif +#ifdef CONFIG_PREEMPT_TRACER + printk(KERN_ERR "* CONFIG_PREEMPT_TRACER *\n"); +#endif +#ifdef CONFIG_FTRACE + printk(KERN_ERR "* CONFIG_FTRACE *\n"); +#endif +#ifdef CONFIG_WAKEUP_LATENCY_HIST + printk(KERN_ERR "* CONFIG_WAKEUP_LATENCY_HIST *\n"); +#endif +#ifdef CONFIG_DEBUG_SLAB + printk(KERN_ERR "* CONFIG_DEBUG_SLAB *\n"); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk(KERN_ERR "* CONFIG_DEBUG_PAGEALLOC *\n"); +#endif +#ifdef CONFIG_LOCKDEP + printk(KERN_ERR "* CONFIG_LOCKDEP *\n"); +#endif + printk(KERN_ERR "* *\n"); +#if DEBUG_COUNT == 1 + printk(KERN_ERR "* it may increase runtime overhead and latencies. *\n"); +#else + printk(KERN_ERR "* they may increase runtime overhead and latencies. *\n"); +#endif + printk(KERN_ERR "* *\n"); + printk(KERN_ERR "*****************************************************************************\n"); +#endif /* * Ok, we have completed the initial bootup, and * we're essentially up and running. Get rid of the * initmem segments and start the user-mode stuff.. */ init_post(); + WARN_ON(debug_direct_keyboard); + return 0; } Index: linux-2.6.25.8-rt7/kernel/sched_clock.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/sched_clock.c 2008-06-23 19:14:22.000000000 -0400 @@ -0,0 +1,272 @@ +/* + * sched_clock for unstable cpu clocks + * + * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra + * + * Based on code by: + * Ingo Molnar + * Guillaume Chazarain + * + * Create a semi stable clock from a mixture of other events, including: + * - gtod + * - jiffies + * - sched_clock() + * - explicit idle events + * + * We use gtod as base and the unstable clock deltas. The deltas are filtered, + * making it monotonic and keeping it within an expected window. This window + * is set up using jiffies. + * + * Furthermore, explicit sleep and wakeup hooks allow us to account for time + * that is otherwise invisible (TSC gets stopped). + * + * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat + * consistent between cpus (never more than 1 jiffies difference). 
+ */ +#include +#include +#include +#include +#include +#include + + +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + +struct sched_clock_data { + /* + * Raw spinlock - this is a special case: this might be called + * from within instrumentation code so we dont want to do any + * instrumentation ourselves. + */ + __raw_spinlock_t lock; + + unsigned long prev_jiffies; + u64 prev_raw; + u64 tick_raw; + u64 tick_gtod; + u64 clock; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); + +static inline struct sched_clock_data *this_scd(void) +{ + return &__get_cpu_var(sched_clock_data); +} + +static inline struct sched_clock_data *cpu_sdc(int cpu) +{ + return &per_cpu(sched_clock_data, cpu); +} + +static __read_mostly int sched_clock_running; + +void sched_clock_init(void) +{ + int cpu; + unsigned long now_jiffies = jiffies; + u64 ktime = ktime_to_ns(ktime_get()); + + for_each_possible_cpu(cpu) { + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + scd->prev_jiffies = now_jiffies; + scd->prev_raw = 0; + scd->tick_raw = 0; + scd->tick_gtod = 0; + scd->clock = ktime; + } + + sched_clock_running = 1; +} + +/* + * update the percpu scd from the raw @now value + * + * - filter out backward motion + * - use jiffies to generate a min,max window to clip the raw values + */ +static void __update_sched_clock(struct sched_clock_data *scd, u64 now) +{ + unsigned long now_jiffies = jiffies; + long delta_jiffies = now_jiffies - scd->prev_jiffies; + u64 clock = scd->clock; + u64 min_clock, max_clock; + s64 delta = now - scd->prev_raw; + + WARN_ON_ONCE(!irqs_disabled()); + min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; + + if (unlikely(delta < 0)) { + clock++; + goto out; + } + + max_clock = min_clock + TICK_NSEC; + + if (unlikely(clock + delta > max_clock)) { + if (clock < max_clock) + clock = max_clock; + else + clock++; + } else { + clock += delta; + } + + out: + if (unlikely(clock < min_clock)) + clock = min_clock; + + scd->prev_raw = now; + scd->prev_jiffies = now_jiffies; + scd->clock = clock; +} + +static void lock_double_clock(struct sched_clock_data *data1, + struct sched_clock_data *data2) +{ + if (data1 < data2) { + __raw_spin_lock(&data1->lock); + __raw_spin_lock(&data2->lock); + } else { + __raw_spin_lock(&data2->lock); + __raw_spin_lock(&data1->lock); + } +} + +u64 sched_clock_cpu(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + u64 now, clock; + + if (unlikely(!sched_clock_running)) + return 0ULL; + + if (unlikely(in_nmi())) + return scd->clock; + + WARN_ON_ONCE(!irqs_disabled()); + now = sched_clock(); + +#if 0 + if (cpu != smp_processor_id()) { + /* + * in order to update a remote cpu's clock based on our + * unstable raw time rebase it against: + * tick_raw (offset between raw counters) + * tick_gotd (tick offset between cpus) + */ + struct sched_clock_data *my_scd = this_scd(); + + lock_double_clock(scd, my_scd); + + now -= my_scd->tick_raw; + now += scd->tick_raw; + + now -= my_scd->tick_gtod; + now += scd->tick_gtod; + + __raw_spin_unlock(&my_scd->lock); + } else { + __raw_spin_lock(&scd->lock); + } + + __update_sched_clock(scd, now); + clock = scd->clock; + + __raw_spin_unlock(&scd->lock); +#else + if (cpu == smp_processor_id() && __raw_spin_trylock(&scd->lock)) { + __update_sched_clock(scd, now); + clock = scd->clock; + __raw_spin_unlock(&scd->lock); + } else + clock = scd->clock; +#endif + + return clock; +} + +void sched_clock_tick(void) +{ + struct sched_clock_data *scd = 
this_scd(); + u64 now, now_gtod; + + if (unlikely(!sched_clock_running)) + return; + + WARN_ON_ONCE(!irqs_disabled()); + + now = sched_clock(); + now_gtod = ktime_to_ns(ktime_get()); + + __raw_spin_lock(&scd->lock); + __update_sched_clock(scd, now); + /* + * update tick_gtod after __update_sched_clock() because that will + * already observe 1 new jiffy; adding a new tick_gtod to that would + * increase the clock 2 jiffies. + */ + scd->tick_raw = now; + scd->tick_gtod = now_gtod; + __raw_spin_unlock(&scd->lock); +} + +/* + * We are going deep-idle (irqs are disabled): + */ +void sched_clock_idle_sleep_event(void) +{ + sched_clock_cpu(smp_processor_id()); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); + +/* + * We just idled delta nanoseconds (called with irqs disabled): + */ +void sched_clock_idle_wakeup_event(u64 delta_ns) +{ + struct sched_clock_data *scd = this_scd(); + u64 now = sched_clock(); + + /* + * Override the previous timestamp and ignore all + * sched_clock() deltas that occured while we idled, + * and use the PM-provided delta_ns to advance the + * rq clock: + */ + __raw_spin_lock(&scd->lock); + scd->prev_raw = now; + scd->clock += delta_ns; + __raw_spin_unlock(&scd->lock); + + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); + +#endif + +/* + * Scheduler clock - returns current time in nanosec units. + * This is default implementation. + * Architectures and sub-architectures can override this. + */ +unsigned long long __attribute__((weak)) sched_clock(void) +{ + return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); +} + +unsigned long long cpu_clock(int cpu) +{ + unsigned long flags; + unsigned long long clock; + + raw_local_irq_save(flags); + clock = sched_clock_cpu(cpu); + raw_local_irq_restore(flags); + + return clock; +} +EXPORT_SYMBOL_GPL(cpu_clock); Index: linux-2.6.25.8-rt7/kernel/sched_debug.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/sched_debug.c 2008-06-23 19:12:59.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/sched_debug.c 2008-06-23 19:14:05.000000000 -0400 @@ -175,18 +175,24 @@ static void print_cpu(struct seq_file *m PN(next_balance); P(curr->pid); PN(clock); - PN(idle_clock); - PN(prev_clock_raw); - P(clock_warps); - P(clock_overflows); - P(clock_underflows); - P(clock_deep_idle_events); - PN(clock_max_delta); P(cpu_load[0]); P(cpu_load[1]); P(cpu_load[2]); P(cpu_load[3]); P(cpu_load[4]); +#ifdef CONFIG_PREEMPT_RT + /* Print rt related rq stats */ + P(rt.rt_nr_running); + P(rt.rt_nr_uninterruptible); +# ifdef CONFIG_SCHEDSTATS + P(rto_schedule); + P(rto_schedule_tail); + P(rto_wakeup); + P(rto_pulled); + P(rto_pushed); +# endif +#endif + #undef P #undef PN Index: linux-2.6.25.8-rt7/kernel/sched_fair.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/sched_fair.c 2008-06-23 19:12:59.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/sched_fair.c 2008-06-23 19:14:33.000000000 -0400 @@ -901,7 +901,7 @@ static void yield_task_fair(struct rq *r return; if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { - __update_rq_clock(rq); + update_rq_clock(rq); /* * Update run-time statistics of the 'current'. */ @@ -951,7 +951,7 @@ static int wake_idle(int cpu, struct tas * sibling runqueue info. This will avoid the checks and cache miss * penalities associated with that. 
*/ - if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) + if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1) return cpu; for_each_domain(cpu, sd) { Index: linux-2.6.25.8-rt7/arch/x86/kernel/tsc_64.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/tsc_64.c 2008-06-23 19:12:59.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/tsc_64.c 2008-06-23 19:13:07.000000000 -0400 @@ -66,6 +66,7 @@ static void set_cyc2ns_scale(unsigned lo unsigned long long native_sched_clock(void) { unsigned long a = 0; + unsigned long long ret; /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, * which means it is not completely exact and may not be monotonous @@ -73,8 +74,11 @@ unsigned long long native_sched_clock(vo * scheduling purposes. */ + preempt_disable_notrace(); rdtscll(a); - return cycles_2_ns(a); + ret = cycles_2_ns(a); + preempt_enable_notrace(); + return ret; } /* We need to define a real function for sched_clock, to override the Index: linux-2.6.25.8-rt7/arch/x86/kernel/apic_32.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/apic_32.c 2008-06-23 19:12:59.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/apic_32.c 2008-06-23 19:14:34.000000000 -0400 @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -598,6 +599,7 @@ void smp_apic_timer_interrupt(struct pt_ { struct pt_regs *old_regs = set_irq_regs(regs); + ftrace_event_irq(-1, user_mode(regs), regs->ip); /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. @@ -1285,6 +1287,7 @@ void smp_error_interrupt(struct pt_regs */ printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); + dump_stack(); irq_exit(); } Index: linux-2.6.25.8-rt7/arch/x86/kernel/irq_32.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/irq_32.c 2008-06-23 19:12:59.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/irq_32.c 2008-06-23 19:14:34.000000000 -0400 @@ -16,6 +16,8 @@ #include #include +#include + #include #include @@ -77,6 +79,10 @@ unsigned int do_IRQ(struct pt_regs *regs u32 *isp; #endif +#ifdef CONFIG_X86_LOCAL_APIC + irq_show_regs_callback(smp_processor_id(), regs); +#endif + if (unlikely((unsigned)irq >= NR_IRQS)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", __FUNCTION__, irq); @@ -85,6 +91,7 @@ unsigned int do_IRQ(struct pt_regs *regs old_regs = set_irq_regs(regs); irq_enter(); + ftrace_event_irq(irq, user_mode(regs), regs->ip); #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? 
*/ { @@ -93,7 +100,7 @@ unsigned int do_IRQ(struct pt_regs *regs __asm__ __volatile__("andl %%esp,%0" : "=r" (sp) : "0" (THREAD_SIZE - 1)); if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) { - printk("do_IRQ: stack overflow: %ld\n", + printk("BUG: do_IRQ: stack overflow: %ld\n", sp - sizeof(struct thread_info)); dump_stack(); } Index: linux-2.6.25.8-rt7/arch/x86/kernel/irq_64.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/irq_64.c 2008-06-23 19:12:59.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/irq_64.c 2008-06-23 19:13:16.000000000 -0400 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -165,10 +166,14 @@ asmlinkage unsigned int do_IRQ(struct pt unsigned vector = ~regs->orig_ax; unsigned irq; + irq_show_regs_callback(smp_processor_id(), regs); + exit_idle(); irq_enter(); irq = __get_cpu_var(vector_irq)[vector]; + ftrace_event_irq(irq, user_mode(regs), regs->ip); + #ifdef CONFIG_DEBUG_STACKOVERFLOW stack_overflow_check(regs); #endif Index: linux-2.6.25.8-rt7/arch/x86/kernel/traps_32.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/traps_32.c 2008-06-23 19:12:59.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/traps_32.c 2008-06-23 19:14:38.000000000 -0400 @@ -30,6 +30,8 @@ #include #include +#include + #ifdef CONFIG_EISA #include #include @@ -255,6 +257,7 @@ show_trace_log_lvl(struct task_struct *t { dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); printk("%s =======================\n", log_lvl); + print_preempt_trace(task); } void show_trace(struct task_struct *task, struct pt_regs *regs, @@ -284,8 +287,15 @@ static void show_stack_log_lvl(struct ta printk("\n%s ", log_lvl); printk("%08lx ", *stack++); } + + pause_on_oops_head(); + printk("\n%sCall Trace:\n", log_lvl); show_trace_log_lvl(task, regs, sp, bp, log_lvl); + debug_show_held_locks(task); + + pause_on_oops_tail(); + } void show_stack(struct task_struct *task, unsigned long *sp) @@ -317,6 +327,12 @@ void dump_stack(void) EXPORT_SYMBOL(dump_stack); +#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_EVENT_TRACE) +extern unsigned long worst_stack_left; +#else +# define worst_stack_left -1L +#endif + void show_registers(struct pt_regs *regs) { int i; @@ -382,6 +398,8 @@ int __kprobes __die(const char * str, st unsigned long sp; unsigned short ss; + ftrace_stop(); + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT printk("PREEMPT "); @@ -425,7 +443,7 @@ void die(const char * str, struct pt_reg u32 lock_owner; int lock_owner_depth; } die = { - .lock = __RAW_SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED(die.lock), .lock_owner = -1, .lock_owner_depth = 0 }; @@ -436,7 +454,7 @@ void die(const char * str, struct pt_reg if (die.lock_owner != raw_smp_processor_id()) { console_verbose(); raw_local_irq_save(flags); - __raw_spin_lock(&die.lock); + spin_lock(&die.lock); die.lock_owner = smp_processor_id(); die.lock_owner_depth = 0; bust_spinlocks(1); @@ -455,7 +473,7 @@ void die(const char * str, struct pt_reg bust_spinlocks(0); die.lock_owner = -1; add_taint(TAINT_DIE); - __raw_spin_unlock(&die.lock); + spin_unlock(&die.lock); raw_local_irq_restore(flags); if (!regs) @@ -495,6 +513,11 @@ static void __kprobes do_trap(int trapnr if (!user_mode(regs)) goto kernel_trap; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + trap_signal: { /* * We want 
error_code and trap_no set for userspace faults and @@ -755,6 +778,7 @@ void __kprobes die_nmi(struct pt_regs *r crash_kexec(regs); } + nmi_exit(); do_exit(SIGSEGV); } @@ -804,6 +828,8 @@ __kprobes void do_nmi(struct pt_regs * r nmi_enter(); + ftrace_event_irq(-1, user_mode(regs), regs->ip); + cpu = smp_processor_id(); ++nmi_count(cpu); Index: linux-2.6.25.8-rt7/arch/x86/kernel/traps_64.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/traps_64.c 2008-06-23 19:12:59.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/traps_64.c 2008-06-23 19:14:38.000000000 -0400 @@ -33,6 +33,8 @@ #include #include +#include + #if defined(CONFIG_EDAC) #include #endif @@ -82,20 +84,22 @@ static inline void conditional_sti(struc local_irq_enable(); } -static inline void preempt_conditional_sti(struct pt_regs *regs) +static inline void preempt_conditional_sti(struct pt_regs *regs, int stack) { - inc_preempt_count(); + if (stack) + inc_preempt_count(); if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } -static inline void preempt_conditional_cli(struct pt_regs *regs) +static inline void preempt_conditional_cli(struct pt_regs *regs, int stack) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); /* Make sure to not schedule here because we could be running on an exception stack. */ - dec_preempt_count(); + if (stack) + dec_preempt_count(); } int kstack_depth_to_print = 12; @@ -132,10 +136,14 @@ static unsigned long *in_exception_stack unsigned *usedp, char **idp) { static char ids[][8] = { +#if DEBUG_STACK > 0 [DEBUG_STACK - 1] = "#DB", +#endif [NMI_STACK - 1] = "NMI", [DOUBLEFAULT_STACK - 1] = "#DF", +#if STACKFAULT_STACK > 0 [STACKFAULT_STACK - 1] = "#SS", +#endif [MCE_STACK - 1] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" @@ -260,7 +268,7 @@ void dump_trace(struct task_struct *tsk, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { - const unsigned cpu = get_cpu(); + const unsigned cpu = raw_smp_processor_id(); unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; unsigned used = 0; struct thread_info *tinfo; @@ -344,7 +352,6 @@ void dump_trace(struct task_struct *tsk, * This handles the process stack: */ bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); - put_cpu(); } EXPORT_SYMBOL(dump_trace); @@ -383,9 +390,13 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, unsigned long bp) { + pause_on_oops_head(); printk("\nCall Trace:\n"); dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL); printk("\n"); + debug_show_held_locks(tsk); + pause_on_oops_tail(); + print_preempt_trace(tsk); } static void @@ -394,7 +405,7 @@ _show_stack(struct task_struct *tsk, str { unsigned long *stack; int i; - const int cpu = smp_processor_id(); + const int cpu = raw_smp_processor_id(); unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); @@ -513,7 +524,7 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static raw_spinlock_t die_lock = RAW_SPIN_LOCK_UNLOCKED(die_lock); static int die_owner = -1; static unsigned int die_nest_count; @@ -527,11 +538,11 @@ unsigned __kprobes long oops_begin(void) /* racy, but better than risking deadlock. 
*/ raw_local_irq_save(flags); cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { + if (!spin_trylock(&die_lock)) { if (cpu == die_owner) /* nested oops. should stop eventually */; else - __raw_spin_lock(&die_lock); + spin_lock(&die_lock); } die_nest_count++; die_owner = cpu; @@ -547,7 +558,7 @@ void __kprobes oops_end(unsigned long fl die_nest_count--; if (!die_nest_count) /* Nest count reaches zero, release the lock. */ - __raw_spin_unlock(&die_lock); + spin_unlock(&die_lock); raw_local_irq_restore(flags); if (!regs) { oops_exit(); @@ -562,6 +573,9 @@ void __kprobes oops_end(unsigned long fl int __kprobes __die(const char * str, struct pt_regs * regs, long err) { static int die_counter; + + ftrace_stop(); + printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); #ifdef CONFIG_PREEMPT printk("PREEMPT "); @@ -707,9 +721,9 @@ asmlinkage void do_stack_segment(struct if (notify_die(DIE_TRAP, "stack segment", regs, error_code, 12, SIGBUS) == NOTIFY_STOP) return; - preempt_conditional_sti(regs); + preempt_conditional_sti(regs, STACKFAULT_STACK); do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, STACKFAULT_STACK); } asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) @@ -825,6 +839,8 @@ asmlinkage __kprobes void default_do_nmi cpu = smp_processor_id(); + ftrace_event_irq(-1, user_mode(regs), regs->ip); + /* Only the BSP gets external NMIs from the system. */ if (!cpu) reason = get_nmi_reason(); @@ -863,9 +879,9 @@ asmlinkage void __kprobes do_int3(struct if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { return; } - preempt_conditional_sti(regs); + preempt_conditional_sti(regs, DEBUG_STACK); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, DEBUG_STACK); } /* Help handler running on IST stack to switch back to user stack @@ -911,7 +927,7 @@ asmlinkage void __kprobes do_debug(struc SIGTRAP) == NOTIFY_STOP) return; - preempt_conditional_sti(regs); + preempt_conditional_sti(regs, DEBUG_STACK); /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { @@ -943,13 +959,13 @@ asmlinkage void __kprobes do_debug(struc clear_dr7: set_debugreg(0UL, 7); - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, DEBUG_STACK); return; clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, DEBUG_STACK); } static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) Index: linux-2.6.25.8-rt7/arch/x86/mm/fault.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/mm/fault.c 2008-06-23 19:12:59.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/mm/fault.c 2008-06-23 19:13:58.000000000 -0400 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -363,6 +364,7 @@ static int is_f00f_bug(struct pt_regs *r nr = (address - idt_descr.address) >> 3; if (nr == 6) { + zap_rt_locks(); do_invalid_op(regs, 0); return 1; } @@ -597,6 +599,8 @@ void __kprobes do_page_fault(struct pt_r /* get the address */ address = read_cr2(); + ftrace_event_fault(regs->ip, error_code, address); + si_code = SEGV_MAPERR; if (notify_page_fault(regs)) @@ -646,7 +650,7 @@ void __kprobes do_page_fault(struct pt_r * If we're in an interrupt, have no user context or are running 
in an * atomic region then we must not take the fault. */ - if (in_atomic() || !mm) + if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) goto bad_area_nosemaphore; #else /* CONFIG_X86_64 */ if (likely(regs->flags & X86_EFLAGS_IF)) Index: linux-2.6.25.8-rt7/kernel/hrtimer.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/hrtimer.c 2008-06-23 19:12:59.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/hrtimer.c 2008-06-23 19:14:02.000000000 -0400 @@ -44,6 +44,8 @@ #include #include +#include + #include /** @@ -393,9 +395,9 @@ static inline int hrtimer_is_hres_enable /* * Is the high resolution mode active ? */ -static inline int hrtimer_hres_active(void) +static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { - return __get_cpu_var(hrtimer_bases).hres_active; + return cpu_base->hres_active; } /* @@ -483,11 +485,12 @@ static int hrtimer_reprogram(struct hrti */ static void retrigger_next_event(void *arg) { - struct hrtimer_cpu_base *base; + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + struct timespec realtime_offset; unsigned long seq; - if (!hrtimer_hres_active()) + if (!hrtimer_hres_active(base)) return; do { @@ -497,8 +500,6 @@ static void retrigger_next_event(void *a -wall_to_monotonic.tv_nsec); } while (read_seqretry(&xtime_lock, seq)); - base = &__get_cpu_var(hrtimer_bases); - /* Adjust CLOCK_REALTIME offset */ spin_lock(&base->lock); base->clock_base[CLOCK_REALTIME].offset = @@ -601,10 +602,8 @@ static inline int hrtimer_enqueue_reprog /* * Switch to high resolution mode */ -static int hrtimer_switch_to_hres(void) +static int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base) { - int cpu = smp_processor_id(); - struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; if (base->hres_active) @@ -615,7 +614,7 @@ static int hrtimer_switch_to_hres(void) if (tick_init_highres()) { local_irq_restore(flags); printk(KERN_WARNING "Could not switch to high resolution " - "mode on CPU %d\n", cpu); + "mode on CPU %d\n", raw_smp_processor_id()); return 0; } base->hres_active = 1; @@ -627,8 +626,6 @@ static int hrtimer_switch_to_hres(void) /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); local_irq_restore(flags); - printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n", - smp_processor_id()); return 1; } @@ -639,9 +636,15 @@ static inline void hrtimer_raise_softirq #else -static inline int hrtimer_hres_active(void) { return 0; } +static inline int hrtimer_hres_active(struct hrtimer_cpu_base *base) +{ + return 0; +} static inline int hrtimer_is_hres_enabled(void) { return 0; } -static inline int hrtimer_switch_to_hres(void) { return 0; } +static inline int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base) +{ + return 0; +} static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) @@ -721,6 +724,28 @@ u64 hrtimer_forward(struct hrtimer *time } EXPORT_SYMBOL_GPL(hrtimer_forward); +unsigned long +hrtimer_overrun(struct hrtimer *timer, ktime_t now, ktime_t interval) +{ + unsigned long orun = 1; + ktime_t delta; + + delta = ktime_sub(now, timer->expires); + + if (delta.tv64 < 0) + return 0; + + if (interval.tv64 < timer->base->resolution.tv64) + interval.tv64 = timer->base->resolution.tv64; + + if (unlikely(delta.tv64 >= interval.tv64)) + orun = ktime_divns(delta, ktime_to_ns(interval)) + 1; + + return orun; +} 
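/*
 * Illustrative sketch, not part of the patch: how a periodic callback might
 * use the hrtimer_overrun() helper added above.  The callback name and the
 * 1 ms period are assumptions made for this example.  For an expiry lagging
 * 'now' by 3.5 periods, delta >= interval and so orun = 3 + 1 = 4, i.e.
 * three whole periods were skipped.
 */
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static enum hrtimer_restart sample_tick(struct hrtimer *timer)
{
	ktime_t period = ktime_set(0, NSEC_PER_MSEC);	/* assumed 1 ms period */
	ktime_t now = timer->base->get_time();
	unsigned long elapsed = hrtimer_overrun(timer, now, period);

	if (elapsed > 1) {
		/* (elapsed - 1) periods were missed; account for them here */
	}

	hrtimer_forward(timer, now, period);	/* push expiry past 'now' */
	return HRTIMER_RESTART;
}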
+EXPORT_SYMBOL_GPL(hrtimer_overrun); + + /* * enqueue_hrtimer - internal function to (re)start a timer * @@ -735,6 +760,7 @@ static void enqueue_hrtimer(struct hrtim struct hrtimer *entry; int leftmost = 1; + ftrace_event_timer_set(&timer->expires, timer); /* * Find the right place in the rbtree: */ @@ -806,7 +832,7 @@ static void __remove_hrtimer(struct hrti if (base->first == &timer->node) { base->first = rb_next(&timer->node); /* Reprogram the clock event device. if enabled */ - if (reprogram && hrtimer_hres_active()) + if (reprogram && hrtimer_hres_active(base->cpu_base)) hrtimer_force_reprogram(base->cpu_base); } rb_erase(&timer->node, &base->active); @@ -948,7 +974,7 @@ int hrtimer_cancel(struct hrtimer *timer if (ret >= 0) return ret; - cpu_relax(); + hrtimer_wait_for_timer(timer); } } EXPORT_SYMBOL_GPL(hrtimer_cancel); @@ -988,7 +1014,7 @@ ktime_t hrtimer_get_next_event(void) spin_lock_irqsave(&cpu_base->lock, flags); - if (!hrtimer_hres_active()) { + if (!hrtimer_hres_active(cpu_base)) { for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; @@ -1060,6 +1086,32 @@ int hrtimer_get_res(const clockid_t whic } EXPORT_SYMBOL_GPL(hrtimer_get_res); +#ifdef CONFIG_PREEMPT_SOFTIRQS +# define wake_up_timer_waiters(b) wake_up(&(b)->wait) + +/** + * hrtimer_wait_for_timer - Wait for a running timer + * + * @timer: timer to wait for + * + * The function waits in case the timers callback function is + * currently executed on the waitqueue of the timer base. The + * waitqueue is woken up after the timer callback function has + * finished execution. + */ +void hrtimer_wait_for_timer(const struct hrtimer *timer) +{ + struct hrtimer_clock_base *base = timer->base; + + if (base && base->cpu_base) + wait_event(base->cpu_base->wait, + !(timer->state & HRTIMER_STATE_CALLBACK)); +} + +#else +# define wake_up_timer_waiters(b) do { } while (0) +#endif + static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) { spin_lock_irq(&cpu_base->lock); @@ -1111,6 +1163,8 @@ static void run_hrtimer_pending(struct h } } spin_unlock_irq(&cpu_base->lock); + + wake_up_timer_waiters(cpu_base); } static void __run_hrtimer(struct hrtimer *timer) @@ -1169,6 +1223,7 @@ void hrtimer_interrupt(struct clock_even retry: now = ktime_get(); + ftrace_event_timestamp(&now); expires_next.tv64 = KTIME_MAX; @@ -1197,6 +1252,8 @@ void hrtimer_interrupt(struct clock_even break; } + ftrace_event_timer_triggered(&timer->expires, timer); + /* Move softirq callbacks to the pending list */ if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { __remove_hrtimer(timer, base, @@ -1228,7 +1285,10 @@ void hrtimer_interrupt(struct clock_even static void run_hrtimer_softirq(struct softirq_action *h) { - run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); + struct hrtimer_cpu_base *cpu_base; + + cpu_base = &per_cpu(hrtimer_bases, raw_smp_processor_id()); + run_hrtimer_pending(cpu_base); } #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -1244,7 +1304,7 @@ void hrtimer_run_pending(void) { struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - if (hrtimer_hres_active()) + if (hrtimer_hres_active(cpu_base)) return; /* @@ -1256,7 +1316,7 @@ void hrtimer_run_pending(void) * deadlock vs. xtime_lock. 
*/ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) - hrtimer_switch_to_hres(); + hrtimer_switch_to_hres(cpu_base); run_hrtimer_pending(cpu_base); } @@ -1299,10 +1359,11 @@ static inline void run_hrtimer_queue(str void hrtimer_run_queues(void) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base; int i; - if (hrtimer_hres_active()) + cpu_base = &per_cpu(hrtimer_bases, raw_smp_processor_id()); + if (hrtimer_hres_active(cpu_base)) return; hrtimer_get_softirq_time(cpu_base); @@ -1458,6 +1519,9 @@ static void __cpuinit init_hrtimers_cpu( INIT_LIST_HEAD(&cpu_base->cb_pending); hrtimer_init_hres(cpu_base); +#ifdef CONFIG_PREEMPT_SOFTIRQS + init_waitqueue_head(&cpu_base->wait); +#endif } #ifdef CONFIG_HOTPLUG_CPU @@ -1492,7 +1556,7 @@ static void migrate_hrtimers(int cpu) tick_cancel_sched_timer(cpu); local_irq_disable(); - double_spin_lock(&new_base->lock, &old_base->lock, + raw_double_spin_lock(&new_base->lock, &old_base->lock, smp_processor_id() < cpu); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { @@ -1500,7 +1564,7 @@ static void migrate_hrtimers(int cpu) &new_base->clock_base[i]); } - double_spin_unlock(&new_base->lock, &old_base->lock, + raw_double_spin_unlock(&new_base->lock, &old_base->lock, smp_processor_id() < cpu); local_irq_enable(); put_cpu_var(hrtimer_bases); Index: linux-2.6.25.8-rt7/kernel/trace/trace_events.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/trace/trace_events.c 2008-06-23 19:14:35.000000000 -0400 @@ -0,0 +1,702 @@ +/* + * trace task events + * + * Copyright (C) 2007 Steven Rostedt + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array __read_mostly *events_trace; +static int __read_mostly tracer_enabled; +static atomic_t event_ref; + +static void event_reset(struct trace_array *tr) +{ + struct trace_array_cpu *data; + int cpu; + + for_each_possible_cpu(cpu) { + data = tr->data[cpu]; + tracing_reset(data); + } + + tr->time_start = ftrace_now(raw_smp_processor_id()); +} + +/* HACK */ +void notrace +sys_call(unsigned long nr, unsigned long p1, unsigned long p2, unsigned long p3) +{ + struct trace_array *tr; + struct trace_array_cpu *data; + unsigned long flags; + unsigned long ip; + int cpu; + + if (!tracer_enabled) + return; + + tr = events_trace; + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + atomic_inc(&data->disabled); + if (atomic_read(&data->disabled) != 1) + goto out; + + ip = CALLER_ADDR0; + + tracing_event_syscall(tr, data, flags, ip, nr, p1, p2, p3); + + out: + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86) +void notrace +sys_ia32_call(unsigned long nr, unsigned long p1, unsigned long p2, + unsigned long p3) +{ + struct trace_array *tr; + struct trace_array_cpu *data; + unsigned long flags; + unsigned long ip; + int cpu; + + if (!tracer_enabled) + return; + + tr = events_trace; + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + atomic_inc(&data->disabled); + if (atomic_read(&data->disabled) != 1) + goto out; + + ip = CALLER_ADDR0; + tracing_event_syscall(tr, data, flags, ip, nr | 0x80000000, p1, p2, p3); + + out: + atomic_dec(&data->disabled); + 
local_irq_restore(flags); +} +#endif + +void notrace +sys_ret(unsigned long ret) +{ + struct trace_array *tr; + struct trace_array_cpu *data; + unsigned long flags; + unsigned long ip; + int cpu; + + if (!tracer_enabled) + return; + + tr = events_trace; + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + atomic_inc(&data->disabled); + if (atomic_read(&data->disabled) != 1) + goto out; + + ip = CALLER_ADDR0; + tracing_event_sysret(tr, data, flags, ip, ret); + + out: + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +#define getarg(arg, ap) arg = va_arg(ap, typeof(arg)); + +static void +event_irq_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long ip, flags; + int irq, user, cpu; + long disable; + + if (!tracer_enabled) + return; + + getarg(irq, *args); + getarg(user, *args); + getarg(ip, *args); + + /* interrupts should be off, we are in an interrupt */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (disable != 1) + goto out; + + local_save_flags(flags); + tracing_event_irq(tr, data, flags, CALLER_ADDR1, irq, user, ip); + + out: + atomic_dec(&data->disabled); +} + +static void +event_fault_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long ip, flags, error, addr; + long disable; + int cpu; + + if (!tracer_enabled) + return; + + getarg(ip, *args); + getarg(error, *args); + getarg(addr, *args); + + preempt_disable_notrace(); + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (disable != 1) + goto out; + + local_save_flags(flags); + tracing_event_fault(tr, data, flags, CALLER_ADDR1, ip, error, addr); + + out: + atomic_dec(&data->disabled); + preempt_enable_notrace(); +} + +static void +event_timer_set_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long flags; + ktime_t *expires; + void *timer; + long disable; + int cpu; + + if (!tracer_enabled) + return; + + getarg(expires, *args); + getarg(timer, *args); + + /* interrupts should be off, we are in an interrupt */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (disable != 1) + goto out; + + local_save_flags(flags); + tracing_event_timer_set(tr, data, flags, CALLER_ADDR1, expires, timer); + + out: + atomic_dec(&data->disabled); +} + +static void +event_timer_triggered_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long flags; + ktime_t *expired; + void *timer; + long disable; + int cpu; + + if (!tracer_enabled) + return; + + getarg(expired, *args); + getarg(timer, *args); + + /* interrupts should be off, we are in an interrupt */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (disable != 1) + goto out; + + local_save_flags(flags); + tracing_event_timer_triggered(tr, data, flags, CALLER_ADDR1, expired, timer); + + out: + atomic_dec(&data->disabled); +} + +static void +event_hrtimer_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = 
probe_data; + struct trace_array_cpu *data; + unsigned long flags; + ktime_t *now; + long disable; + int cpu; + + if (!tracer_enabled) + return; + + getarg(now, *args); + + /* interrupts should be off, we are in an interrupt */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (disable != 1) + goto out; + + local_save_flags(flags); + tracing_event_timestamp(tr, data, flags, CALLER_ADDR1, now); + + out: + atomic_dec(&data->disabled); +} + +static void +event_program_event_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long flags; + ktime_t *expires; + int64_t *delta; + long disable; + int cpu; + + if (!tracer_enabled) + return; + + getarg(expires, *args); + getarg(delta, *args); + + /* interrupts should be off, we are in an interrupt */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (disable != 1) + goto out; + + local_save_flags(flags); + tracing_event_program_event(tr, data, flags, CALLER_ADDR1, expires, delta); + + out: + atomic_dec(&data->disabled); +} + + +static void +event_task_activate_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long flags; + struct task_struct *p; + long disable; + int cpu, rqcpu; + + if (!tracer_enabled) + return; + + getarg(p, *args); + getarg(rqcpu, *args); + + /* interrupts should be off, we are in an interrupt */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (disable != 1) + goto out; + + local_save_flags(flags); + tracing_event_task_activate(tr, data, flags, CALLER_ADDR1, p, rqcpu); + + out: + atomic_dec(&data->disabled); +} + +static void +event_task_deactivate_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long flags; + struct task_struct *p; + long disable; + int cpu, rqcpu; + + if (!tracer_enabled) + return; + + getarg(p, *args); + getarg(rqcpu, *args); + + /* interrupts should be off, we are in an interrupt */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (disable != 1) + goto out; + + local_save_flags(flags); + tracing_event_task_deactivate(tr, data, flags, CALLER_ADDR1, p, rqcpu); + + out: + atomic_dec(&data->disabled); +} + +static void +event_wakeup_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long flags; + struct task_struct *wakee, *curr; + long disable, ignore2; + void *ignore3; + int ignore1; + int cpu; + + if (!tracer_enabled) + return; + + getarg(ignore1, *args); + getarg(ignore2, *args); + getarg(ignore3, *args); + + getarg(wakee, *args); + getarg(curr, *args); + + /* interrupts should be disabled */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (unlikely(disable != 1)) + goto out; + + local_save_flags(flags); + /* record process's command line */ + tracing_record_cmdline(wakee); + tracing_record_cmdline(curr); + + tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); + + out: + atomic_dec(&data->disabled); +} +static void +event_ctx_callback(void *probe_data, void 
*call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long flags; + struct task_struct *prev; + struct task_struct *next; + long disable, ignore2; + void *ignore3; + int ignore1; + int cpu; + + if (!tracer_enabled) + return; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + getarg(ignore1, *args); + getarg(ignore1, *args); + getarg(ignore2, *args); + getarg(ignore3, *args); + + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + tracing_record_cmdline(prev); + tracing_record_cmdline(next); + + /* interrupts should be disabled */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + disable = atomic_inc_return(&data->disabled); + + if (likely(disable != 1)) + goto out; + + local_save_flags(flags); + tracing_sched_switch_trace(tr, data, prev, next, flags); + out: + atomic_dec(&data->disabled); +} + +static int event_register_marker(const char *name, const char *format, + marker_probe_func *probe, void *data) +{ + int ret; + + ret = marker_probe_register(name, format, probe, data); + if (ret) { + pr_info("event trace: Couldn't add marker" + " probe to %s\n", name); + return ret; + } + + return 0; +} + +static void event_tracer_register(struct trace_array *tr) +{ + int ret; + + ret = event_register_marker("ftrace_event_irq", "%d %d %ld", + event_irq_callback, tr); + if (ret) + return; + + ret = event_register_marker("ftrace_event_fault", "%ld %ld %ld", + event_fault_callback, tr); + if (ret) + goto out1; + + ret = event_register_marker("ftrace_event_timer_set", "%p %p", + event_timer_set_callback, tr); + if (ret) + goto out2; + + ret = event_register_marker("ftrace_event_timer_triggered", "%p %p", + event_timer_triggered_callback, tr); + if (ret) + goto out3; + + ret = event_register_marker("ftrace_event_hrtimer", "%p", + event_hrtimer_callback, tr); + if (ret) + goto out4; + + ret = event_register_marker("ftrace_event_task_activate", "%p %d", + event_task_activate_callback, tr); + if (ret) + goto out5; + + ret = event_register_marker("ftrace_event_task_deactivate", "%p %d", + event_task_deactivate_callback, tr); + if (ret) + goto out6; + + ret = event_register_marker("kernel_sched_wakeup", + "pid %d state %ld ## rq %p task %p rq->curr %p", + event_wakeup_callback, tr); + if (ret) + goto out7; + + ret = event_register_marker("kernel_sched_wakeup_new", + "pid %d state %ld ## rq %p task %p rq->curr %p", + event_wakeup_callback, tr); + if (ret) + goto out8; + + ret = event_register_marker("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + event_ctx_callback, tr); + if (ret) + goto out9; + + ret = event_register_marker("ftrace_event_timer", "%p %p", + event_program_event_callback, tr); + if (ret) + goto out10; + + return; + + out10: + marker_probe_unregister("kernel_sched_schedule", + event_ctx_callback, tr); + out9: + marker_probe_unregister("kernel_sched_wakeup_new", + event_wakeup_callback, tr); + out8: + marker_probe_unregister("kernel_sched_wakeup", + event_wakeup_callback, tr); + out7: + marker_probe_unregister("ftrace_event_task_deactivate", + event_task_deactivate_callback, tr); + out6: + marker_probe_unregister("ftrace_event_task_activate", + event_task_activate_callback, tr); + out5: + marker_probe_unregister("ftrace_event_hrtimer", + event_hrtimer_callback, tr); + out4: + marker_probe_unregister("ftrace_event_timer_triggered", + event_timer_triggered_callback, tr); + out3: + marker_probe_unregister("ftrace_event_timer_set", + 
event_timer_set_callback, tr); + out2: + marker_probe_unregister("ftrace_event_fault", + event_fault_callback, tr); + out1: + marker_probe_unregister("ftrace_event_irq", + event_irq_callback, tr); +} + +static void event_tracer_unregister(struct trace_array *tr) +{ + marker_probe_unregister("ftrace_event_timer", + event_program_event_callback, tr); + marker_probe_unregister("kernel_sched_schedule", + event_ctx_callback, tr); + marker_probe_unregister("kernel_sched_wakeup_new", + event_wakeup_callback, tr); + marker_probe_unregister("kernel_sched_wakeup", + event_wakeup_callback, tr); + marker_probe_unregister("ftrace_event_task_deactivate", + event_task_deactivate_callback, tr); + marker_probe_unregister("ftrace_event_task_activate", + event_task_activate_callback, tr); + marker_probe_unregister("ftrace_event_hrtimer", + event_hrtimer_callback, tr); + marker_probe_unregister("ftrace_event_timer_triggered", + event_timer_triggered_callback, tr); + marker_probe_unregister("ftrace_event_timer_set", + event_timer_set_callback, tr); + marker_probe_unregister("ftrace_event_fault", + event_fault_callback, tr); + marker_probe_unregister("ftrace_event_irq", + event_irq_callback, tr); +} + +void trace_event_register(struct trace_array *tr) +{ + long ref; + + ref = atomic_inc_return(&event_ref); + if (ref == 1) + event_tracer_register(tr); +} + +void trace_event_unregister(struct trace_array *tr) +{ + long ref; + + ref = atomic_dec_and_test(&event_ref); + if (ref) + event_tracer_unregister(tr); +} + +static void start_event_trace(struct trace_array *tr) +{ + event_reset(tr); + trace_event_register(tr); + tracing_start_function_trace(); + tracer_enabled = 1; +} + +static void stop_event_trace(struct trace_array *tr) +{ + tracer_enabled = 0; + tracing_stop_function_trace(); + trace_event_unregister(tr); +} + +static void event_trace_init(struct trace_array *tr) +{ + events_trace = tr; + + if (tr->ctrl) + start_event_trace(tr); +} + +static void event_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_event_trace(tr); +} + +static void event_trace_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_event_trace(tr); + else + stop_event_trace(tr); +} + +static void event_trace_open(struct trace_iterator *iter) +{ + /* stop the trace while dumping */ + if (iter->tr->ctrl) + tracer_enabled = 0; +} + +static void event_trace_close(struct trace_iterator *iter) +{ + if (iter->tr->ctrl) + tracer_enabled = 1; +} + +static struct tracer event_trace __read_mostly = +{ + .name = "events", + .init = event_trace_init, + .reset = event_trace_reset, + .open = event_trace_open, + .close = event_trace_close, + .ctrl_update = event_trace_ctrl_update, +}; + +__init static int init_event_trace(void) +{ + int ret; + + ret = register_tracer(&event_trace); + if (ret) + return ret; + + return 0; +} + +device_initcall(init_event_trace); Index: linux-2.6.25.8-rt7/kernel/trace/trace_hist.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/trace/trace_hist.c 2008-06-23 19:14:33.000000000 -0400 @@ -0,0 +1,637 @@ +/* + * kernel/trace/trace_hist.c + * + * Add support for histograms of preemption-off latency and + * interrupt-off latency and wakeup latency, it depends on + * Real-Time Preemption Support. + * + * Copyright (C) 2005 MontaVista Software, Inc. + * Yi Yang + * + * Converted to work with the new latency tracer. + * Copyright (C) 2008 Red Hat, Inc. 
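[Editorial sketch, not part of the patch] The trace_event_register()/trace_event_unregister() pair above gates marker registration on the event_ref atomic so that only the first user registers the probes and only the last one tears them down. A minimal user-space sketch of that reference-counted gating, using C11 atomics in place of the kernel's atomic_t; register_probes()/unregister_probes() are stand-ins for event_tracer_register()/event_tracer_unregister():

#include <stdatomic.h>
#include <stdio.h>

static atomic_long event_ref;            /* plays the role of the kernel's event_ref */

static void register_probes(void)   { puts("probes registered"); }
static void unregister_probes(void) { puts("probes unregistered"); }

static void trace_event_get(void)
{
        /* same effect as atomic_inc_return() == 1: only the first user registers */
        if (atomic_fetch_add(&event_ref, 1) + 1 == 1)
                register_probes();
}

static void trace_event_put(void)
{
        /* same effect as atomic_dec_and_test(): only the last user unregisters */
        if (atomic_fetch_sub(&event_ref, 1) - 1 == 0)
                unregister_probes();
}

int main(void)
{
        trace_event_get();   /* registers */
        trace_event_get();   /* no-op */
        trace_event_put();   /* no-op */
        trace_event_put();   /* unregisters */
        return 0;
}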
+ * Steven Rostedt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" +#include "trace_hist.h" + +enum { + INTERRUPT_LATENCY = 0, + PREEMPT_LATENCY, + PREEMPT_INTERRUPT_LATENCY, + WAKEUP_LATENCY, +}; + +#define MAX_ENTRY_NUM 10240 + +struct hist_data { + atomic_t hist_mode; /* 0 log, 1 don't log */ + unsigned long min_lat; + unsigned long max_lat; + unsigned long long beyond_hist_bound_samples; + unsigned long long accumulate_lat; + unsigned long long total_samples; + unsigned long long hist_array[MAX_ENTRY_NUM]; +}; + +static char *latency_hist_dir_root = "latency_hist"; + +#ifdef CONFIG_INTERRUPT_OFF_HIST +static DEFINE_PER_CPU(struct hist_data, interrupt_off_hist); +static char *interrupt_off_hist_dir = "interrupt_off_latency"; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST +static DEFINE_PER_CPU(struct hist_data, preempt_off_hist); +static char *preempt_off_hist_dir = "preempt_off_latency"; +#endif + +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST) +static DEFINE_PER_CPU(struct hist_data, preempt_irqs_off_hist); +static char *preempt_irqs_off_hist_dir = "preempt_interrupts_off_latency"; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist); +static char *wakeup_latency_hist_dir = "wakeup_latency"; +#endif + +void notrace latency_hist(int latency_type, int cpu, unsigned long latency) +{ + struct hist_data *my_hist; + + if ((cpu < 0) || (cpu >= NR_CPUS) || (latency_type < INTERRUPT_LATENCY) + || (latency_type > WAKEUP_LATENCY) || (latency < 0)) + return; + + switch (latency_type) { +#ifdef CONFIG_INTERRUPT_OFF_HIST + case INTERRUPT_LATENCY: + my_hist = &per_cpu(interrupt_off_hist, cpu); + break; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPT_LATENCY: + my_hist = &per_cpu(preempt_off_hist, cpu); + break; +#endif + +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST) + case PREEMPT_INTERRUPT_LATENCY: + my_hist = &per_cpu(preempt_irqs_off_hist, cpu); + break; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + my_hist = &per_cpu(wakeup_latency_hist, cpu); + break; +#endif + default: + return; + } + + if (atomic_read(&my_hist->hist_mode) == 0) + return; + + if (latency >= MAX_ENTRY_NUM) + my_hist->beyond_hist_bound_samples++; + else + my_hist->hist_array[latency]++; + + if (latency < my_hist->min_lat) + my_hist->min_lat = latency; + else if (latency > my_hist->max_lat) + my_hist->max_lat = latency; + + my_hist->total_samples++; + my_hist->accumulate_lat += latency; + return; +} + +static void *l_start(struct seq_file *m, loff_t *pos) +{ + loff_t *index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL); + loff_t index = *pos; + struct hist_data *my_hist = m->private; + + if (!index_ptr) + return NULL; + + if (index == 0) { + char avgstr[32]; + + atomic_dec(&my_hist->hist_mode); + if (likely(my_hist->total_samples)) { + unsigned long avg = (unsigned long) + div64_64(my_hist->accumulate_lat, + my_hist->total_samples); + sprintf(avgstr, "%lu", avg); + } else + strcpy(avgstr, ""); + + seq_printf(m, "#Minimum latency: %lu microseconds.\n" + "#Average latency: %s microseconds.\n" + "#Maximum latency: %lu microseconds.\n" + "#Total samples: %llu\n" + "#There are %llu samples greater or equal" + " than %d microseconds\n" + "#usecs\t%16s\n" + , my_hist->min_lat + , avgstr + , my_hist->max_lat + , my_hist->total_samples + , my_hist->beyond_hist_bound_samples + , MAX_ENTRY_NUM, "samples"); + } + if (index >= MAX_ENTRY_NUM) + 
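[Editorial sketch, not part of the patch] latency_hist() above uses the measured latency, in microseconds, directly as an index into hist_array[], counts anything at or above MAX_ENTRY_NUM in a separate overflow bucket, and keeps min/max/sum for the summary that l_start() prints. A compilable user-space sketch of that bucketing; it drops the per-CPU and hist_mode details, and uses independent min/max updates where the patch uses an else-if:

#include <stdio.h>

#define MAX_ENTRY_NUM 10240

struct hist_data {
        unsigned long min_lat, max_lat;
        unsigned long long beyond_hist_bound_samples;
        unsigned long long accumulate_lat, total_samples;
        unsigned long long hist_array[MAX_ENTRY_NUM];
};

static void hist_record(struct hist_data *h, unsigned long latency_us)
{
        if (latency_us >= MAX_ENTRY_NUM)
                h->beyond_hist_bound_samples++;   /* overflow bucket */
        else
                h->hist_array[latency_us]++;      /* one bucket per microsecond */

        if (latency_us < h->min_lat)
                h->min_lat = latency_us;
        if (latency_us > h->max_lat)
                h->max_lat = latency_us;

        h->total_samples++;
        h->accumulate_lat += latency_us;
}

int main(void)
{
        static struct hist_data h = { .min_lat = ~0UL };  /* static: the array is 80KB */
        unsigned long samples[] = { 3, 7, 3, 120, 20000 };

        for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                hist_record(&h, samples[i]);

        printf("min %lu max %lu avg %llu beyond-bound %llu\n",
               h.min_lat, h.max_lat,
               h.accumulate_lat / h.total_samples,
               h.beyond_hist_bound_samples);
        return 0;
}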
return NULL; + + *index_ptr = index; + return index_ptr; +} + +static void *l_next(struct seq_file *m, void *p, loff_t *pos) +{ + loff_t *index_ptr = p; + struct hist_data *my_hist = m->private; + + if (++*pos >= MAX_ENTRY_NUM) { + atomic_inc(&my_hist->hist_mode); + return NULL; + } + *index_ptr = *pos; + return index_ptr; +} + +static void l_stop(struct seq_file *m, void *p) +{ + kfree(p); +} + +static int l_show(struct seq_file *m, void *p) +{ + int index = *(loff_t *) p; + struct hist_data *my_hist = m->private; + + seq_printf(m, "%5d\t%16llu\n", index, my_hist->hist_array[index]); + return 0; +} + +static struct seq_operations latency_hist_seq_op = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show +}; + +static int latency_hist_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &latency_hist_seq_op); + if (!ret) { + struct seq_file *seq = file->private_data; + seq->private = inode->i_private; + } + return ret; +} + +static struct file_operations latency_hist_fops = { + .open = latency_hist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void hist_reset(struct hist_data *hist) +{ + atomic_dec(&hist->hist_mode); + + memset(hist->hist_array, 0, sizeof(hist->hist_array)); + hist->beyond_hist_bound_samples = 0ULL; + hist->min_lat = 0xFFFFFFFFUL; + hist->max_lat = 0UL; + hist->total_samples = 0ULL; + hist->accumulate_lat = 0ULL; + + atomic_inc(&hist->hist_mode); +} + +ssize_t latency_hist_reset(struct file *file, const char __user *a, + size_t size, loff_t *off) +{ + int cpu; + struct hist_data *hist; + int latency_type = (long)file->private_data; + + switch (latency_type) { + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + for_each_online_cpu(cpu) { + hist = &per_cpu(wakeup_latency_hist, cpu); + hist_reset(hist); + } + break; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPT_LATENCY: + for_each_online_cpu(cpu) { + hist = &per_cpu(preempt_off_hist, cpu); + hist_reset(hist); + } + break; +#endif + +#ifdef CONFIG_INTERRUPT_OFF_HIST + case INTERRUPT_LATENCY: + for_each_online_cpu(cpu) { + hist = &per_cpu(interrupt_off_hist, cpu); + hist_reset(hist); + } + break; +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + case PREEMPT_INTERRUPT_LATENCY: + for_each_online_cpu(cpu) { + hist = &per_cpu(preempt_irqs_off_hist, cpu); + hist_reset(hist); + } + break; +#endif + } + + return size; +} + +static struct file_operations latency_hist_reset_fops = { + .open = tracing_open_generic, + .write = latency_hist_reset, +}; + +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) +#ifdef CONFIG_INTERRUPT_OFF_HIST +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start); +static DEFINE_PER_CPU(int, hist_irqsoff_tracing); +#endif +#ifdef CONFIG_PREEMPT_OFF_HIST +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start); +static DEFINE_PER_CPU(int, hist_preemptoff_tracing); +#endif +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start); +static DEFINE_PER_CPU(int, hist_preemptirqsoff_tracing); +#endif + +notrace void tracing_hist_preempt_start(void) +{ + cycle_t uninitialized_var(start); + int start_set = 0; + int cpu; + + if (!preempt_count() && !irqs_disabled()) + return; + + /* cpu is only used if we are in atomic */ + cpu = raw_smp_processor_id(); + +#ifdef CONFIG_INTERRUPT_OFF_HIST + if (irqs_disabled() && + !per_cpu(hist_irqsoff_tracing, cpu)) { + 
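[Editorial sketch, not part of the patch] hist_reset() and the seq_file start/next handlers above pause logging by dropping the hist_mode atomic to zero for the duration of a read or reset and raising it back to one afterwards. A small user-space sketch of that pause/resume gate, assuming only one reader or resetter manipulates the gate at a time:

#include <stdatomic.h>
#include <string.h>
#include <stdio.h>

static atomic_int hist_mode = 1;               /* 1: log samples, 0: paused */
static unsigned long long hist_array[64];

static void hist_record(unsigned int idx)
{
        if (atomic_load(&hist_mode) == 0)
                return;                        /* reader/reset in progress: drop sample */
        if (idx < 64)
                hist_array[idx]++;
}

static void hist_reset(void)
{
        atomic_fetch_sub(&hist_mode, 1);       /* pause logging */
        memset(hist_array, 0, sizeof(hist_array));
        atomic_fetch_add(&hist_mode, 1);       /* resume logging */
}

int main(void)
{
        hist_record(3);
        hist_reset();
        hist_record(3);
        printf("bucket 3 holds %llu sample(s)\n", hist_array[3]);
        return 0;
}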
per_cpu(hist_irqsoff_tracing, cpu) = 1; + start_set++; + start = ftrace_now(cpu); + per_cpu(hist_irqsoff_start, cpu) = start; + } +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + if (preempt_count() && + !per_cpu(hist_preemptoff_tracing, cpu)) { + per_cpu(hist_preemptoff_tracing, cpu) = 1; + if (1 || !(start_set++)) + start = ftrace_now(cpu); + per_cpu(hist_preemptoff_start, cpu) = start; + + } +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + if (!per_cpu(hist_preemptirqsoff_tracing, cpu)) { + per_cpu(hist_preemptirqsoff_tracing, cpu) = 1; + if (1 || !(start_set)) + start = ftrace_now(cpu); + per_cpu(hist_preemptirqsoff_start, cpu) = start; + } +#endif +} + +notrace void tracing_hist_preempt_stop(int irqs_on) +{ + long latency; + cycle_t start; + cycle_t uninitialized_var(stop); + int stop_set = 0; + int cpu; + + /* irqs_on == TRACE_STOP if we must stop tracing. */ + + /* cpu is only used if we are in atomic */ + cpu = raw_smp_processor_id(); + +#ifdef CONFIG_INTERRUPT_OFF_HIST + if (irqs_on && + per_cpu(hist_irqsoff_tracing, cpu)) { + stop = ftrace_now(cpu); + stop_set++; + start = per_cpu(hist_irqsoff_start, cpu); + latency = (long)nsecs_to_usecs(stop - start); + if (latency > 1000000) { + printk("%d: latency = %ld (%lu)\n", __LINE__, latency, latency); + printk("%d: start=%Ld stop=%Ld\n", __LINE__, start, stop); + } + barrier(); + per_cpu(hist_irqsoff_tracing, cpu) = 0; + latency_hist(INTERRUPT_LATENCY, cpu, latency); + } +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + if ((!irqs_on || irqs_on == TRACE_STOP) && + per_cpu(hist_preemptoff_tracing, cpu)) { + WARN_ON(!preempt_count()); + if (1 || !(stop_set++)) + stop = ftrace_now(cpu); + start = per_cpu(hist_preemptoff_start, cpu); + latency = (long)nsecs_to_usecs(stop - start); + if (latency > 1000000) { + printk("%d: latency = %ld (%lu)\n", __LINE__, latency, latency); + printk("%d: start=%Ld stop=%Ld\n", __LINE__, start, stop); + } + barrier(); + per_cpu(hist_preemptoff_tracing, cpu) = 0; + latency_hist(PREEMPT_LATENCY, cpu, latency); + } +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + if (((!irqs_on && !irqs_disabled()) || + (irqs_on && !preempt_count()) || + (irqs_on == TRACE_STOP)) && + per_cpu(hist_preemptirqsoff_tracing, cpu)) { + WARN_ON(!preempt_count() && !irqs_disabled()); + if (1 || !stop_set) + stop = ftrace_now(cpu); + start = per_cpu(hist_preemptirqsoff_start, cpu); + latency = (long)nsecs_to_usecs(stop - start); + if (latency > 1000000) { + printk("%d: latency = %ld (%lu)\n", __LINE__, latency, latency); + printk("%d: start=%Ld stop=%Ld\n", __LINE__, start, stop); + } + barrier(); + per_cpu(hist_preemptirqsoff_tracing, cpu) = 0; + latency_hist(PREEMPT_INTERRUPT_LATENCY, cpu, latency); + } +#endif +} +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +int tracing_wakeup_hist __read_mostly = 1; + +static unsigned wakeup_prio = (unsigned)-1 ; +static struct task_struct *wakeup_task; +static cycle_t wakeup_start; +static DEFINE_RAW_SPINLOCK(wakeup_lock); + +notrace void tracing_hist_wakeup_start(struct task_struct *p, + struct task_struct *curr) +{ + unsigned long flags; + + if (likely(!rt_task(p)) || + p->prio >= wakeup_prio || + p->prio >= curr->prio) + return; + + spin_lock_irqsave(&wakeup_lock, flags); + if (wakeup_task) + put_task_struct(wakeup_task); + + get_task_struct(p); + wakeup_task = p; + wakeup_prio = p->prio; + wakeup_start = ftrace_now(raw_smp_processor_id()); + spin_unlock_irqrestore(&wakeup_lock, flags); +} + +notrace void 
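[Editorial sketch, not part of the patch] tracing_hist_preempt_start()/tracing_hist_preempt_stop() above bracket an irqs-off or preempt-off region with two timestamps and feed the difference, converted to microseconds, into latency_hist(). A user-space sketch of the same start/stop bookkeeping, with clock_gettime() standing in for ftrace_now() and hist_record() for latency_hist(); the per-CPU flags and the combined preempt+irqs-off case are left out:

#include <stdio.h>
#include <time.h>

static long long section_start_ns;
static int section_tracing;

static long long now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static void hist_record(long latency_us)
{
        printf("recorded %ld us\n", latency_us);
}

static void section_enter(void)        /* analogue of tracing_hist_preempt_start() */
{
        if (!section_tracing) {
                section_tracing = 1;
                section_start_ns = now_ns();
        }
}

static void section_exit(void)         /* analogue of tracing_hist_preempt_stop() */
{
        if (section_tracing) {
                long latency_us = (long)((now_ns() - section_start_ns) / 1000);

                section_tracing = 0;
                hist_record(latency_us);
        }
}

int main(void)
{
        section_enter();
        for (volatile int i = 0; i < 1000000; i++)
                ;                      /* stand-in for the critical section */
        section_exit();
        return 0;
}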
tracing_hist_wakeup_stop(struct task_struct *next) +{ + unsigned long flags; + long latency; + cycle_t stop; + + if (next != wakeup_task) + return; + + stop = ftrace_now(raw_smp_processor_id()); + + spin_lock_irqsave(&wakeup_lock, flags); + if (wakeup_task != next) + goto out; + + latency = (long)nsecs_to_usecs(stop - wakeup_start); + + latency_hist(WAKEUP_LATENCY, smp_processor_id(), latency); + + put_task_struct(wakeup_task); + wakeup_task = NULL; + wakeup_prio = (unsigned)-1; + out: + spin_unlock_irqrestore(&wakeup_lock, flags); + +} + +static void +sched_switch_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *prev; + struct task_struct *next; + struct rq *__rq; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, int); + (void)va_arg(*args, long); + __rq = va_arg(*args, typeof(__rq)); + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + tracing_hist_wakeup_stop(next); +} + +static void +wake_up_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *curr; + struct task_struct *task; + struct rq *__rq; + + /* Skip pid %d state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, long); + /* now get the meat: "rq %p task %p rq->curr %p" */ + __rq = va_arg(*args, typeof(__rq)); + task = va_arg(*args, typeof(task)); + curr = va_arg(*args, typeof(curr)); + + tracing_hist_wakeup_start(task, curr); +} + +#endif + +static __init int latency_hist_init(void) +{ + struct dentry *latency_hist_root = NULL; + struct dentry *dentry; + struct dentry *entry; + int i = 0, len = 0; + struct hist_data *my_hist; + char name[64]; + + dentry = tracing_init_dentry(); + + latency_hist_root = + debugfs_create_dir(latency_hist_dir_root, dentry); + +#ifdef CONFIG_INTERRUPT_OFF_HIST + dentry = debugfs_create_dir(interrupt_off_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + len = sprintf(name, "CPU%d", i); + name[len] = '\0'; + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(interrupt_off_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(interrupt_off_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + } + entry = debugfs_create_file("reset", 0444, dentry, + (void *)INTERRUPT_LATENCY, + &latency_hist_reset_fops); +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + dentry = debugfs_create_dir(preempt_off_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + len = sprintf(name, "CPU%d", i); + name[len] = '\0'; + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(preempt_off_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(preempt_off_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + } + entry = debugfs_create_file("reset", 0444, dentry, + (void *)PREEMPT_LATENCY, + &latency_hist_reset_fops); +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + dentry = debugfs_create_dir(preempt_irqs_off_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + len = sprintf(name, "CPU%d", i); + name[len] = '\0'; + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(preempt_off_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(preempt_irqs_off_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + } + entry = debugfs_create_file("reset", 0444, dentry, + (void *)PREEMPT_INTERRUPT_LATENCY, + &latency_hist_reset_fops); +#endif + +#ifdef 
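[Editorial sketch, not part of the patch] wake_up_callback() and sched_switch_callback() above, like the event tracer's probes earlier in the patch, recover their arguments by walking the marker's va_list in exactly the order given by the format string and discarding the leading scalar fields with throwaway va_arg() calls. A user-space sketch of that convention for the "pid %d state %ld ## rq %p task %p rq->curr %p" format; struct rq is left opaque and struct task_struct is reduced to a pid:

#include <stdarg.h>
#include <stdio.h>

struct rq;                             /* opaque stand-in for the kernel type */
struct task_struct { int pid; };       /* reduced stand-in */

static void wake_up_probe(const char *format, ...)
{
        va_list args;
        struct rq *rq;
        struct task_struct *task, *curr;

        va_start(args, format);
        (void)va_arg(args, int);       /* skip "pid %d"    */
        (void)va_arg(args, long);      /* skip "state %ld" */
        rq   = va_arg(args, struct rq *);
        task = va_arg(args, struct task_struct *);
        curr = va_arg(args, struct task_struct *);
        va_end(args);

        printf("woke pid %d while pid %d was running (rq %p)\n",
               task->pid, curr->pid, (void *)rq);
}

int main(void)
{
        struct task_struct wakee = { .pid = 42 }, running = { .pid = 1 };

        wake_up_probe("pid %d state %ld ## rq %p task %p rq->curr %p",
                      wakee.pid, 0L, (struct rq *)0, &wakee, &running);
        return 0;
}

The kernel probes receive a va_list pointer and read through *args, but the ordering rule is the same: consume every field named in the format, in order, even the ones the probe does not care about.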
CONFIG_WAKEUP_LATENCY_HIST + + i = marker_probe_register("kernel_sched_wakeup", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, NULL); + if (i) { + pr_info("wakeup hist: Couldn't add marker" + " probe to kernel_sched_wakeup\n"); + goto out_wake; + } + + i = marker_probe_register("kernel_sched_wakeup_new", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, NULL); + if (i) { + pr_info("wakeup hist: Couldn't add marker" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + i = marker_probe_register("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + sched_switch_callback, NULL); + if (i) { + pr_info("wakeup hist: Couldn't add marker" + " probe to kernel_sched_schedule\n"); + goto fail_deprobe_wake_new; + } + + dentry = debugfs_create_dir(wakeup_latency_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + len = sprintf(name, "CPU%d", i); + name[len] = '\0'; + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(wakeup_latency_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(wakeup_latency_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + } + entry = debugfs_create_file("reset", 0444, dentry, + (void *)WAKEUP_LATENCY, + &latency_hist_reset_fops); + + goto out_wake; + +fail_deprobe_wake_new: + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, NULL); +fail_deprobe: + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, NULL); + out_wake: +#endif + return 0; + +} + +__initcall(latency_hist_init); Index: linux-2.6.25.8-rt7/kernel/trace/trace_hist.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/trace/trace_hist.h 2008-06-23 19:13:08.000000000 -0400 @@ -0,0 +1,39 @@ +/* + * kernel/trace/trace_hist.h + * + * Add support for histograms of preemption-off latency and + * interrupt-off latency and wakeup latency, it depends on + * Real-Time Preemption Support. + * + * Copyright (C) 2005 MontaVista Software, Inc. + * Yi Yang + * + * Converted to work with the new latency tracer. + * Copyright (C) 2008 Red Hat, Inc. 
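[Editorial sketch, not part of the patch] The wakeup section of latency_hist_init() above, like event_tracer_register() earlier, unwinds a partially completed setup with the usual goto ladder: each successful registration adds one more label to fall back through if a later step fails. A compact user-space sketch of that unwind shape, with step_a()/step_b()/step_c() as placeholders for the individual marker registrations:

#include <stdio.h>

static int step_a(void)  { puts("a up");    return 0; }
static int step_b(void)  { puts("b up");    return 0; }
static int step_c(void)  { puts("c fails"); return -1; }
static void undo_a(void) { puts("a down"); }
static void undo_b(void) { puts("b down"); }

static int bring_up(void)
{
        int ret;

        ret = step_a();
        if (ret)
                return ret;
        ret = step_b();
        if (ret)
                goto out_a;
        ret = step_c();
        if (ret)
                goto out_b;
        return 0;

out_b:                                 /* unwind in reverse order of setup */
        undo_b();
out_a:
        undo_a();
        return ret;
}

int main(void)
{
        return bring_up() ? 1 : 0;
}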
+ * Steven Rostedt + * + */ +#ifndef _LIB_TRACING_TRACER_HIST_H_ +#define _LIB_TRACING_TRACER_HIST_H_ + +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) +# define TRACE_STOP 2 +void tracing_hist_preempt_start(void); +void tracing_hist_preempt_stop(int irqs_on); +#else +# define tracing_hist_preempt_start() do { } while (0) +# define tracing_hist_preempt_stop(irqs_off) do { } while (0) +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +void tracing_hist_wakeup_start(struct task_struct *p, + struct task_struct *curr); +void tracing_hist_wakeup_stop(struct task_struct *next); +extern int tracing_wakeup_hist; +#else +# define tracing_hist_wakeup_start(p, curr) do { } while (0) +# define tracing_hist_wakeup_stop(next) do { } while (0) +# define tracing_wakeup_hist 0 +#endif + +#endif /* ifndef _LIB_TRACING_TRACER_HIST_H_ */ Index: linux-2.6.25.8-rt7/arch/x86/ia32/ia32entry.S =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/ia32/ia32entry.S 2008-06-23 19:12:58.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/ia32/ia32entry.S 2008-06-23 19:13:08.000000000 -0400 @@ -131,7 +131,9 @@ sysenter_do_call: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -236,7 +238,9 @@ cstar_do_call: cmpl $IA32_NR_syscalls-1,%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -327,8 +331,10 @@ ia32_do_syscall: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) jmp int_ret_from_sys_call @@ -399,7 +405,7 @@ END(ia32_ptregs_common) .section .rodata,"a" .align 8 -ia32_sys_call_table: +ENTRY(ia32_sys_call_table) .quad sys_restart_syscall .quad sys_exit .quad stub32_fork @@ -727,4 +733,5 @@ ia32_sys_call_table: .quad sys32_fallocate .quad compat_sys_timerfd_settime /* 325 */ .quad compat_sys_timerfd_gettime +.globl ia32_syscall_end ia32_syscall_end: Index: linux-2.6.25.8-rt7/include/asm-x86/calling.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-x86/calling.h 2008-06-23 19:12:58.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-x86/calling.h 2008-06-23 19:13:08.000000000 -0400 @@ -166,3 +166,53 @@ .byte 0xf1 .endm + +/* + * latency-tracing helpers: + */ + + .macro TRACE_SYS_CALL + +#ifdef CONFIG_EVENT_TRACER + SAVE_ARGS + + mov %rdx, %rcx + mov %rsi, %rdx + mov %rdi, %rsi + mov %rax, %rdi + + call sys_call + + RESTORE_ARGS +#endif + .endm + + + .macro TRACE_SYS_IA32_CALL + +#ifdef CONFIG_EVENT_TRACER + SAVE_ARGS + + mov %rdx, %rcx + mov %rsi, %rdx + mov %rdi, %rsi + mov %rax, %rdi + + call sys_ia32_call + + RESTORE_ARGS +#endif + .endm + + .macro TRACE_SYS_RET + +#ifdef CONFIG_EVENT_TRACER + SAVE_ARGS + + mov %rax, %rdi + + call sys_ret + + RESTORE_ARGS +#endif + .endm Index: linux-2.6.25.8-rt7/include/asm-x86/unistd_64.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-x86/unistd_64.h 2008-06-23 19:12:58.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-x86/unistd_64.h 2008-06-23 19:13:08.000000000 -0400 @@ -11,6 +11,8 @@ * Note: holes are not allowed. 
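[Editorial sketch, not part of the patch] The TRACE_SYS_CALL/TRACE_SYS_IA32_CALL macros in calling.h move %rax (the syscall number) into %rdi and the first three syscall arguments into %rsi/%rdx/%rcx before calling sys_call() or sys_ia32_call(), and TRACE_SYS_RET passes the return value in %rdi. The C hooks themselves are not part of these hunks and are presumably defined elsewhere in the series; under the SysV x86-64 calling convention their prototypes would plausibly look like the following (an assumption derived from the register moves, not the patch's actual definitions):

/*
 * Hypothetical C side of the TRACE_SYS_* hooks. The register moves in
 * calling.h translate to these parameters under the x86-64 ABI:
 *   %rdi <- %rax (syscall nr),  %rsi <- %rdi (arg1),
 *   %rdx <- %rsi (arg2),        %rcx <- %rdx (arg3)
 * Compiles as a standalone translation unit; bodies are illustrative only.
 */
void sys_call(unsigned long nr, unsigned long p1,
              unsigned long p2, unsigned long p3)
{
        /* e.g. hand the event off to the tracer via a marker or direct call */
}

void sys_ia32_call(unsigned long nr, unsigned long p1,
                   unsigned long p2, unsigned long p3)
{
        /* same shape for the compat (ia32) entry points */
}

void sys_ret(unsigned long ret)
{
        /* TRACE_SYS_RET only forwards the return value in %rdi */
}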
*/ +#define NR_syscalls (__NR_syscall_max+1) + /* at least 8 syscall per cacheline */ #define __NR_read 0 __SYSCALL(__NR_read, sys_read) Index: linux-2.6.25.8-rt7/include/asm-x86/unistd_32.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-x86/unistd_32.h 2008-06-23 19:12:58.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-x86/unistd_32.h 2008-06-23 19:13:09.000000000 -0400 @@ -333,6 +333,8 @@ #define __NR_timerfd_settime 325 #define __NR_timerfd_gettime 326 +#define NR_syscalls 327 + #ifdef __KERNEL__ #define __ARCH_WANT_IPC_PARSE_VERSION Index: linux-2.6.25.8-rt7/arch/arm/kernel/traps.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/arm/kernel/traps.c 2008-06-23 19:12:58.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/arm/kernel/traps.c 2008-06-23 19:13:35.000000000 -0400 @@ -225,7 +225,7 @@ static void __die(const char *str, int e } } -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); /* * This function is protected against re-entrancy. @@ -268,7 +268,7 @@ void arm_notify_die(const char *str, str } static LIST_HEAD(undef_hook); -static DEFINE_SPINLOCK(undef_lock); +static DEFINE_RAW_SPINLOCK(undef_lock); void register_undef_hook(struct undef_hook *hook) { @@ -357,6 +357,7 @@ asmlinkage void do_unexp_fiq (struct pt_ { printk("Hmm. Unexpected FIQ received, but trying to continue\n"); printk("You may have a hardware problem...\n"); + print_preempt_trace(current); } /* Index: linux-2.6.25.8-rt7/kernel/trace/preempt-trace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/trace/preempt-trace.c 2008-06-23 19:13:09.000000000 -0400 @@ -0,0 +1,30 @@ +#include +#include +#include + +void print_preempt_trace(struct task_struct *task) +{ + unsigned int count; + unsigned int i, lim; + + if (!task) + task = current; + + count = task_thread_info(task)->preempt_count; + lim = count & PREEMPT_MASK; + + if (lim >= MAX_PREEMPT_TRACE) + lim = MAX_PREEMPT_TRACE-1; + printk("---------------------------\n"); + printk("| preempt count: %08x ]\n", count); + printk("| %d-level deep critical section nesting:\n", lim); + printk("----------------------------------------\n"); + for (i = 1; i <= lim; i++) { + printk(".. [<%08lx>] .... ", task->preempt_trace_eip[i]); + print_symbol("%s\n", task->preempt_trace_eip[i]); + printk(".....[<%08lx>] .. ( <= ", + task->preempt_trace_parent_eip[i]); + print_symbol("%s)\n", task->preempt_trace_parent_eip[i]); + } + printk("\n"); +} Index: linux-2.6.25.8-rt7/arch/arm/kernel/irq.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/arm/kernel/irq.c 2008-06-23 19:12:58.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/arm/kernel/irq.c 2008-06-23 19:13:35.000000000 -0400 @@ -37,6 +37,8 @@ #include #include +#include + #include #include @@ -100,7 +102,7 @@ unlock: /* Handle bad interrupts */ static struct irq_desc bad_irq_desc = { .handle_irq = handle_bad_irq, - .lock = SPIN_LOCK_UNLOCKED + .lock = RAW_SPIN_LOCK_UNLOCKED(bad_irq_desc.lock) }; /* @@ -108,11 +110,13 @@ static struct irq_desc bad_irq_desc = { * come via this function. 
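[Editorial sketch, not part of the patch] print_preempt_trace() above derives the nesting depth from the preempt-count bits selected by PREEMPT_MASK and clamps it to the size of the per-task preempt_trace_eip[] arrays. A user-space sketch of that extraction; the 0xff mask matches the usual x86 layout of preempt_count but is an assumption here, and MAX_PREEMPT_TRACE is a placeholder value:

#include <stdio.h>

#define PREEMPT_MASK      0x000000ffU  /* low byte: preempt-disable depth (assumed layout) */
#define MAX_PREEMPT_TRACE 25           /* placeholder array size */

static void show_nesting(unsigned int preempt_count)
{
        unsigned int lim = preempt_count & PREEMPT_MASK;

        if (lim >= MAX_PREEMPT_TRACE)
                lim = MAX_PREEMPT_TRACE - 1;

        printf("preempt count: %08x\n", preempt_count);
        printf("%u-level deep critical section nesting\n", lim);
}

int main(void)
{
        show_nesting(0x00000003);      /* three nested preempt_disable()s */
        show_nesting(0x00010002);      /* hardirq bit set, two preempt levels */
        return 0;
}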
Instead, they should provide their * own 'handler' */ -asmlinkage void __exception asm_do_IRQ(unsigned int irq, struct pt_regs *regs) +asmlinkage void __exception notrace asm_do_IRQ(unsigned int irq, struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc = irq_desc + irq; + ftrace_event_irq(irq, user_mode(regs), instruction_pointer(regs)); + /* * Some hardware gives randomly wrong interrupts. Rather * than crashing, do something sensible. Index: linux-2.6.25.8-rt7/arch/powerpc/xmon/xmon.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/powerpc/xmon/xmon.c 2008-06-23 19:12:58.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/powerpc/xmon/xmon.c 2008-06-23 19:13:41.000000000 -0400 @@ -343,6 +343,7 @@ static int xmon_core(struct pt_regs *reg unsigned long timeout; #endif + preempt_disable(); local_irq_save(flags); bp = in_breakpoint_table(regs->nip, &offset); @@ -519,6 +520,7 @@ static int xmon_core(struct pt_regs *reg insert_cpu_bpts(); local_irq_restore(flags); + preempt_enable(); return cmd != 'X' && cmd != EOF; } @@ -2137,7 +2139,7 @@ print_address(unsigned long addr) static unsigned long mdest; /* destination address */ static unsigned long msrc; /* source address */ static unsigned long mval; /* byte value to set memory to */ -static unsigned long mcount; /* # bytes to affect */ +static unsigned long xmon_mcount; /* # bytes to affect */ static unsigned long mdiffs; /* max # differences to print */ void @@ -2149,19 +2151,20 @@ memops(int cmd) scanhex((void *)(cmd == 's'? &mval: &msrc)); if( termch != '\n' ) termch = 0; - scanhex((void *)&mcount); + scanhex((void *)&xmon_mcount); switch( cmd ){ case 'm': - memmove((void *)mdest, (void *)msrc, mcount); + memmove((void *)mdest, (void *)msrc, xmon_mcount); break; case 's': - memset((void *)mdest, mval, mcount); + memset((void *)mdest, mval, xmon_mcount); break; case 'd': if( termch != '\n' ) termch = 0; scanhex((void *)&mdiffs); - memdiffs((unsigned char *)mdest, (unsigned char *)msrc, mcount, mdiffs); + memdiffs((unsigned char *)mdest, (unsigned char *)msrc, + xmon_mcount, mdiffs); break; } } Index: linux-2.6.25.8-rt7/arch/powerpc/kernel/entry_64.S =================================================================== --- linux-2.6.25.8-rt7.orig/arch/powerpc/kernel/entry_64.S 2008-06-23 19:12:58.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/powerpc/kernel/entry_64.S 2008-06-23 19:13:39.000000000 -0400 @@ -470,7 +470,8 @@ _GLOBAL(ret_from_except_lite) #ifdef CONFIG_PREEMPT clrrdi r9,r1,THREAD_SHIFT /* current_thread_info() */ - li r0,_TIF_NEED_RESCHED /* bits to check */ + li r0,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) + /* bits to check */ ld r3,_MSR(r1) ld r4,TI_FLAGS(r9) /* Move MSR_PR bit in r3 to _TIF_SIGPENDING position in r0 */ @@ -592,17 +593,22 @@ do_work: rotldi r10,r10,16 mtmsrd r10,1 ld r4,TI_FLAGS(r9) - andi. r0,r4,_TIF_NEED_RESCHED + andi. r0,r4,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne 1b b restore user_work: #endif + /* here we are preempting the current task */ + li r0,1 + stb r0,PACASOFTIRQEN(r13) + stb r0,PACAHARDIRQEN(r13) + /* Enable interrupts */ ori r10,r10,MSR_EE mtmsrd r10,1 - andi. r0,r4,_TIF_NEED_RESCHED + andi. 
r0,r4,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq 1f bl .schedule b .ret_from_except_lite @@ -846,3 +852,65 @@ _GLOBAL(enter_prom) ld r0,16(r1) mtlr r0 blr + +#ifdef CONFIG_MCOUNT +/* + * code almost taken from entry_32.S + */ +#define MCOUNT_FRAME_SIZE 32 +_GLOBAL(mcount) + stdu r1,-MCOUNT_FRAME_SIZE(r1) + mflr r3 + + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + ld r4,MCOUNT_FRAME_SIZE(r1) + ld r4,16(r4) + LOAD_REG_ADDR(r5,ftrace_trace_function) + ld r5,0(r5) + ld r5,0(r5) + mtctr r5 + bctrl + nop +1: + ld r0,MCOUNT_FRAME_SIZE+16(r1) + mtlr r0 + addi r1,r1,MCOUNT_FRAME_SIZE + blr + +/* + * Based on glibc-2.4/sysdeps/powerpc/powerpc64/ppc-mcount.S + * + * We don't need to save the parameter-passing registers as gcc takes + * care of that for us. Thus this function looks fairly normal. + * In fact, the generic code would work for us. + */ +_GLOBAL(_mcount) + /* return if we're in real mode. */ + mfmsr r3 + andi. r0,r3,MSR_IR|MSR_DR /* see if relocation is on? */ + beqlr /* if not, do nothing. */ + /* we're in translation mode. keep going. */ + mflr r3 + ld r11,0(r1) /* load back chain ptr */ + stdu r1,-STACK_FRAME_OVERHEAD(r1) + std r3,STACK_FRAME_OVERHEAD+16(r1) + ld r4,16(r11) /* LR in back chain */ + LOAD_REG_ADDR(r5,mcount_enabled) + lwz r5,0(r5) + cmpwi r5,0 /* see if mcount_enabled? */ + beq 1f /* if disabled, then skip */ + + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + LOAD_REG_ADDR(r5,mcount_trace_function) + ld r5,0(r5) + ld r5,0(r5) + mtctr r5 + bctrl + nop +1: + ld r0,STACK_FRAME_OVERHEAD+16(r1) /* restore saved LR */ + mtlr r0 + addi r1,r1,STACK_FRAME_OVERHEAD + blr + +#endif /* CONFIG_MCOUNT */ Index: linux-2.6.25.8-rt7/arch/powerpc/kernel/setup_64.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/powerpc/kernel/setup_64.c 2008-06-23 19:12:58.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/powerpc/kernel/setup_64.c 2008-06-23 19:13:10.000000000 -0400 @@ -607,3 +607,21 @@ struct ppc_pci_io ppc_pci_io; EXPORT_SYMBOL(ppc_pci_io); #endif /* CONFIG_PPC_INDIRECT_IO */ +#ifdef CONFIG_STACKTRACE +#include +void notrace save_stack_trace(struct stack_trace *trace) +{ +} +#endif /* CONFIG_STACKTRACE */ + +#ifdef CONFIG_EARLY_PRINTK +void notrace early_printk(const char *fmt, ...) 
+{ + BUG(); +} +#endif /* CONFIG_EARLY_PRINTK */ + +#ifdef CONFIG_MCOUNT +extern void _mcount(void); +EXPORT_SYMBOL(_mcount); +#endif /* CONFIG_MCOUNT */ Index: linux-2.6.25.8-rt7/arch/powerpc/kernel/irq.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/powerpc/kernel/irq.c 2008-06-23 19:12:58.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/powerpc/kernel/irq.c 2008-06-23 19:13:40.000000000 -0400 @@ -94,8 +94,6 @@ extern atomic_t ipi_sent; #endif #ifdef CONFIG_PPC64 -EXPORT_SYMBOL(irq_desc); - int distribute_irqs = 1; static inline unsigned long get_hard_enabled(void) @@ -114,7 +112,7 @@ static inline void set_soft_enabled(unsi : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); } -void local_irq_restore(unsigned long en) +void notrace raw_local_irq_restore(unsigned long en) { /* * get_paca()->soft_enabled = en; @@ -404,7 +402,7 @@ void do_softirq(void) #ifdef CONFIG_PPC_MERGE static LIST_HEAD(irq_hosts); -static DEFINE_SPINLOCK(irq_big_lock); +static DEFINE_RAW_SPINLOCK(irq_big_lock); static DEFINE_PER_CPU(unsigned int, irq_radix_reader); static unsigned int irq_radix_writer; struct irq_map_entry irq_map[NR_IRQS]; Index: linux-2.6.25.8-rt7/arch/powerpc/kernel/entry_32.S =================================================================== --- linux-2.6.25.8-rt7.orig/arch/powerpc/kernel/entry_32.S 2008-06-23 19:12:58.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/powerpc/kernel/entry_32.S 2008-06-23 19:13:23.000000000 -0400 @@ -661,7 +661,7 @@ user_exc_return: /* r10 contains MSR_KE /* Check current_thread_info()->flags */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED) + andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne do_work restore_user: @@ -896,7 +896,7 @@ global_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -910,7 +910,7 @@ recheck: MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. r0,r9,_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK beq restore_user @@ -1022,3 +1022,86 @@ machine_check_in_rtas: /* XXX load up BATs and panic */ #endif /* CONFIG_PPC_RTAS */ + +#ifdef CONFIG_MCOUNT +/* + * mcount() is not the same as _mcount(). The callers of mcount() have a + * normal context. The callers of _mcount() do not have a stack frame and + * have not saved the "caller saves" registers. + */ +_GLOBAL(mcount) + stwu r1,-16(r1) + mflr r3 + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,16(r1) + lwz r4,4(r4) + LOAD_REG_ADDR(r5,ftrace_trace_function) + lwz r5,0(r5) + mtctr r5 + bctrl +1: + lwz r0,20(r1) + mtlr r0 + addi r1,r1,16 + blr + +/* + * The -pg flag, which is specified in the case of CONFIG_MCOUNT, causes the + * C compiler to add a call to _mcount() at the start of each function + * preamble, before the stack frame is created. 
An example of this preamble + * code is: + * + * mflr r0 + * lis r12,-16354 + * stw r0,4(r1) + * addi r0,r12,-19652 + * bl 0xc00034c8 <_mcount> + * mflr r0 + * stwu r1,-16(r1) + */ +_GLOBAL(_mcount) +#define M_STK_SIZE 48 + /* Would not expect to need to save cr, but glibc version of */ + /* _mcount() does, so cautiously saving it here too. */ + stwu r1,-M_STK_SIZE(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 /* will use as first arg to __trace() */ + mfcr r4 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + cmpwi r5,0 + stw r3, 44(r1) /* lr */ + stw r4, 8(r1) /* cr */ + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,M_STK_SIZE+4(r1) + LOAD_REG_ADDR(r5,mcount_trace_function) + lwz r5,0(r5) + mtctr r5 + bctrl +1: + lwz r8, 8(r1) /* cr */ + lwz r9, 44(r1) /* lr */ + lwz r3, 12(r1) + lwz r4, 16(r1) + lwz r5, 20(r1) + mtcrf 0xff,r8 + mtctr r9 + lwz r0, 52(r1) + lwz r6, 24(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1,r1,M_STK_SIZE + mtlr r0 + bctr + +#endif /* CONFIG_MCOUNT */ Index: linux-2.6.25.8-rt7/arch/powerpc/Kconfig =================================================================== --- linux-2.6.25.8-rt7.orig/arch/powerpc/Kconfig 2008-06-23 19:12:58.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/powerpc/Kconfig 2008-06-23 19:13:28.000000000 -0400 @@ -49,13 +49,6 @@ config IRQ_PER_CPU bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config GENERIC_LOCKBREAK bool default y @@ -90,6 +83,7 @@ config PPC select HAVE_IDE select HAVE_OPROFILE select HAVE_KPROBES + select HAVE_MCOUNT select HAVE_KRETPROBES config EARLY_PRINTK @@ -208,6 +202,18 @@ config HIGHMEM source kernel/time/Kconfig source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "fs/Kconfig.binfmt" # We optimistically allocate largepages from the VM, so make the limit Index: linux-2.6.25.8-rt7/arch/arm/mach-ep93xx/core.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/arm/mach-ep93xx/core.c 2008-06-23 19:12:58.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/arm/mach-ep93xx/core.c 2008-06-23 19:13:11.000000000 -0400 @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include @@ -50,7 +52,6 @@ #include - /************************************************************************* * Static I/O mappings that are needed for all EP93xx platforms *************************************************************************/ @@ -93,55 +94,123 @@ void __init ep93xx_map_io(void) * to use this timer for something else. We also use timer 4 for keeping * track of lost jiffies. 
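[Editorial sketch, not part of the patch] Both PowerPC mcount variants above eventually indirect through a function pointer (ftrace_trace_function, or mcount_trace_function gated on mcount_enabled) that receives the traced function's instruction pointer and its caller's. A user-space analogue of that hook shape, with an explicit call standing in for the compiler-generated -pg preamble and GCC's __builtin_return_address() supplying the parent ip:

#include <stdio.h>

/* same shape as ftrace_trace_function / mcount_trace_function */
typedef void (*trace_func_t)(unsigned long ip, unsigned long parent_ip);

static void trace_stub(unsigned long ip, unsigned long parent_ip)
{
        /* default hook: do nothing, like the disabled case */
}

static void print_hook(unsigned long ip, unsigned long parent_ip)
{
        printf("entered %#lx, called from %#lx\n", ip, parent_ip);
}

static trace_func_t trace_function = trace_stub;

/* what the mcount stub does, in C terms: pass (eip, parent_eip) to the hook */
static void traced_function(void)
{
        trace_function((unsigned long)traced_function,
                       (unsigned long)__builtin_return_address(0));
        /* ... body of the traced function ... */
}

int main(void)
{
        traced_function();             /* hook is the stub: nothing printed   */
        trace_function = print_hook;   /* roughly what enabling mcount does   */
        traced_function();             /* hook reports ip and parent ip       */
        return 0;
}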
*/ -static unsigned int last_jiffy_time; - -#define TIMER4_TICKS_PER_JIFFY ((CLOCK_TICK_RATE + (HZ/2)) / HZ) +static struct clock_event_device clockevent_ep93xx; static int ep93xx_timer_interrupt(int irq, void *dev_id) { - __raw_writel(1, EP93XX_TIMER1_CLEAR); - while ((signed long) - (__raw_readl(EP93XX_TIMER4_VALUE_LOW) - last_jiffy_time) - >= TIMER4_TICKS_PER_JIFFY) { - last_jiffy_time += TIMER4_TICKS_PER_JIFFY; - timer_tick(); - } + __raw_writel(EP93XX_TC_CLEAR, EP93XX_TIMER1_CLEAR); + + clockevent_ep93xx.event_handler(&clockevent_ep93xx); return IRQ_HANDLED; } +static int ep93xx_set_next_event(unsigned long evt, + struct clock_event_device *unused) +{ + u32 tmode = __raw_readl(EP93XX_TIMER1_CONTROL); + + /* stop timer */ + __raw_writel(tmode & ~EP93XX_TC123_ENABLE, EP93XX_TIMER1_CONTROL); + /* program timer */ + __raw_writel(evt, EP93XX_TIMER1_LOAD); + /* start timer */ + __raw_writel(tmode | EP93XX_TC123_ENABLE, EP93XX_TIMER1_CONTROL); + + return 0; +} + +static void ep93xx_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + u32 tmode = EP93XX_TC123_SEL_508KHZ; + + /* Disable timer */ + __raw_writel(tmode, EP93XX_TIMER1_CONTROL); + + switch(mode) { + case CLOCK_EVT_MODE_PERIODIC: + /* Set timer period */ + __raw_writel((508469 / HZ) - 1, EP93XX_TIMER1_LOAD); + tmode |= EP93XX_TC123_PERIODIC; + + case CLOCK_EVT_MODE_ONESHOT: + tmode |= EP93XX_TC123_ENABLE; + __raw_writel(tmode, EP93XX_TIMER1_CONTROL); + break; + + case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_RESUME: + return; + } +} + +static struct clock_event_device clockevent_ep93xx = { + .name = "ep93xx-timer1", + .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC, + .shift = 32, + .set_mode = ep93xx_set_mode, + .set_next_event = ep93xx_set_next_event, +}; static struct irqaction ep93xx_timer_irq = { .name = "ep93xx timer", .flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL, .handler = ep93xx_timer_interrupt, }; -static void __init ep93xx_timer_init(void) +static void __init ep93xx_clockevent_init(void) { - /* Enable periodic HZ timer. */ - __raw_writel(0x48, EP93XX_TIMER1_CONTROL); - __raw_writel((508469 / HZ) - 1, EP93XX_TIMER1_LOAD); - __raw_writel(0xc8, EP93XX_TIMER1_CONTROL); + setup_irq(IRQ_EP93XX_TIMER1, &ep93xx_timer_irq); - /* Enable lost jiffy timer. */ - __raw_writel(0x100, EP93XX_TIMER4_VALUE_HIGH); + clockevent_ep93xx.mult = div_sc(508469, NSEC_PER_SEC, + clockevent_ep93xx.shift); + clockevent_ep93xx.max_delta_ns = + clockevent_delta2ns(0xfffffffe, &clockevent_ep93xx); + clockevent_ep93xx.min_delta_ns = + clockevent_delta2ns(0xf, &clockevent_ep93xx); + clockevent_ep93xx.cpumask = cpumask_of_cpu(0); + clockevents_register_device(&clockevent_ep93xx); +} - setup_irq(IRQ_EP93XX_TIMER1, &ep93xx_timer_irq); +/* + * timer4 is a 40 Bit timer, separated in a 32bit and a 8 bit + * register, EP93XX_TIMER4_VALUE_LOW stores 32 bit word. 
The + * controlregister is in EP93XX_TIMER4_VALUE_HIGH + */ + +cycle_t ep93xx_get_cycles(void) +{ + return __raw_readl(EP93XX_TIMER4_VALUE_LOW); } -static unsigned long ep93xx_gettimeoffset(void) +static struct clocksource clocksource_ep93xx = { + .name = "ep93xx_timer4", + .rating = 200, + .read = ep93xx_get_cycles, + .mask = 0xFFFFFFFF, + .shift = 20, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static void __init ep93xx_clocksource_init(void) { - int offset; + /* Reset time-stamp counter */ + __raw_writel(0x100, EP93XX_TIMER4_VALUE_HIGH); - offset = __raw_readl(EP93XX_TIMER4_VALUE_LOW) - last_jiffy_time; + clocksource_ep93xx.mult = + clocksource_hz2mult(983040, clocksource_ep93xx.shift); + clocksource_register(&clocksource_ep93xx); +} - /* Calculate (1000000 / 983040) * offset. */ - return offset + (53 * offset / 3072); +static void __init ep93xx_timer_init(void) +{ + ep93xx_clocksource_init(); + ep93xx_clockevent_init(); } struct sys_timer ep93xx_timer = { - .init = ep93xx_timer_init, - .offset = ep93xx_gettimeoffset, + .init = ep93xx_timer_init, }; @@ -549,7 +618,6 @@ static struct platform_device ep93xx_ohc .resource = ep93xx_ohci_resources, }; - void __init ep93xx_init_devices(void) { unsigned int v; Index: linux-2.6.25.8-rt7/include/asm-arm/arch-ep93xx/ep93xx-regs.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-arm/arch-ep93xx/ep93xx-regs.h 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-arm/arch-ep93xx/ep93xx-regs.h 2008-06-23 19:13:11.000000000 -0400 @@ -67,6 +67,12 @@ #define EP93XX_TIMER3_CONTROL EP93XX_TIMER_REG(0x88) #define EP93XX_TIMER3_CLEAR EP93XX_TIMER_REG(0x8c) +#define EP93XX_TC_CLEAR 0x00000001 +#define EP93XX_TC123_ENABLE 0x00000080 +#define EP93XX_TC123_PERIODIC 0x00000040 +#define EP93XX_TC123_SEL_508KHZ 0x00000008 +#define EP93XX_TC4_ENABLE 0x00000100 + #define EP93XX_I2S_BASE (EP93XX_APB_VIRT_BASE + 0x00020000) #define EP93XX_SECURITY_BASE (EP93XX_APB_VIRT_BASE + 0x00030000) Index: linux-2.6.25.8-rt7/arch/arm/kernel/time.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/arm/kernel/time.c 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/arm/kernel/time.c 2008-06-23 19:13:12.000000000 -0400 @@ -225,6 +225,13 @@ static inline void do_leds(void) #define do_leds() #endif +void arch_tick_leds(void) +{ +#ifdef CONFIG_LEDS_TIMER + do_leds(); +#endif +} + #ifndef CONFIG_GENERIC_TIME void do_gettimeofday(struct timeval *tv) { Index: linux-2.6.25.8-rt7/drivers/net/sungem.c =================================================================== --- linux-2.6.25.8-rt7.orig/drivers/net/sungem.c 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/drivers/net/sungem.c 2008-06-23 19:13:12.000000000 -0400 @@ -1033,10 +1033,8 @@ static int gem_start_xmit(struct sk_buff (csum_stuff_off << 21)); } - local_irq_save(flags); - if (!spin_trylock(&gp->tx_lock)) { + if (!spin_trylock_irqsave(&gp->tx_lock, flags)) { /* Tell upper layer to requeue */ - local_irq_restore(flags); return NETDEV_TX_LOCKED; } /* We raced with gem_do_stop() */ Index: linux-2.6.25.8-rt7/arch/x86/kernel/tsc_sync.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/tsc_sync.c 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/tsc_sync.c 2008-06-23 19:13:28.000000000 -0400 @@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count * we want to have the fastest, inlined, non-debug 
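[Editorial sketch, not part of the patch] The clocksource registered above converts raw 983.04 kHz timer-4 counts to nanoseconds with fixed-point mult/shift arithmetic, ns = (cycles * mult) >> shift, where mult comes from clocksource_hz2mult(983040, 20); the clockevent side uses div_sc() the same way for the nanoseconds-to-cycles direction. A user-space sketch of the cycles-to-nanoseconds scaling with the same constants; hz2mult() below re-derives the factor with round-to-nearest, which is an assumption about the kernel helper's exact rounding:

#include <stdio.h>
#include <stdint.h>

/* fixed-point scale factor: about (1e9 / hz) in shift-bit fixed point */
static uint32_t hz2mult(uint32_t hz, uint32_t shift)
{
        uint64_t tmp = ((uint64_t)1000000000 << shift) + hz / 2;

        return (uint32_t)(tmp / hz);
}

int main(void)
{
        uint32_t shift  = 20;              /* clocksource_ep93xx.shift */
        uint32_t mult   = hz2mult(983040, shift);
        uint64_t cycles = 983040;          /* one second worth of timer-4 ticks */
        uint64_t ns     = (cycles * mult) >> shift;

        printf("mult=%u, %llu cycles -> %llu ns (~1 second)\n",
               mult, (unsigned long long)cycles, (unsigned long long)ns);
        return 0;
}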
version * of a critical section, to be able to prove TSC time-warps: */ -static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; +static __cpuinitdata __raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; static __cpuinitdata cycles_t last_tsc; static __cpuinitdata cycles_t max_warp; static __cpuinitdata int nr_warps; @@ -101,6 +101,7 @@ static __cpuinit void check_tsc_warp(voi */ void __cpuinit check_tsc_sync_source(int cpu) { + unsigned long flags; int cpus = 2; /* @@ -121,8 +122,11 @@ void __cpuinit check_tsc_sync_source(int /* * Wait for the target to arrive: */ + local_save_flags(flags); + local_irq_enable(); while (atomic_read(&start_count) != cpus-1) cpu_relax(); + local_irq_restore(flags); /* * Trigger the target to continue into the measurement too: */ Index: linux-2.6.25.8-rt7/drivers/input/keyboard/atkbd.c =================================================================== --- linux-2.6.25.8-rt7.orig/drivers/input/keyboard/atkbd.c 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/drivers/input/keyboard/atkbd.c 2008-06-23 19:13:12.000000000 -0400 @@ -1453,8 +1453,23 @@ static struct dmi_system_id atkbd_dmi_qu { } }; +static int __read_mostly noatkbd; + +static int __init noatkbd_setup(char *str) +{ + noatkbd = 1; + printk(KERN_INFO "debug: not setting up AT keyboard.\n"); + + return 1; +} + +__setup("noatkbd", noatkbd_setup); + static int __init atkbd_init(void) { + if (noatkbd) + return 0; + dmi_check_system(atkbd_dmi_quirk_table); return serio_register_driver(&atkbd_drv); Index: linux-2.6.25.8-rt7/drivers/input/mouse/psmouse-base.c =================================================================== --- linux-2.6.25.8-rt7.orig/drivers/input/mouse/psmouse-base.c 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/drivers/input/mouse/psmouse-base.c 2008-06-23 19:13:12.000000000 -0400 @@ -1597,10 +1597,25 @@ static int psmouse_get_maxproto(char *bu return sprintf(buffer, "%s\n", psmouse_protocol_by_type(type)->name); } +static int __read_mostly nopsmouse; + +static int __init nopsmouse_setup(char *str) +{ + nopsmouse = 1; + printk(KERN_INFO "debug: not setting up psmouse.\n"); + + return 1; +} + +__setup("nopsmouse", nopsmouse_setup); + static int __init psmouse_init(void) { int err; + if (nopsmouse) + return 0; + kpsmoused_wq = create_singlethread_workqueue("kpsmoused"); if (!kpsmoused_wq) { printk(KERN_ERR "psmouse: failed to create kpsmoused workqueue\n"); Index: linux-2.6.25.8-rt7/kernel/rtmutex-debug.h =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/rtmutex-debug.h 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/rtmutex-debug.h 2008-06-23 19:13:12.000000000 -0400 @@ -17,17 +17,17 @@ extern void debug_rt_mutex_free_waiter(s extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); extern void debug_rt_mutex_lock(struct rt_mutex *lock); extern void debug_rt_mutex_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, - struct task_struct *powner); +extern void +debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner); extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, struct rt_mutex *lock); extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); -# define debug_rt_mutex_reset_waiter(w) \ +# define debug_rt_mutex_reset_waiter(w) \ do { (w)->deadlock_lock = NULL; } while (0) -static inline 
int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, - int detect) +static inline int +debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, int detect) { - return (waiter != NULL); + return waiter != NULL; } Index: linux-2.6.25.8-rt7/drivers/net/8139too.c =================================================================== --- linux-2.6.25.8-rt7.orig/drivers/net/8139too.c 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/drivers/net/8139too.c 2008-06-23 19:13:13.000000000 -0400 @@ -2199,7 +2199,11 @@ static irqreturn_t rtl8139_interrupt (in */ static void rtl8139_poll_controller(struct net_device *dev) { - disable_irq(dev->irq); + /* + * use _nosync() variant - might be used by netconsole + * from atomic contexts: + */ + disable_irq_nosync(dev->irq); rtl8139_interrupt(dev->irq, dev); enable_irq(dev->irq); } Index: linux-2.6.25.8-rt7/arch/x86/kernel/kprobes.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/kprobes.c 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/kprobes.c 2008-06-23 19:13:13.000000000 -0400 @@ -451,7 +451,7 @@ static void __kprobes setup_singlestep(s /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); regs->ip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); + preempt_enable(); return; } #endif @@ -477,7 +477,7 @@ static int __kprobes reenter_kprobe(stru arch_disarm_kprobe(p); regs->ip = (unsigned long)p->addr; reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); break; #endif case KPROBE_HIT_ACTIVE: @@ -573,7 +573,7 @@ static int __kprobes kprobe_handler(stru } } /* else: not a kprobe fault; let the kernel handle it */ - preempt_enable_no_resched(); + preempt_enable(); return 0; } @@ -874,7 +874,7 @@ static int __kprobes post_kprobe_handler } reset_current_kprobe(); out: - preempt_enable_no_resched(); + preempt_enable(); /* * if somebody else is singlestepping across a probe point, flags @@ -908,7 +908,7 @@ int __kprobes kprobe_fault_handler(struc restore_previous_kprobe(kcb); else reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); break; case KPROBE_HIT_ACTIVE: case KPROBE_HIT_SSDONE: @@ -1049,7 +1049,7 @@ int __kprobes longjmp_break_handler(stru memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp), kcb->jprobes_stack, MIN_STACK_SIZE(kcb->jprobe_saved_sp)); - preempt_enable_no_resched(); + preempt_enable(); return 1; } return 0; Index: linux-2.6.25.8-rt7/arch/x86/mm/highmem_32.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/mm/highmem_32.c 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/mm/highmem_32.c 2008-06-23 19:13:58.000000000 -0400 @@ -3,9 +3,9 @@ void *kmap(struct page *page) { - might_sleep(); if (!PageHighMem(page)) return page_address(page); + might_sleep(); return kmap_high(page); } @@ -18,6 +18,27 @@ void kunmap(struct page *page) kunmap_high(page); } +void kunmap_virt(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return; + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + kunmap(page); +} + +struct page *kmap_to_page(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return virt_to_page(ptr); + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + return page; +} +EXPORT_SYMBOL_GPL(kmap_to_page); /* PREEMPT_RT converts some modules to use this */ + static void 
debug_kmap_atomic_prot(enum km_type type) { #ifdef CONFIG_DEBUG_HIGHMEM @@ -69,12 +90,12 @@ static void debug_kmap_atomic_prot(enum * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) +void *__kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) @@ -84,19 +105,19 @@ void *kmap_atomic_prot(struct page *page idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - BUG_ON(!pte_none(*(kmap_pte-idx))); + WARN_ON_ONCE(!pte_none(*(kmap_pte-idx))); set_pte(kmap_pte-idx, mk_pte(page, prot)); arch_flush_lazy_mmu_mode(); return (void *)vaddr; } -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page, enum km_type type) { return kmap_atomic_prot(page, type, kmap_prot); } -void kunmap_atomic(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr, enum km_type type) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); @@ -118,16 +139,18 @@ void kunmap_atomic(void *kvaddr, enum km arch_flush_lazy_mmu_mode(); pagefault_enable(); + preempt_enable(); } /* This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; + preempt_disable(); pagefault_disable(); idx = type + KM_TYPE_NR*smp_processor_id(); @@ -138,7 +161,7 @@ void *kmap_atomic_pfn(unsigned long pfn, return (void*) vaddr; } -struct page *kmap_atomic_to_page(void *ptr) +struct page *__kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; @@ -153,6 +176,7 @@ struct page *kmap_atomic_to_page(void *p EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); +EXPORT_SYMBOL(kunmap_virt); +EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(__kmap_atomic_to_page); Index: linux-2.6.25.8-rt7/include/asm-x86/atomic_32.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-x86/atomic_32.h 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-x86/atomic_32.h 2008-06-23 19:13:13.000000000 -0400 @@ -195,10 +195,10 @@ static __inline__ int atomic_add_return( #ifdef CONFIG_M386 no_xadd: /* Legacy 386 processor */ - local_irq_save(flags); + raw_local_irq_save(flags); __i = atomic_read(v); atomic_set(v, i + __i); - local_irq_restore(flags); + raw_local_irq_restore(flags); return i + __i; #endif } Index: linux-2.6.25.8-rt7/drivers/pci/msi.c =================================================================== --- linux-2.6.25.8-rt7.orig/drivers/pci/msi.c 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/drivers/pci/msi.c 2008-06-23 19:14:22.000000000 -0400 @@ -75,7 +75,7 @@ static void msi_set_enable(struct pci_de int pos; u16 control; - pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + pos = pci_find_capability_cached(dev, PCI_CAP_ID_MSI); if (pos) { pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control); 
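[Editorial sketch, not part of the patch] kunmap_virt() and kmap_to_page() above decide whether a pointer lies in the pkmap window by comparing it against PKMAP_ADDR(0) and, if so, translate it back to its slot with PKMAP_NR(). A user-space sketch of that address arithmetic; PKMAP_BASE and PAGE_SHIFT are placeholder values here, the real ones are architecture and configuration specific:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PKMAP_BASE 0xff800000UL        /* placeholder base address */
#define PKMAP_NR(virt)   (((virt) - PKMAP_BASE) >> PAGE_SHIFT)
#define PKMAP_ADDR(nr)   (PKMAP_BASE + ((unsigned long)(nr) << PAGE_SHIFT))

int main(void)
{
        unsigned long vaddr = PKMAP_ADDR(5) + 0x123;   /* somewhere inside pkmap slot 5 */

        if (vaddr < PKMAP_ADDR(0))
                printf("not a pkmap address\n");       /* kunmap_virt() bails out here */
        else
                printf("vaddr %#lx belongs to pkmap slot %lu\n",
                       vaddr, PKMAP_NR(vaddr));
        return 0;
}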
control &= ~PCI_MSI_FLAGS_ENABLE; @@ -90,7 +90,7 @@ static void msix_set_enable(struct pci_d int pos; u16 control; - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + pos = pci_find_capability_cached(dev, PCI_CAP_ID_MSIX); if (pos) { pci_read_config_word(dev, pos + PCI_MSIX_FLAGS, &control); control &= ~PCI_MSIX_FLAGS_ENABLE; @@ -285,6 +285,10 @@ static void __pci_restore_msi_state(stru return; entry = get_irq_msi(dev->irq); + if (!entry) { + WARN_ON(1); + return; + } pos = entry->msi_attrib.pos; pci_intx_for_msi(dev, 0); @@ -351,7 +355,7 @@ static int msi_capability_init(struct pc msi_set_enable(dev, 0); /* Ensure msi is disabled as I set it up */ - pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + pos = pci_find_capability_cached(dev, PCI_CAP_ID_MSI); pci_read_config_word(dev, msi_control_reg(pos), &control); /* MSI Entry Initialization */ entry = alloc_msi_entry(); @@ -424,7 +428,7 @@ static int msix_capability_init(struct p msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */ - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + pos = pci_find_capability_cached(dev, PCI_CAP_ID_MSIX); /* Request & Map MSI-X table region */ pci_read_config_word(dev, msi_control_reg(pos), &control); nr_entries = multi_msix_capable(control); @@ -531,7 +535,7 @@ static int pci_msi_check_device(struct p if (ret) return ret; - if (!pci_find_capability(dev, type)) + if (!pci_find_capability_cached(dev, type)) return -EINVAL; return 0; @@ -650,7 +654,7 @@ int pci_enable_msix(struct pci_dev* dev, if (status) return status; - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + pos = pci_find_capability_cached(dev, PCI_CAP_ID_MSIX); pci_read_config_word(dev, msi_control_reg(pos), &control); nr_entries = multi_msix_capable(control); if (nvec > nr_entries) Index: linux-2.6.25.8-rt7/drivers/block/floppy.c =================================================================== --- linux-2.6.25.8-rt7.orig/drivers/block/floppy.c 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/drivers/block/floppy.c 2008-06-23 19:13:14.000000000 -0400 @@ -4145,6 +4145,28 @@ static void floppy_device_release(struct { } +static int floppy_suspend(struct platform_device *dev, pm_message_t state) +{ + floppy_release_irq_and_dma(); + + return 0; +} + +static int floppy_resume(struct platform_device *dev) +{ + floppy_grab_irq_and_dma(); + + return 0; +} + +static struct platform_driver floppy_driver = { + .suspend = floppy_suspend, + .resume = floppy_resume, + .driver = { + .name = "floppy", + }, +}; + static struct platform_device floppy_device[N_DRIVE]; static struct kobject *floppy_find(dev_t dev, int *part, void *data) @@ -4193,10 +4215,14 @@ static int __init floppy_init(void) if (err) goto out_put_disk; + err = platform_driver_register(&floppy_driver); + if (err) + goto out_unreg_blkdev; + floppy_queue = blk_init_queue(do_fd_request, &floppy_lock); if (!floppy_queue) { err = -ENOMEM; - goto out_unreg_blkdev; + goto out_unreg_driver; } blk_queue_max_sectors(floppy_queue, 64); @@ -4345,6 +4371,8 @@ out_flush_work: out_unreg_region: blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); blk_cleanup_queue(floppy_queue); +out_unreg_driver: + platform_driver_unregister(&floppy_driver); out_unreg_blkdev: unregister_blkdev(FLOPPY_MAJOR, "fd"); out_put_disk: @@ -4540,6 +4568,7 @@ void cleanup_module(void) blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); unregister_blkdev(FLOPPY_MAJOR, "fd"); + platform_driver_unregister(&floppy_driver); for (drive = 0; drive < N_DRIVE; drive++) { del_timer_sync(&motor_off_timer[drive]); 
Index: linux-2.6.25.8-rt7/include/linux/hrtimer.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/hrtimer.h 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/hrtimer.h 2008-06-23 19:13:49.000000000 -0400 @@ -188,7 +188,7 @@ struct hrtimer_clock_base { * @nr_events: Total number of timer interrupt events */ struct hrtimer_cpu_base { - spinlock_t lock; + raw_spinlock_t lock; struct lock_class_key lock_key; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; struct list_head cb_pending; @@ -197,6 +197,9 @@ struct hrtimer_cpu_base { int hres_active; unsigned long nr_events; #endif +#ifdef CONFIG_PREEMPT_SOFTIRQS + wait_queue_head_t wait; +#endif }; #ifdef CONFIG_HIGH_RES_TIMERS @@ -279,6 +282,13 @@ static inline int hrtimer_restart(struct return hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); } +/* Softirq preemption could deadlock timer removal */ +#ifdef CONFIG_PREEMPT_SOFTIRQS + extern void hrtimer_wait_for_timer(const struct hrtimer *timer); +#else +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0) +#endif + /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); @@ -314,6 +324,10 @@ static inline u64 hrtimer_forward_now(st return hrtimer_forward(timer, timer->base->get_time(), interval); } +/* Overrun count: */ +extern unsigned long +hrtimer_overrun(struct hrtimer *timer, ktime_t now, ktime_t interval); + /* Precise sleep: */ extern long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, Index: linux-2.6.25.8-rt7/mm/memory.c =================================================================== --- linux-2.6.25.8-rt7.orig/mm/memory.c 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/mm/memory.c 2008-06-23 19:13:58.000000000 -0400 @@ -271,18 +271,52 @@ void free_pgd_range(struct mmu_gather ** } while (pgd++, addr = next, addr != end); } +#ifdef CONFIG_IA64 +#define tlb_start_addr(tlb) (tlb)->start_addr +#define tlb_end_addr(tlb) (tlb)->end_addr +#else +#define tlb_start_addr(tlb) 0UL /* only ia64 really uses it */ +#define tlb_end_addr(tlb) 0UL /* only ia64 really uses it */ +#endif + void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling) { +#ifdef CONFIG_PREEMPT + struct vm_area_struct *unlink = vma; + int fullmm = (*tlb)->fullmm; + + if (!vma) /* Sometimes when exiting after an oops */ + return; +#ifndef CONFIG_PREEMPT_RT + if (vma->vm_next) +#endif + tlb_finish_mmu(*tlb, tlb_start_addr(*tlb), tlb_end_addr(*tlb)); + /* + * Hide vma from rmap and vmtruncate before freeeing pgtables, + * with preemption enabled, except when unmapping just one area. 
+ */ + while (unlink) { + anon_vma_unlink(unlink); + unlink_file_vma(unlink); + unlink = unlink->vm_next; + } +#ifndef CONFIG_PREEMPT_RT + if (vma->vm_next) +#endif + *tlb = tlb_gather_mmu(vma->vm_mm, fullmm); +#endif while (vma) { struct vm_area_struct *next = vma->vm_next; unsigned long addr = vma->vm_start; +#ifndef CONFIG_PREEMPT /* * Hide vma from rmap and vmtruncate before freeing pgtables */ anon_vma_unlink(vma); unlink_file_vma(vma); +#endif if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, @@ -295,8 +329,10 @@ void free_pgtables(struct mmu_gather **t && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; +#ifndef CONFIG_PREEMPT anon_vma_unlink(vma); unlink_file_vma(vma); +#endif } free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); @@ -781,10 +817,13 @@ static unsigned long unmap_page_range(st return addr; } -#ifdef CONFIG_PREEMPT +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_RT) # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) #else -/* No preempt: go for improved straight-line efficiency */ +/* + * No preempt: go for improved straight-line efficiency + * on PREEMPT_RT this is not a critical latency-path. + */ # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) #endif @@ -2503,6 +2542,28 @@ unlock: return 0; } +void pagefault_disable(void) +{ + current->pagefault_disabled++; + /* + * make sure to have issued the store before a pagefault + * can hit. + */ + barrier(); +} +EXPORT_SYMBOL(pagefault_disable); + +void pagefault_enable(void) +{ + /* + * make sure to issue those last loads/stores before enabling + * the pagefault handler again. + */ + barrier(); + current->pagefault_disabled--; +} +EXPORT_SYMBOL(pagefault_enable); + /* * By the time we get here, we already hold the mm semaphore */ Index: linux-2.6.25.8-rt7/arch/x86/kernel/io_apic_32.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/io_apic_32.c 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/io_apic_32.c 2008-06-23 19:13:42.000000000 -0400 @@ -55,8 +55,8 @@ atomic_t irq_mis_count; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(vector_lock); int timer_over_8254 __initdata = 1; @@ -260,14 +260,14 @@ static void __unmask_IO_APIC_irq (unsign __modify_IO_APIC_irq(irq, 0, 0x00010000); } -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq (unsigned int irq) +/* trigger = 0 (edge mode) */ +static void __pcix_mask_IO_APIC_irq (unsigned int irq) { - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); + __modify_IO_APIC_irq(irq, 0, 0x00008000); } -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq (unsigned int irq) +/* mask = 0, trigger = 1 (level mode) */ +static void __pcix_unmask_IO_APIC_irq (unsigned int irq) { __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); } @@ -290,6 +290,24 @@ static void unmask_IO_APIC_irq (unsigned spin_unlock_irqrestore(&ioapic_lock, flags); } +static void pcix_mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __pcix_mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void pcix_unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + 
__pcix_unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; @@ -1235,23 +1253,28 @@ static int assign_irq_vector(int irq) return vector; } + static struct irq_chip ioapic_chip; +static struct irq_chip pcix_ioapic_chip; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 -static void ioapic_register_intr(int irq, int vector, unsigned long trigger) +static void ioapic_register_intr(int irq, int vector, unsigned long trigger, + int pcix) { + struct irq_chip *chip = pcix ? &pcix_ioapic_chip : &ioapic_chip; + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) { irq_desc[irq].status |= IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); + set_irq_chip_and_handler_name(irq, chip, handle_fasteoi_irq, + pcix ? "pcix-fasteoi" : "fasteoi"); } else { irq_desc[irq].status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); + set_irq_chip_and_handler_name(irq, chip, handle_edge_irq, + pcix ? "pcix-edge" : "edge"); } set_intr_gate(vector, interrupt[irq]); } @@ -1321,7 +1344,8 @@ static void __init setup_IO_APIC_irqs(vo if (IO_APIC_IRQ(irq)) { vector = assign_irq_vector(irq); entry.vector = vector; - ioapic_register_intr(irq, vector, IOAPIC_AUTO); + ioapic_register_intr(irq, vector, IOAPIC_AUTO, + apic > 0); if (!apic && (irq < 16)) disable_8259A_irq(irq); @@ -1492,7 +1516,7 @@ void __init print_IO_APIC(void) return; } -#if 0 +#if 1 static void print_APIC_bitfield (int base) { @@ -1899,7 +1923,8 @@ static int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. 
*/ - if (time_after(jiffies, t1 + 4)) + if (time_after(jiffies, t1 + 4) && + time_before(jiffies, t1 + 16)) return 1; return 0; @@ -1988,8 +2013,10 @@ static void ack_ioapic_quirk_irq(unsigne if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + /* mask = 1, trigger = 0 */ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); + /* mask = 0, trigger = 1 */ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); spin_unlock(&ioapic_lock); } } @@ -2014,6 +2041,18 @@ static struct irq_chip ioapic_chip __rea .retrigger = ioapic_retrigger_irq, }; +static struct irq_chip pcix_ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = pcix_mask_IO_APIC_irq, + .unmask = pcix_unmask_IO_APIC_irq, + .ack = ack_ioapic_irq, + .eoi = ack_ioapic_irq, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; static inline void init_IO_APIC_traps(void) { @@ -2826,7 +2865,7 @@ int io_apic_set_pci_routing (int ioapic, mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, edge_level, active_high_low); - ioapic_register_intr(irq, entry.vector, edge_level); + ioapic_register_intr(irq, entry.vector, edge_level, ioapic > 0); if (!ioapic && (irq < 16)) disable_8259A_irq(irq); Index: linux-2.6.25.8-rt7/arch/x86/pci/Makefile_32 =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/pci/Makefile_32 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/pci/Makefile_32 2008-06-23 19:13:14.000000000 -0400 @@ -4,8 +4,9 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_32.o direct.o mmconfig-shared.o obj-$(CONFIG_PCI_DIRECT) += direct.o +obj-$(CONFIG_ACPI) += acpi.o + pci-y := fixup.o -pci-$(CONFIG_ACPI) += acpi.o pci-y += legacy.o irq.o pci-$(CONFIG_X86_VISWS) := visws.o fixup.o Index: linux-2.6.25.8-rt7/include/linux/spinlock.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/spinlock.h 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/spinlock.h 2008-06-23 19:14:26.000000000 -0400 @@ -44,6 +44,42 @@ * builds the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. 
+ * + * + * Public types and naming conventions: + * ------------------------------------ + * spinlock_t: type: sleep-lock + * raw_spinlock_t: type: spin-lock (debug) + * + * spin_lock([raw_]spinlock_t): API: acquire lock, both types + * + * + * Internal types and naming conventions: + * ------------------------------------- + * __raw_spinlock_t: type: lowlevel spin-lock + * + * _spin_lock(struct rt_mutex): API: acquire sleep-lock + * __spin_lock(raw_spinlock_t): API: acquire spin-lock (highlevel) + * _raw_spin_lock(raw_spinlock_t): API: acquire spin-lock (debug) + * __raw_spin_lock(__raw_spinlock_t): API: acquire spin-lock (lowlevel) + * + * + * spin_lock(raw_spinlock_t) translates into the following chain of + * calls/inlines/macros, if spin-lock debugging is enabled: + * + * spin_lock() [include/linux/spinlock.h] + * -> __spin_lock() [kernel/spinlock.c] + * -> _raw_spin_lock() [lib/spinlock_debug.c] + * -> __raw_spin_lock() [include/asm/spinlock.h] + * + * spin_lock(spinlock_t) translates into the following chain of + * calls/inlines/macros: + * + * spin_lock() [include/linux/spinlock.h] + * -> _spin_lock() [include/linux/spinlock.h] + * -> rt_spin_lock() [kernel/rtmutex.c] + * -> rt_spin_lock_fastlock() [kernel/rtmutex.c] + * -> rt_spin_lock_slowlock() [kernel/rtmutex.c] */ #include @@ -51,29 +87,15 @@ #include #include #include +#include #include #include +#include +#include #include /* - * Must define these before including other files, inline functions need them - */ -#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME - -#define LOCK_SECTION_START(extra) \ - ".subsection 1\n\t" \ - extra \ - ".ifndef " LOCK_SECTION_NAME "\n\t" \ - LOCK_SECTION_NAME ":\n\t" \ - ".endif\n" - -#define LOCK_SECTION_END \ - ".previous\n\t" - -#define __lockfunc __attribute__((section(".spinlock.text"))) - -/* * Pull the raw_spinlock_t and raw_rwlock_t definitions: */ #include @@ -89,36 +111,10 @@ extern int __lockfunc generic__raw_read_ # include #endif -#ifdef CONFIG_DEBUG_SPINLOCK - extern void __spin_lock_init(spinlock_t *lock, const char *name, - struct lock_class_key *key); -# define spin_lock_init(lock) \ -do { \ - static struct lock_class_key __key; \ - \ - __spin_lock_init((lock), #lock, &__key); \ -} while (0) - -#else -# define spin_lock_init(lock) \ - do { *(lock) = SPIN_LOCK_UNLOCKED; } while (0) -#endif - -#ifdef CONFIG_DEBUG_SPINLOCK - extern void __rwlock_init(rwlock_t *lock, const char *name, - struct lock_class_key *key); -# define rwlock_init(lock) \ -do { \ - static struct lock_class_key __key; \ - \ - __rwlock_init((lock), #lock, &__key); \ -} while (0) -#else -# define rwlock_init(lock) \ - do { *(lock) = RW_LOCK_UNLOCKED; } while (0) -#endif - -#define spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) +/* + * Pull the RT types: + */ +#include #ifdef CONFIG_GENERIC_LOCKBREAK #define spin_is_contended(lock) ((lock)->break_lock) @@ -126,12 +122,6 @@ do { \ #define spin_is_contended(lock) __raw_spin_is_contended(&(lock)->raw_lock) #endif -/** - * spin_unlock_wait - wait until the spinlock gets unlocked - * @lock: the spinlock in question. 
- */ -#define spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock) - /* * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: */ @@ -142,16 +132,16 @@ do { \ #endif #ifdef CONFIG_DEBUG_SPINLOCK - extern void _raw_spin_lock(spinlock_t *lock); -#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock) - extern int _raw_spin_trylock(spinlock_t *lock); - extern void _raw_spin_unlock(spinlock_t *lock); - extern void _raw_read_lock(rwlock_t *lock); - extern int _raw_read_trylock(rwlock_t *lock); - extern void _raw_read_unlock(rwlock_t *lock); - extern void _raw_write_lock(rwlock_t *lock); - extern int _raw_write_trylock(rwlock_t *lock); - extern void _raw_write_unlock(rwlock_t *lock); + extern __lockfunc void _raw_spin_lock(raw_spinlock_t *lock); +# define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock) + extern __lockfunc int _raw_spin_trylock(raw_spinlock_t *lock); + extern __lockfunc void _raw_spin_unlock(raw_spinlock_t *lock); + extern __lockfunc void _raw_read_lock(raw_rwlock_t *lock); + extern __lockfunc int _raw_read_trylock(raw_rwlock_t *lock); + extern __lockfunc void _raw_read_unlock(raw_rwlock_t *lock); + extern __lockfunc void _raw_write_lock(raw_rwlock_t *lock); + extern __lockfunc int _raw_write_trylock(raw_rwlock_t *lock); + extern __lockfunc void _raw_write_unlock(raw_rwlock_t *lock); #else # define _raw_spin_lock(lock) __raw_spin_lock(&(lock)->raw_lock) # define _raw_spin_lock_flags(lock, flags) \ @@ -166,141 +156,460 @@ do { \ # define _raw_write_unlock(rwlock) __raw_write_unlock(&(rwlock)->raw_lock) #endif -#define read_can_lock(rwlock) __raw_read_can_lock(&(rwlock)->raw_lock) -#define write_can_lock(rwlock) __raw_write_can_lock(&(rwlock)->raw_lock) +extern int __bad_spinlock_type(void); +extern int __bad_rwlock_type(void); + +extern void +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key); + +extern void __lockfunc rt_spin_lock(spinlock_t *lock); +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass); +extern void __lockfunc rt_spin_unlock(spinlock_t *lock); +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock); +extern int __lockfunc +rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags); +extern int __lockfunc rt_spin_trylock(spinlock_t *lock); +extern int _atomic_dec_and_spin_lock(spinlock_t *lock, atomic_t *atomic); /* - * Define the various spin_lock and rw_lock methods. Note we define these - * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various - * methods are defined as nops in the case they are not required. + * lockdep-less calls, for derived types like rwlock: + * (for trylock they can use rt_mutex_trylock() directly. 
*/ -#define spin_trylock(lock) __cond_lock(lock, _spin_trylock(lock)) -#define read_trylock(lock) __cond_lock(lock, _read_trylock(lock)) -#define write_trylock(lock) __cond_lock(lock, _write_trylock(lock)) +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock); +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); -#define spin_lock(lock) _spin_lock(lock) - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define spin_lock_nested(lock, subclass) _spin_lock_nested(lock, subclass) +#ifdef CONFIG_PREEMPT_RT +# define _spin_lock(l) rt_spin_lock(l) +# define _spin_lock_nested(l, s) rt_spin_lock_nested(l, s) +# define _spin_lock_bh(l) rt_spin_lock(l) +# define _spin_lock_irq(l) rt_spin_lock(l) +# define _spin_unlock(l) rt_spin_unlock(l) +# define _spin_unlock_no_resched(l) rt_spin_unlock(l) +# define _spin_unlock_bh(l) rt_spin_unlock(l) +# define _spin_unlock_irq(l) rt_spin_unlock(l) +# define _spin_unlock_irqrestore(l, f) rt_spin_unlock(l) +static inline unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +{ + rt_spin_lock(lock); + return 0; +} +static inline unsigned long __lockfunc +_spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +{ + rt_spin_lock_nested(lock, subclass); + return 0; +} #else -# define spin_lock_nested(lock, subclass) _spin_lock(lock) +static inline unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +{ + return 0; +} +static inline unsigned long __lockfunc +_spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +{ + return 0; +} +# define _spin_lock(l) do { } while (0) +# define _spin_lock_nested(l, s) do { } while (0) +# define _spin_lock_bh(l) do { } while (0) +# define _spin_lock_irq(l) do { } while (0) +# define _spin_unlock(l) do { } while (0) +# define _spin_unlock_no_resched(l) do { } while (0) +# define _spin_unlock_bh(l) do { } while (0) +# define _spin_unlock_irq(l) do { } while (0) +# define _spin_unlock_irqrestore(l, f) do { } while (0) #endif -#define write_lock(lock) _write_lock(lock) -#define read_lock(lock) _read_lock(lock) +#define _spin_lock_init(sl, n, f, l) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_spin_lock_init(sl, n, &__key); \ +} while (0) -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +# ifdef CONFIG_PREEMPT_RT +# define _spin_can_lock(l) (!rt_mutex_is_locked(&(l)->lock)) +# define _spin_is_locked(l) rt_mutex_is_locked(&(l)->lock) +# define _spin_unlock_wait(l) rt_spin_unlock_wait(l) + +# define _spin_trylock(l) rt_spin_trylock(l) +# define _spin_trylock_bh(l) rt_spin_trylock(l) +# define _spin_trylock_irq(l) rt_spin_trylock(l) +# define _spin_trylock_irqsave(l,f) rt_spin_trylock_irqsave(l, f) +# else + + extern int this_should_never_be_called_on_non_rt(spinlock_t *lock); +# define TSNBCONRT(l) this_should_never_be_called_on_non_rt(l) +# define _spin_can_lock(l) TSNBCONRT(l) +# define _spin_is_locked(l) TSNBCONRT(l) +# define _spin_unlock_wait(l) TSNBCONRT(l) + +# define _spin_trylock(l) TSNBCONRT(l) +# define _spin_trylock_bh(l) TSNBCONRT(l) +# define _spin_trylock_irq(l) TSNBCONRT(l) +# define _spin_trylock_irqsave(l,f) TSNBCONRT(l) +#endif -#define spin_lock_irqsave(lock, flags) flags = _spin_lock_irqsave(lock) -#define read_lock_irqsave(lock, flags) flags = _read_lock_irqsave(lock) -#define write_lock_irqsave(lock, flags) flags = _write_lock_irqsave(lock) +extern void __lockfunc rt_write_lock(rwlock_t *rwlock); +extern void __lockfunc rt_read_lock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t 
*trylock, + unsigned long *flags); +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock); +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock); +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock); +extern void +__rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key); + +#define _rwlock_init(rwl, n, f, l) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_rwlock_init(rwl, n, &__key); \ +} while (0) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -#define spin_lock_irqsave_nested(lock, flags, subclass) \ - flags = _spin_lock_irqsave_nested(lock, subclass) +#ifdef CONFIG_PREEMPT_RT +# define rt_read_can_lock(rwl) (!rt_mutex_is_locked(&(rwl)->lock)) +# define rt_write_can_lock(rwl) ((rwl)->owners.owner == NULL) #else -#define spin_lock_irqsave_nested(lock, flags, subclass) \ - flags = _spin_lock_irqsave(lock) + extern int rt_rwlock_can_lock_never_call_on_non_rt(rwlock_t *rwlock); +# define rt_read_can_lock(rwl) rt_rwlock_can_lock_never_call_on_non_rt(rwl) +# define rt_write_can_lock(rwl) rt_rwlock_can_lock_never_call_on_non_rt(rwl) #endif +# define _read_can_lock(rwl) rt_read_can_lock(rwl) +# define _write_can_lock(rwl) rt_write_can_lock(rwl) + +# define _read_trylock(rwl) rt_read_trylock(rwl) +# define _write_trylock(rwl) rt_write_trylock(rwl) +# define _write_trylock_irqsave(rwl, flags) \ + rt_write_trylock_irqsave(rwl, flags) + +# define _read_lock(rwl) rt_read_lock(rwl) +# define _write_lock(rwl) rt_write_lock(rwl) +# define _read_unlock(rwl) rt_read_unlock(rwl) +# define _write_unlock(rwl) rt_write_unlock(rwl) + +# define _read_lock_bh(rwl) rt_read_lock(rwl) +# define _write_lock_bh(rwl) rt_write_lock(rwl) +# define _read_unlock_bh(rwl) rt_read_unlock(rwl) +# define _write_unlock_bh(rwl) rt_write_unlock(rwl) + +# define _read_lock_irq(rwl) rt_read_lock(rwl) +# define _write_lock_irq(rwl) rt_write_lock(rwl) +# define _read_unlock_irq(rwl) rt_read_unlock(rwl) +# define _write_unlock_irq(rwl) rt_write_unlock(rwl) + +# define _read_lock_irqsave(rwl) rt_read_lock_irqsave(rwl) +# define _write_lock_irqsave(rwl) rt_write_lock_irqsave(rwl) + +# define _read_unlock_irqrestore(rwl, f) rt_read_unlock(rwl) +# define _write_unlock_irqrestore(rwl, f) rt_write_unlock(rwl) + +#ifdef CONFIG_DEBUG_SPINLOCK + extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, + struct lock_class_key *key); +# define _raw_spin_lock_init(lock, name, file, line) \ +do { \ + static struct lock_class_key __key; \ + \ + __raw_spin_lock_init((lock), #lock, &__key); \ +} while (0) + #else +#define __raw_spin_lock_init(lock) \ + do { *(lock) = RAW_SPIN_LOCK_UNLOCKED(lock); } while (0) +# define _raw_spin_lock_init(lock, name, file, line) __raw_spin_lock_init(lock) +#endif -#define spin_lock_irqsave(lock, flags) _spin_lock_irqsave(lock, flags) -#define read_lock_irqsave(lock, flags) _read_lock_irqsave(lock, flags) -#define write_lock_irqsave(lock, flags) _write_lock_irqsave(lock, flags) -#define spin_lock_irqsave_nested(lock, flags, subclass) \ - spin_lock_irqsave(lock, flags) +/* + * PICK_SPIN_OP()/PICK_RW_OP() are simple redirectors for PICK_FUNCTION + */ +#define PICK_SPIN_OP(...) \ + PICK_FUNCTION(raw_spinlock_t *, spinlock_t *, ##__VA_ARGS__) +#define PICK_SPIN_OP_RET(...) \ + PICK_FUNCTION_RET(raw_spinlock_t *, spinlock_t *, ##__VA_ARGS__) +#define PICK_RW_OP(...) 
PICK_FUNCTION(raw_rwlock_t *, rwlock_t *, ##__VA_ARGS__) +#define PICK_RW_OP_RET(...) \ + PICK_FUNCTION_RET(raw_rwlock_t *, rwlock_t *, ##__VA_ARGS__) + +#define spin_lock_init(lock) \ + PICK_SPIN_OP(_raw_spin_lock_init, _spin_lock_init, lock, #lock, \ + __FILE__, __LINE__) +#ifdef CONFIG_DEBUG_SPINLOCK + extern void __raw_rwlock_init(raw_rwlock_t *lock, const char *name, + struct lock_class_key *key); +# define _raw_rwlock_init(lock, name, file, line) \ +do { \ + static struct lock_class_key __key; \ + \ + __raw_rwlock_init((lock), #lock, &__key); \ +} while (0) +#else +#define __raw_rwlock_init(lock) \ + do { *(lock) = RAW_RW_LOCK_UNLOCKED(lock); } while (0) +# define _raw_rwlock_init(lock, name, file, line) __raw_rwlock_init(lock) #endif -#define spin_lock_irq(lock) _spin_lock_irq(lock) -#define spin_lock_bh(lock) _spin_lock_bh(lock) +#define rwlock_init(lock) \ + PICK_RW_OP(_raw_rwlock_init, _rwlock_init, lock, #lock, \ + __FILE__, __LINE__) + +#define __spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) -#define read_lock_irq(lock) _read_lock_irq(lock) -#define read_lock_bh(lock) _read_lock_bh(lock) +#define spin_is_locked(lock) \ + PICK_SPIN_OP_RET(__spin_is_locked, _spin_is_locked, lock) -#define write_lock_irq(lock) _write_lock_irq(lock) -#define write_lock_bh(lock) _write_lock_bh(lock) +#define __spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock) + +#define spin_unlock_wait(lock) \ + PICK_SPIN_OP(__spin_unlock_wait, _spin_unlock_wait, lock) /* - * We inline the unlock functions in the nondebug case: + * Define the various spin_lock and rw_lock methods. Note we define these + * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various + * methods are defined as nops in the case they are not required. */ -#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || \ - !defined(CONFIG_SMP) -# define spin_unlock(lock) _spin_unlock(lock) -# define read_unlock(lock) _read_unlock(lock) -# define write_unlock(lock) _write_unlock(lock) -# define spin_unlock_irq(lock) _spin_unlock_irq(lock) -# define read_unlock_irq(lock) _read_unlock_irq(lock) -# define write_unlock_irq(lock) _write_unlock_irq(lock) +#define spin_trylock(lock) \ + __cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock, _spin_trylock, lock)) + +#define read_trylock(lock) \ + __cond_lock(lock, PICK_RW_OP_RET(__read_trylock, _read_trylock, lock)) + +#define write_trylock(lock) \ + __cond_lock(lock, PICK_RW_OP_RET(__write_trylock, _write_trylock, lock)) + +#define write_trylock_irqsave(lock, flags) \ + __cond_lock(lock, PICK_RW_OP_RET(__write_trylock_irqsave, \ + _write_trylock_irqsave, lock, &flags)) + +#define __spin_can_lock(lock) __raw_spin_can_lock(&(lock)->raw_lock) +#define __read_can_lock(lock) __raw_read_can_lock(&(lock)->raw_lock) +#define __write_can_lock(lock) __raw_write_can_lock(&(lock)->raw_lock) + +#define spin_can_lock(lock) \ + __cond_lock(lock, PICK_SPIN_OP_RET(__spin_can_lock, _spin_can_lock,\ + lock)) + +#define read_can_lock(lock) \ + __cond_lock(lock, PICK_RW_OP_RET(__read_can_lock, _read_can_lock, lock)) + +#define write_can_lock(lock) \ + __cond_lock(lock, PICK_RW_OP_RET(__write_can_lock, _write_can_lock,\ + lock)) + +#define spin_lock(lock) PICK_SPIN_OP(__spin_lock, _spin_lock, lock) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define spin_lock_nested(lock, subclass) \ + PICK_SPIN_OP(__spin_lock_nested, _spin_lock_nested, lock, subclass) #else -# define spin_unlock(lock) \ - do {__raw_spin_unlock(&(lock)->raw_lock); __release(lock); } while (0) -# define read_unlock(lock) 
\ - do {__raw_read_unlock(&(lock)->raw_lock); __release(lock); } while (0) -# define write_unlock(lock) \ - do {__raw_write_unlock(&(lock)->raw_lock); __release(lock); } while (0) -# define spin_unlock_irq(lock) \ -do { \ - __raw_spin_unlock(&(lock)->raw_lock); \ - __release(lock); \ - local_irq_enable(); \ -} while (0) -# define read_unlock_irq(lock) \ -do { \ - __raw_read_unlock(&(lock)->raw_lock); \ - __release(lock); \ - local_irq_enable(); \ +# define spin_lock_nested(lock, subclass) spin_lock(lock) +#endif + +#define write_lock(lock) PICK_RW_OP(__write_lock, _write_lock, lock) + +#define read_lock(lock) PICK_RW_OP(__read_lock, _read_lock, lock) + +# define spin_lock_irqsave(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + flags = PICK_SPIN_OP_RET(__spin_lock_irqsave, _spin_lock_irqsave, \ + lock); \ } while (0) -# define write_unlock_irq(lock) \ -do { \ - __raw_write_unlock(&(lock)->raw_lock); \ - __release(lock); \ - local_irq_enable(); \ + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define spin_lock_irqsave_nested(lock, flags, subclass) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + flags = PICK_SPIN_OP_RET(__spin_lock_irqsave_nested, \ + _spin_lock_irqsave_nested, lock, subclass); \ } while (0) +#else +# define spin_lock_irqsave_nested(lock, flags, subclass) \ + spin_lock_irqsave(lock, flags) #endif -#define spin_unlock_irqrestore(lock, flags) \ - _spin_unlock_irqrestore(lock, flags) -#define spin_unlock_bh(lock) _spin_unlock_bh(lock) +# define read_lock_irqsave(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + flags = PICK_RW_OP_RET(__read_lock_irqsave, _read_lock_irqsave, lock);\ +} while (0) + +# define write_lock_irqsave(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + flags = PICK_RW_OP_RET(__write_lock_irqsave, _write_lock_irqsave,lock);\ +} while (0) + +#define spin_lock_irq(lock) PICK_SPIN_OP(__spin_lock_irq, _spin_lock_irq, lock) + +#define spin_lock_bh(lock) PICK_SPIN_OP(__spin_lock_bh, _spin_lock_bh, lock) + +#define read_lock_irq(lock) PICK_RW_OP(__read_lock_irq, _read_lock_irq, lock) -#define read_unlock_irqrestore(lock, flags) \ - _read_unlock_irqrestore(lock, flags) -#define read_unlock_bh(lock) _read_unlock_bh(lock) +#define read_lock_bh(lock) PICK_RW_OP(__read_lock_bh, _read_lock_bh, lock) -#define write_unlock_irqrestore(lock, flags) \ - _write_unlock_irqrestore(lock, flags) -#define write_unlock_bh(lock) _write_unlock_bh(lock) +#define write_lock_irq(lock) PICK_RW_OP(__write_lock_irq, _write_lock_irq, lock) -#define spin_trylock_bh(lock) __cond_lock(lock, _spin_trylock_bh(lock)) +#define write_lock_bh(lock) PICK_RW_OP(__write_lock_bh, _write_lock_bh, lock) -#define spin_trylock_irq(lock) \ -({ \ - local_irq_disable(); \ - spin_trylock(lock) ? 
\ - 1 : ({ local_irq_enable(); 0; }); \ -}) +#define spin_unlock(lock) PICK_SPIN_OP(__spin_unlock, _spin_unlock, lock) + +#define read_unlock(lock) PICK_RW_OP(__read_unlock, _read_unlock, lock) + +#define write_unlock(lock) PICK_RW_OP(__write_unlock, _write_unlock, lock) + +#define spin_unlock_no_resched(lock) \ + PICK_SPIN_OP(__spin_unlock_no_resched, _spin_unlock_no_resched, lock) + +#define spin_unlock_irqrestore(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + PICK_SPIN_OP(__spin_unlock_irqrestore, _spin_unlock_irqrestore, \ + lock, flags); \ +} while (0) + +#define spin_unlock_irq(lock) \ + PICK_SPIN_OP(__spin_unlock_irq, _spin_unlock_irq, lock) +#define spin_unlock_bh(lock) \ + PICK_SPIN_OP(__spin_unlock_bh, _spin_unlock_bh, lock) + +#define read_unlock_irqrestore(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + PICK_RW_OP(__read_unlock_irqrestore, _read_unlock_irqrestore, \ + lock, flags); \ +} while (0) + +#define read_unlock_irq(lock) \ + PICK_RW_OP(__read_unlock_irq, _read_unlock_irq, lock) +#define read_unlock_bh(lock) PICK_RW_OP(__read_unlock_bh, _read_unlock_bh, lock) + +#define write_unlock_irqrestore(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + PICK_RW_OP(__write_unlock_irqrestore, _write_unlock_irqrestore, \ + lock, flags); \ +} while (0) +#define write_unlock_irq(lock) \ + PICK_RW_OP(__write_unlock_irq, _write_unlock_irq, lock) + +#define write_unlock_bh(lock) \ + PICK_RW_OP(__write_unlock_bh, _write_unlock_bh, lock) + +#define spin_trylock_bh(lock) \ + __cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock_bh, _spin_trylock_bh,\ + lock)) + +#define spin_trylock_irq(lock) \ + __cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock_irq, \ + _spin_trylock_irq, lock)) #define spin_trylock_irqsave(lock, flags) \ -({ \ - local_irq_save(flags); \ - spin_trylock(lock) ? \ - 1 : ({ local_irq_restore(flags); 0; }); \ -}) + __cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock_irqsave, \ + _spin_trylock_irqsave, lock, &flags)) -#define write_trylock_irqsave(lock, flags) \ -({ \ - local_irq_save(flags); \ - write_trylock(lock) ? \ - 1 : ({ local_irq_restore(flags); 0; }); \ -}) +/* "lock on reference count zero" */ +#ifndef ATOMIC_DEC_AND_LOCK +# include + extern int __atomic_dec_and_spin_lock(raw_spinlock_t *lock, atomic_t *atomic); +#endif + +#define atomic_dec_and_lock(atomic, lock) \ + __cond_lock(lock, PICK_SPIN_OP_RET(__atomic_dec_and_spin_lock, \ + _atomic_dec_and_spin_lock, lock, atomic)) + +/* + * bit-based spin_lock() + * + * Don't use this unless you really need to: spin_lock() and spin_unlock() + * are significantly faster. + */ +static inline void bit_spin_lock(int bitnum, unsigned long *addr) +{ + /* + * Assuming the lock is uncontended, this never enters + * the body of the outer loop. If it is contended, then + * within the inner loop a non-atomic test is used to + * busywait with less bus contention for a good time to + * attempt to acquire the lock bit. 
+ */ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) + while (test_and_set_bit(bitnum, addr)) + while (test_bit(bitnum, addr)) + cpu_relax(); +#endif + __acquire(bitlock); +} + +/* + * Return true if it was acquired + */ +static inline int bit_spin_trylock(int bitnum, unsigned long *addr) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) + if (test_and_set_bit(bitnum, addr)) + return 0; +#endif + __acquire(bitlock); + return 1; +} + +/* + * bit-based spin_unlock() + */ +static inline void bit_spin_unlock(int bitnum, unsigned long *addr) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) + BUG_ON(!test_bit(bitnum, addr)); + smp_mb__before_clear_bit(); + clear_bit(bitnum, addr); +#endif + __release(bitlock); +} + +/* + * Return true if the lock is held. + */ +static inline int bit_spin_is_locked(int bitnum, unsigned long *addr) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) + return test_bit(bitnum, addr); +#else + return 1; +#endif +} + +/** + * __raw_spin_can_lock - would __raw_spin_trylock() succeed? + * @lock: the spinlock in question. + */ +#define __raw_spin_can_lock(lock) (!__raw_spin_is_locked(lock)) /* * Locks two spinlocks l1 and l2. * l1_first indicates if spinlock l1 should be taken first. */ -static inline void double_spin_lock(spinlock_t *l1, spinlock_t *l2, - bool l1_first) +static inline void +raw_double_spin_lock(raw_spinlock_t *l1, raw_spinlock_t *l2, bool l1_first) + __acquires(l1) + __acquires(l2) +{ + if (l1_first) { + spin_lock(l1); + spin_lock(l2); + } else { + spin_lock(l2); + spin_lock(l1); + } +} + +static inline void +double_spin_lock(spinlock_t *l1, spinlock_t *l2, bool l1_first) __acquires(l1) __acquires(l2) { @@ -313,13 +622,15 @@ static inline void double_spin_lock(spin } } + /* * Unlocks two spinlocks l1 and l2. * l1_taken_first indicates if spinlock l1 was taken first and therefore * should be released after spinlock l2. */ -static inline void double_spin_unlock(spinlock_t *l1, spinlock_t *l2, - bool l1_taken_first) +static inline void +raw_double_spin_unlock(raw_spinlock_t *l1, raw_spinlock_t *l2, + bool l1_taken_first) __releases(l1) __releases(l2) { @@ -332,27 +643,18 @@ static inline void double_spin_unlock(sp } } -/* - * Pull the atomic_t declaration: - * (asm-mips/atomic.h needs above definitions) - */ -#include -/** - * atomic_dec_and_lock - lock on reaching reference count zero - * @atomic: the atomic counter - * @lock: the spinlock in question - * - * Decrements @atomic by 1. If the result is 0, returns true and locks - * @lock. Returns false for all other cases. - */ -extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); -#define atomic_dec_and_lock(atomic, lock) \ - __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) - -/** - * spin_can_lock - would spin_trylock() succeed? - * @lock: the spinlock in question. 
- */ -#define spin_can_lock(lock) (!spin_is_locked(lock)) +static inline void +double_spin_unlock(spinlock_t *l1, spinlock_t *l2, bool l1_taken_first) + __releases(l1) + __releases(l2) +{ + if (l1_taken_first) { + spin_unlock(l2); + spin_unlock(l1); + } else { + spin_unlock(l1); + spin_unlock(l2); + } +} #endif /* __LINUX_SPINLOCK_H */ Index: linux-2.6.25.8-rt7/kernel/irq/migration.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/irq/migration.c 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/irq/migration.c 2008-06-23 19:13:15.000000000 -0400 @@ -61,6 +61,7 @@ void move_masked_irq(int irq) void move_native_irq(int irq) { struct irq_desc *desc = irq_desc + irq; + int mask = 1; if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; @@ -68,8 +69,17 @@ void move_native_irq(int irq) if (unlikely(desc->status & IRQ_DISABLED)) return; - desc->chip->mask(irq); + /* + * If the irq is already in progress, it should be masked. + * If we unmask it, we might cause an interrupt storm on RT. + */ + if (unlikely(desc->status & IRQ_INPROGRESS)) + mask = 0; + + if (mask) + desc->chip->mask(irq); move_masked_irq(irq); - desc->chip->unmask(irq); + if (mask) + desc->chip->unmask(irq); } Index: linux-2.6.25.8-rt7/arch/x86/kernel/io_apic_64.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/io_apic_64.c 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/io_apic_64.c 2008-06-23 19:14:18.000000000 -0400 @@ -93,8 +93,8 @@ int timer_over_8254 __initdata = 1; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); -DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +DEFINE_RAW_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -207,6 +207,9 @@ static inline void io_apic_sync(unsigned reg ACTION; \ io_apic_modify(entry->apic, reg); \ FINAL; \ + /* Force POST flush by reading: */ \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + \ if (!entry->next) \ break; \ entry = irq_2_pin + entry->next; \ @@ -351,10 +354,11 @@ static void add_pin_to_irq(unsigned int static void name##_IO_APIC_irq (unsigned int irq) \ __DO_ACTION(R, ACTION, FINAL) -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ +DO_ACTION( __mask, 0, |= 0x00010000, ) /* mask = 1 */ +DO_ACTION( __unmask, 0, &= 0xfffeffff, ) /* mask = 0 */ + +DO_ACTION( __pcix_mask, 0, &= 0xffff7fff, ) /* edge */ +DO_ACTION( __pcix_unmask, 0, = (reg & 0xfffeffff) | 0x00008000, ) /* level */ static void mask_IO_APIC_irq (unsigned int irq) { @@ -373,6 +377,23 @@ static void unmask_IO_APIC_irq (unsigned __unmask_IO_APIC_irq(irq); spin_unlock_irqrestore(&ioapic_lock, flags); } +static void pcix_mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __pcix_mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void pcix_unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __pcix_unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { @@ -798,17 +819,20 @@ void __setup_vector_irq(int cpu) static struct irq_chip ioapic_chip; +static struct irq_chip pcix_ioapic_chip; -static 
void ioapic_register_intr(int irq, unsigned long trigger) +static void ioapic_register_intr(int irq, unsigned long trigger, int pcix) { + struct irq_chip *chip = pcix ? &pcix_ioapic_chip : &ioapic_chip; + if (trigger) { irq_desc[irq].status |= IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); + set_irq_chip_and_handler_name(irq, chip, handle_fasteoi_irq, + pcix ? "pcix-fasteoi" : "fasteoi"); } else { irq_desc[irq].status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); + set_irq_chip_and_handler_name(irq, chip, handle_edge_irq, + pcix ? "pcix-edge" : "edge"); } } @@ -853,7 +877,7 @@ static void setup_IO_APIC_irq(int apic, if (trigger) entry.mask = 1; - ioapic_register_intr(irq, trigger); + ioapic_register_intr(irq, trigger, apic > 0); if (irq < 16) disable_8259A_irq(irq); @@ -1442,7 +1466,8 @@ static void ack_apic_level(unsigned int irq_complete_move(irq); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { + if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING) && + !(irq_desc[irq].status & IRQ_INPROGRESS)) { do_unmask_irq = 1; mask_IO_APIC_irq(irq); } @@ -1486,17 +1511,39 @@ static void ack_apic_level(unsigned int move_masked_irq(irq); unmask_IO_APIC_irq(irq); } +#if (defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)) && \ + defined(CONFIG_PREEMPT_HARDIRQS) + /* + * With threaded interrupts, we always have IRQ_INPROGRESS + * when acking. + */ + else if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) + move_masked_irq(irq); +#endif } static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_apic_edge, - .eoi = ack_apic_level, + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +static struct irq_chip pcix_ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = pcix_mask_IO_APIC_irq, + .unmask = pcix_unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, + .set_affinity = set_ioapic_affinity_irq, #endif .retrigger = ioapic_retrigger_irq, }; @@ -1696,7 +1743,6 @@ static inline void __init check_timer(vo */ unmask_IO_APIC_irq(0); if (!no_timer_check && timer_irq_works()) { - nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); setup_nmi(); @@ -1722,7 +1768,6 @@ static inline void __init check_timer(vo setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector); if (timer_irq_works()) { apic_printk(APIC_VERBOSE," works.\n"); - nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); } Index: linux-2.6.25.8-rt7/net/core/flow.c =================================================================== --- linux-2.6.25.8-rt7.orig/net/core/flow.c 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/net/core/flow.c 2008-06-23 19:13:30.000000000 -0400 @@ -40,9 +40,10 @@ atomic_t flow_cache_genid = ATOMIC_INIT( static u32 flow_hash_shift; #define flow_hash_size (1 << flow_hash_shift) -static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; -#define flow_table(cpu) (per_cpu(flow_tables, cpu)) +static 
DEFINE_PER_CPU_LOCKED(struct flow_cache_entry **, flow_tables); + +#define flow_table(cpu) (per_cpu_var_locked(flow_tables, cpu)) static struct kmem_cache *flow_cachep __read_mostly; @@ -169,24 +170,24 @@ static int flow_key_compare(struct flowi void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, flow_resolve_t resolver) { - struct flow_cache_entry *fle, **head; + struct flow_cache_entry **table, *fle, **head = NULL /* shut up GCC */; unsigned int hash; int cpu; local_bh_disable(); - cpu = smp_processor_id(); + table = get_cpu_var_locked(flow_tables, &cpu); fle = NULL; /* Packet really early in init? Making flow_cache_init a * pre-smp initcall would solve this. --RR */ - if (!flow_table(cpu)) + if (!table) goto nocache; if (flow_hash_rnd_recalc(cpu)) flow_new_hash_rnd(cpu); hash = flow_hash_code(key, cpu); - head = &flow_table(cpu)[hash]; + head = &table[hash]; for (fle = *head; fle; fle = fle->next) { if (fle->family == family && fle->dir == dir && @@ -196,6 +197,7 @@ void *flow_cache_lookup(struct flowi *ke if (ret) atomic_inc(fle->object_ref); + put_cpu_var_locked(flow_tables, cpu); local_bh_enable(); return ret; @@ -221,6 +223,8 @@ void *flow_cache_lookup(struct flowi *ke } nocache: + put_cpu_var_locked(flow_tables, cpu); + { int err; void *obj; @@ -250,14 +254,15 @@ nocache: static void flow_cache_flush_tasklet(unsigned long data) { struct flow_flush_info *info = (void *)data; + struct flow_cache_entry **table; int i; int cpu; - cpu = smp_processor_id(); + table = get_cpu_var_locked(flow_tables, &cpu); for (i = 0; i < flow_hash_size; i++) { struct flow_cache_entry *fle; - fle = flow_table(cpu)[i]; + fle = table[i]; for (; fle; fle = fle->next) { unsigned genid = atomic_read(&flow_cache_genid); @@ -268,6 +273,7 @@ static void flow_cache_flush_tasklet(uns atomic_dec(fle->object_ref); } } + put_cpu_var_locked(flow_tables, cpu); if (atomic_dec_and_test(&info->cpuleft)) complete(&info->completion); Index: linux-2.6.25.8-rt7/net/sunrpc/svc.c =================================================================== --- linux-2.6.25.8-rt7.orig/net/sunrpc/svc.c 2008-06-23 19:12:57.000000000 -0400 +++ linux-2.6.25.8-rt7/net/sunrpc/svc.c 2008-06-23 19:13:15.000000000 -0400 @@ -584,7 +584,7 @@ __svc_create_thread(svc_thread_fn func, struct svc_rqst *rqstp; int error = -ENOMEM; int have_oldmask = 0; - cpumask_t oldmask; + cpumask_t uninitialized_var(oldmask); rqstp = svc_prepare_thread(serv, pool); if (IS_ERR(rqstp)) { Index: linux-2.6.25.8-rt7/include/net/netfilter/nf_conntrack.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/net/netfilter/nf_conntrack.h 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/include/net/netfilter/nf_conntrack.h 2008-06-23 19:14:03.000000000 -0400 @@ -63,11 +63,14 @@ union nf_conntrack_help { #ifdef CONFIG_NETFILTER_DEBUG #define NF_CT_ASSERT(x) \ do { \ - if (!(x)) \ + if (!(x)) { \ /* Wooah! I'm tripping my conntrack in a frenzy of \ netplay... 
*/ \ printk("NF_CT_ASSERT: %s:%i(%s)\n", \ __FILE__, __LINE__, __FUNCTION__); \ + if (printk_ratelimit()) \ + WARN_ON(1); \ + } \ } while(0) #else #define NF_CT_ASSERT(x) @@ -255,11 +258,11 @@ extern atomic_t nf_conntrack_count; extern int nf_conntrack_max; DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); -#define NF_CT_STAT_INC(count) (__get_cpu_var(nf_conntrack_stat).count++) +#define NF_CT_STAT_INC(count) (__raw_get_cpu_var(nf_conntrack_stat).count++) #define NF_CT_STAT_INC_ATOMIC(count) \ do { \ local_bh_disable(); \ - __get_cpu_var(nf_conntrack_stat).count++; \ + __raw_get_cpu_var(nf_conntrack_stat).count++; \ local_bh_enable(); \ } while (0) Index: linux-2.6.25.8-rt7/arch/x86/kernel/crash.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/crash.c 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/crash.c 2008-06-23 19:13:16.000000000 -0400 @@ -78,14 +78,6 @@ static int crash_nmi_callback(struct not return 1; } -static void smp_send_nmi_allbutself(void) -{ - cpumask_t mask = cpu_online_map; - cpu_clear(safe_smp_processor_id(), mask); - if (!cpus_empty(mask)) - send_IPI_mask(mask, NMI_VECTOR); -} - static struct notifier_block crash_nmi_nb = { .notifier_call = crash_nmi_callback, }; Index: linux-2.6.25.8-rt7/include/asm-x86/apic.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-x86/apic.h 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-x86/apic.h 2008-06-23 19:13:16.000000000 -0400 @@ -137,4 +137,6 @@ static inline void lapic_shutdown(void) #endif /* !CONFIG_X86_LOCAL_APIC */ +extern void smp_send_nmi_allbutself(void); + #endif /* __ASM_APIC_H */ Index: linux-2.6.25.8-rt7/include/linux/profile.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/profile.h 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/profile.h 2008-06-23 19:13:44.000000000 -0400 @@ -6,16 +6,18 @@ #include #include #include +#include #include #include extern int prof_on __read_mostly; -#define CPU_PROFILING 1 -#define SCHED_PROFILING 2 -#define SLEEP_PROFILING 3 -#define KVM_PROFILING 4 +#define CPU_PROFILING 1 +#define SCHED_PROFILING 2 +#define SLEEP_PROFILING 3 +#define KVM_PROFILING 4 +#define PREEMPT_PROFILING 5 struct proc_dir_entry; struct pt_regs; @@ -23,6 +25,7 @@ struct notifier_block; /* init basic kernel profiler */ void __init profile_init(void); +void __profile_tick(int type, struct pt_regs *regs); void profile_tick(int); /* @@ -53,6 +56,8 @@ enum profile_type { PROFILE_MUNMAP }; +extern int prof_pid; + #ifdef CONFIG_PROFILING struct task_struct; Index: linux-2.6.25.8-rt7/kernel/profile.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/profile.c 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/profile.c 2008-06-23 19:13:48.000000000 -0400 @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,7 @@ int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); static cpumask_t prof_cpu_mask = CPU_MASK_ALL; +int prof_pid = -1; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); @@ -408,16 +410,20 @@ void profile_hits(int type, void *__pc, #endif /* !CONFIG_SMP */ EXPORT_SYMBOL_GPL(profile_hits); -void profile_tick(int type) +void __profile_tick(int type, struct 
pt_regs *regs) { - struct pt_regs *regs = get_irq_regs(); - if (type == CPU_PROFILING && timer_hook) timer_hook(regs); - if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) + if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask) && + (prof_pid == -1 || prof_pid == current->pid)) profile_hit(type, (void *)profile_pc(regs)); } +void profile_tick(int type) +{ + return __profile_tick(type, get_irq_regs()); +} + #ifdef CONFIG_PROC_FS #include #include Index: linux-2.6.25.8-rt7/kernel/time/tick-common.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/time/tick-common.c 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/time/tick-common.c 2008-06-23 19:13:49.000000000 -0400 @@ -32,7 +32,7 @@ DEFINE_PER_CPU(struct tick_device, tick_ ktime_t tick_next_period; ktime_t tick_period; int tick_do_timer_cpu __read_mostly = -1; -DEFINE_SPINLOCK(tick_device_lock); +DEFINE_RAW_SPINLOCK(tick_device_lock); /* * Debugging: see timer_list.c @@ -68,7 +68,6 @@ static void tick_periodic(int cpu) } update_process_times(user_mode(get_irq_regs())); - profile_tick(CPU_PROFILING); } /* Index: linux-2.6.25.8-rt7/kernel/time/tick-sched.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/time/tick-sched.c 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/time/tick-sched.c 2008-06-23 19:13:50.000000000 -0400 @@ -219,10 +219,12 @@ void tick_nohz_stop_sched_tick(void) if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; - if (need_resched()) + if (need_resched() || need_resched_delayed()) goto end; cpu = smp_processor_id(); + +#ifndef CONFIG_PREEMPT_RT if (unlikely(local_softirq_pending())) { static int ratelimit; @@ -232,6 +234,7 @@ void tick_nohz_stop_sched_tick(void) ratelimit++; } } +#endif ts->idle_calls++; /* Read jiffies and the time when jiffies were updated last */ @@ -476,7 +479,6 @@ static void tick_nohz_handler(struct clo } update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); /* Do not restart, when we are in the idle loop */ if (ts->tick_stopped) @@ -583,7 +585,6 @@ static enum hrtimer_restart tick_sched_t ts->idle_jiffies++; } update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); } /* Do not restart, when we are in the idle loop */ Index: linux-2.6.25.8-rt7/arch/ppc/boot/Makefile =================================================================== --- linux-2.6.25.8-rt7.orig/arch/ppc/boot/Makefile 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/ppc/boot/Makefile 2008-06-23 19:13:16.000000000 -0400 @@ -15,6 +15,15 @@ # KBUILD_CFLAGS used when building rest of boot (takes effect recursively) KBUILD_CFLAGS += -fno-builtin -D__BOOTER__ -Iarch/$(ARCH)/boot/include + +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +KBUILD_CFLAGS := $(subst ${pg_flag},${space},${KBUILD_CFLAGS}) +endif + HOSTCFLAGS += -Iarch/$(ARCH)/boot/include BOOT_TARGETS = zImage zImage.initrd znetboot znetboot.initrd Index: linux-2.6.25.8-rt7/arch/arm/boot/compressed/head.S =================================================================== --- linux-2.6.25.8-rt7.orig/arch/arm/boot/compressed/head.S 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/arm/boot/compressed/head.S 2008-06-23 19:13:16.000000000 -0400 @@ -941,6 +941,19 @@ memdump: mov r12, r0 #endif .ltorg +#ifdef CONFIG_MCOUNT +/* 
CONFIG_MCOUNT causes boot header to be built with -pg requiring this + * trampoline + */ + .text + .align 0 + .type mcount %function + .global mcount +mcount: + mov pc, lr @ just return +#endif + + reloc_end: .align Index: linux-2.6.25.8-rt7/arch/arm/kernel/entry-common.S =================================================================== --- linux-2.6.25.8-rt7.orig/arch/arm/kernel/entry-common.S 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/arm/kernel/entry-common.S 2008-06-23 19:13:29.000000000 -0400 @@ -3,6 +3,8 @@ * * Copyright (C) 2000 Russell King * + * FUNCTION_TRACE/mcount support (C) 2005 Timesys john.cooper@timesys.com + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -44,7 +46,7 @@ ret_fast_syscall: fast_work_pending: str r0, [sp, #S_R0+S_OFF]! @ returned r0 work_pending: - tst r1, #_TIF_NEED_RESCHED + tst r1, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED bne work_resched tst r1, #_TIF_SIGPENDING beq no_work_pending @@ -54,7 +56,8 @@ work_pending: b ret_slow_syscall @ Check work again work_resched: - bl schedule + bl __schedule + /* * "slow" syscall return path. "why" tells us if this was a real syscall. */ @@ -394,6 +397,114 @@ ENTRY(sys_oabi_call_table) #include "calls.S" #undef ABI #undef OBSOLETE +#endif + +#ifdef CONFIG_FRAME_POINTER + +#ifdef CONFIG_MCOUNT +/* + * At the point where we are in mcount() we maintain the + * frame of the prologue code and keep the call to mcount() + * out of the stack frame list: + + saved pc <---\ caller of instrumented routine + saved lr | + ip/prev_sp | + fp -----^ | + : | + | + -> saved pc | instrumented routine + | saved lr | + | ip/prev_sp | + | fp ---------/ + | : + | + | mcount + | saved pc + | saved lr + | ip/prev sp + -- fp + r3 + r2 + r1 + sp-> r0 + : + */ + + .text + .align 0 + .type mcount %function + .global mcount + +/* gcc -pg generated FUNCTION_PROLOGUE references mcount() + * and has already created the stack frame invocation for + * the routine we have been called to instrument. We create + * a complete frame nevertheless, as we want to use the same + * call to mcount() from c code. + */ +mcount: + + ldr ip, =mcount_enabled @ leave early, if disabled + ldr ip, [ip] + cmp ip, #0 + moveq pc, lr + + mov ip, sp + stmdb sp!, {r0 - r3, fp, ip, lr, pc} @ create stack frame + + mov r2, =mcount_trace_function + + ldr r1, [fp, #-4] @ get lr (the return address + @ of the caller of the + @ instrumented function) + mov r0, lr @ get lr - (the return address + @ of the instrumented function) + + sub fp, ip, #4 @ point fp at this frame + + bl r2 +1: + ldmdb fp, {r0 - r3, fp, sp, pc} @ pop entry frame and return + +#endif + +/* ARM replacement for unsupported gcc __builtin_return_address(n) + * where 0 < n. n == 0 is supported here as well. + * + * Walk up the stack frame until the desired frame is found or a NULL + * fp is encountered, return NULL in the latter case. + * + * Note: it is possible under code optimization for the stack invocation + * of an ancestor function (level N) to be removed before calling a + * descendant function (level N+1). No easy means is available to deduce + * this scenario with the result being [for example] caller_addr(0) when + * called from level N+1 returning level N-1 rather than the expected + * level N. 
This optimization issue appears isolated to the case of + * a call to a level N+1 routine made at the tail end of a level N + * routine -- the level N frame is deleted and a simple branch is made + * to the level N+1 routine. + */ + + .text + .align 0 + .type arm_return_addr %function + .global arm_return_addr + +arm_return_addr: + mov ip, r0 + mov r0, fp +3: + cmp r0, #0 + beq 1f @ frame list hit end, bail + cmp ip, #0 + beq 2f @ reached desired frame + ldr r0, [r0, #-12] @ else continue, get next fp + sub ip, ip, #1 + b 3b +2: + ldr r0, [r0, #-4] @ get target return address +1: + mov pc, lr #endif Index: linux-2.6.25.8-rt7/arch/arm/kernel/fiq.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/arm/kernel/fiq.c 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/arm/kernel/fiq.c 2008-06-23 19:13:16.000000000 -0400 @@ -89,7 +89,7 @@ void set_fiq_handler(void *start, unsign * disable irqs for the duration. Note - these functions are almost * entirely coded in assembly. */ -void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( @@ -107,7 +107,7 @@ void __attribute__((naked)) set_fiq_regs : "r" (®s->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE)); } -void __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( Index: linux-2.6.25.8-rt7/arch/arm/mm/copypage-v4mc.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/arm/mm/copypage-v4mc.c 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/arm/mm/copypage-v4mc.c 2008-06-23 19:13:35.000000000 -0400 @@ -30,7 +30,7 @@ #define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \ L_PTE_CACHEABLE) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * ARMv4 mini-dcache optimised copy_user_page @@ -44,7 +44,7 @@ static DEFINE_SPINLOCK(minicache_lock); * instruction. If your processor does not supply this, you have to write your * own copy_user_page that does the right thing. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { asm volatile( @@ -88,7 +88,7 @@ void v4_mc_copy_user_page(void *kto, con /* * ARMv4 optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) v4_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux-2.6.25.8-rt7/arch/arm/mm/copypage-xscale.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/arm/mm/copypage-xscale.c 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/arm/mm/copypage-xscale.c 2008-06-23 19:13:35.000000000 -0400 @@ -32,7 +32,7 @@ #define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \ L_PTE_CACHEABLE) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * XScale mini-dcache optimised copy_user_page @@ -42,7 +42,7 @@ static DEFINE_SPINLOCK(minicache_lock); * Dcache aliasing issue. The writes will be forwarded to the write buffer, * and merged as appropriate. 
*/ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { /* @@ -110,7 +110,7 @@ void xscale_mc_copy_user_page(void *kto, /* * XScale optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) xscale_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux-2.6.25.8-rt7/arch/arm/mm/fault.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/arm/mm/fault.c 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/arm/mm/fault.c 2008-06-23 19:13:58.000000000 -0400 @@ -239,7 +239,7 @@ out: return fault; } -static int __kprobes +static int notrace __kprobes do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk; @@ -256,7 +256,7 @@ do_page_fault(unsigned long addr, unsign * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; /* @@ -338,7 +338,7 @@ no_context: * interrupt or a critical region, and should only copy the information * from the master page table, nothing more. */ -static int __kprobes +static int notrace __kprobes do_translation_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { @@ -381,7 +381,7 @@ bad_area: * Some section permission faults need to be handled gracefully. * They can happen due to a __{get,put}_user during an oops. */ -static int +static notrace int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { do_bad_area(addr, fsr, regs); @@ -391,7 +391,7 @@ do_sect_fault(unsigned long addr, unsign /* * This abort handler always returns "fault". */ -static int +static notrace int do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { return 1; @@ -446,7 +446,7 @@ static struct fsr_info { { do_bad, SIGBUS, 0, "unknown 31" } }; -void __init +void __init notrace hook_fault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *), int sig, const char *name) { @@ -460,7 +460,7 @@ hook_fault_code(int nr, int (*fn)(unsign /* * Dispatch a data abort to the relevant handler. */ -asmlinkage void __exception +asmlinkage void __exception notrace do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { const struct fsr_info *inf = fsr_info + (fsr & 15) + ((fsr & (1 << 10)) >> 6); @@ -479,7 +479,7 @@ do_DataAbort(unsigned long addr, unsigne arm_notify_die("", regs, &info, fsr, 0); } -asmlinkage void __exception +asmlinkage void __exception notrace do_PrefetchAbort(unsigned long addr, struct pt_regs *regs) { do_translation_fault(addr, 0, regs); Index: linux-2.6.25.8-rt7/include/asm-arm/pgalloc.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-arm/pgalloc.h 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-arm/pgalloc.h 2008-06-23 19:13:16.000000000 -0400 @@ -111,7 +111,7 @@ static inline void __pmd_populate(pmd_t * * Ensure that we always set both PMD entries. 
*/ -static inline void +static inline void notrace pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) { unsigned long pte_ptr = (unsigned long)ptep; @@ -124,7 +124,7 @@ pmd_populate_kernel(struct mm_struct *mm __pmd_populate(pmdp, __pa(pte_ptr) | _PAGE_KERNEL_TABLE); } -static inline void +static inline void notrace pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep) { __pmd_populate(pmdp, page_to_pfn(ptep) << PAGE_SHIFT | _PAGE_USER_TABLE); Index: linux-2.6.25.8-rt7/include/asm-arm/timex.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-arm/timex.h 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-arm/timex.h 2008-06-23 19:14:15.000000000 -0400 @@ -16,9 +16,17 @@ typedef unsigned long cycles_t; +#ifndef mach_read_cycles + #define mach_read_cycles() (0) +#ifdef CONFIG_EVENT_TRACE + #define mach_cycles_to_usecs(d) (d) + #define mach_usecs_to_cycles(d) (d) +#endif +#endif + static inline cycles_t get_cycles (void) { - return 0; + return mach_read_cycles(); } #endif Index: linux-2.6.25.8-rt7/include/asm-arm/unistd.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-arm/unistd.h 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-arm/unistd.h 2008-06-23 19:13:16.000000000 -0400 @@ -380,6 +380,10 @@ #define __NR_eventfd (__NR_SYSCALL_BASE+351) #define __NR_fallocate (__NR_SYSCALL_BASE+352) +#ifndef __ASSEMBLY__ +#define NR_syscalls (__NR_fallocate + 1 - __NR_SYSCALL_BASE) +#endif + /* * The following SWIs are ARM private. */ Index: linux-2.6.25.8-rt7/arch/arm/Kconfig =================================================================== --- linux-2.6.25.8-rt7.orig/arch/arm/Kconfig 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/arm/Kconfig 2008-06-23 19:13:20.000000000 -0400 @@ -41,6 +41,10 @@ config GENERIC_CLOCKEVENTS_BROADCAST depends on GENERIC_CLOCKEVENTS default y if SMP && !LOCAL_TIMERS +config STACKTRACE_SUPPORT + bool + default y + config MMU bool default y @@ -691,18 +695,7 @@ config LOCAL_TIMERS accounting to be spread across the timer interval, preventing a "thundering herd" at every timer tick. -config PREEMPT - bool "Preemptible Kernel (EXPERIMENTAL)" - depends on EXPERIMENTAL - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. 
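The current->pagefault_disabled check added to do_page_fault() in arch/arm/mm/fault.c above is what lets atomic code paths touch user memory safely on -rt: with page faults disabled, a missing mapping takes the no_context path instead of sleeping on mmap_sem, and the caller simply gets an error back. The sketch below shows that usage pattern; it is illustrative only and not part of the patch. peek_user_word() is an invented helper, while pagefault_disable()/pagefault_enable() and __copy_from_user_inatomic() are the stock primitives of this kernel generation (the ARM futex code later in this patch uses the same pair).

#include <linux/uaccess.h>

/*
 * Illustrative sketch: read one word of user memory from a context
 * that must not sleep.  If the page is not present, do_page_fault()
 * sees current->pagefault_disabled (or in_atomic()) and bails out
 * early, so the copy fails with a nonzero return instead of blocking.
 */
static int peek_user_word(int __user *uaddr, int *val)
{
	int ret;

	pagefault_disable();
	ret = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
	pagefault_enable();

	return ret ? -EFAULT : 0;
}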
+source kernel/Kconfig.preempt config NO_IDLE_HZ bool "Dynamic tick timer" Index: linux-2.6.25.8-rt7/arch/arm/lib/Makefile =================================================================== --- linux-2.6.25.8-rt7.orig/arch/arm/lib/Makefile 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/arm/lib/Makefile 2008-06-23 19:13:17.000000000 -0400 @@ -41,6 +41,7 @@ lib-$(CONFIG_ARCH_RPC) += ecard.o io-ac lib-$(CONFIG_ARCH_CLPS7500) += io-acorn.o lib-$(CONFIG_ARCH_L7200) += io-acorn.o lib-$(CONFIG_ARCH_SHARK) += io-shark.o +lib-$(CONFIG_STACKTRACE) += stacktrace.o $(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S $(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S Index: linux-2.6.25.8-rt7/arch/arm/lib/stacktrace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/arch/arm/lib/stacktrace.c 2008-06-23 19:13:17.000000000 -0400 @@ -0,0 +1,7 @@ +#include +#include + +void save_stack_trace(struct stack_trace *trace) +{ +} + Index: linux-2.6.25.8-rt7/include/asm-arm/arch-ep93xx/timex.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-arm/arch-ep93xx/timex.h 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-arm/arch-ep93xx/timex.h 2008-06-23 19:13:17.000000000 -0400 @@ -1,5 +1,11 @@ /* * linux/include/asm-arm/arch-ep93xx/timex.h */ +#include +#include #define CLOCK_TICK_RATE 983040 + +#define mach_read_cycles() __raw_readl(EP93XX_TIMER4_VALUE_LOW) +#define mach_cycles_to_usecs(d) (((d) * ((1000000LL << 32) / CLOCK_TICK_RATE)) >> 32) +#define mach_usecs_to_cycles(d) (((d) * (((long long)CLOCK_TICK_RATE << 32) / 1000000)) >> 32) Index: linux-2.6.25.8-rt7/drivers/char/random.c =================================================================== --- linux-2.6.25.8-rt7.orig/drivers/char/random.c 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/drivers/char/random.c 2008-06-23 19:13:17.000000000 -0400 @@ -580,8 +580,11 @@ static void add_timer_randomness(struct preempt_disable(); /* if over the trickle threshold, use only 1 in 4096 samples */ if (input_pool.entropy_count > trickle_thresh && - (__get_cpu_var(trickle_count)++ & 0xfff)) - goto out; + (__get_cpu_var(trickle_count)++ & 0xfff)) { + preempt_enable(); + return; + } + preempt_enable(); sample.jiffies = jiffies; sample.cycles = get_cycles(); @@ -626,9 +629,6 @@ static void add_timer_randomness(struct if(input_pool.entropy_count >= random_read_wakeup_thresh) wake_up_interruptible(&random_read_wait); - -out: - preempt_enable(); } void add_input_randomness(unsigned int type, unsigned int code, Index: linux-2.6.25.8-rt7/drivers/char/Kconfig =================================================================== --- linux-2.6.25.8-rt7.orig/drivers/char/Kconfig 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/drivers/char/Kconfig 2008-06-23 19:14:01.000000000 -0400 @@ -752,6 +752,46 @@ config JS_RTC To compile this driver as a module, choose M here: the module will be called js-rtc. +config RTC_HISTOGRAM + bool "Real Time Clock Histogram Support" + default n + depends on RTC + ---help--- + If you say Y here then the kernel will track the delivery and + wakeup latency of /dev/rtc using tasks and will report a + histogram to the kernel log when the application closes /dev/rtc. 
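The ep93xx mach_cycles_to_usecs()/mach_usecs_to_cycles() macros above use a 32.32 fixed-point trick: the ratio between 1 MHz and CLOCK_TICK_RATE is precomputed as a scaled 64-bit integer, so each conversion costs one multiply and one shift instead of a division. The small userspace model below is not part of the patch; it just reproduces the arithmetic so it can be sanity-checked on any machine.

#include <stdio.h>

#define CLOCK_TICK_RATE 983040	/* ep93xx timer 4 rate, as in the patch */

/* same shape as mach_cycles_to_usecs() above */
static unsigned long long cycles_to_usecs(unsigned long long d)
{
	return (d * ((1000000ULL << 32) / CLOCK_TICK_RATE)) >> 32;
}

/* same shape as mach_usecs_to_cycles() above */
static unsigned long long usecs_to_cycles(unsigned long long d)
{
	return (d * (((unsigned long long)CLOCK_TICK_RATE << 32) / 1000000)) >> 32;
}

int main(void)
{
	/* one second worth of cycles, converted both ways */
	printf("%llu usecs\n", cycles_to_usecs(CLOCK_TICK_RATE));
	printf("%llu cycles\n", usecs_to_cycles(1000000));
	return 0;
}

Compiled and run, this prints 999999 and 983039 rather than the exact values; the one-unit error comes from truncating the precomputed scale factor and is irrelevant at the precision the event tracer needs.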
+ +config BLOCKER + tristate "Priority Inheritance Debugging (Blocker) Device Support" + depends on X86 + default y + ---help--- + If you say Y here then a device will be created that the userspace + pi_test suite uses to test and measure kernel locking primitives. + +config LPPTEST + tristate "Parallel Port Based Latency Measurement Device" + depends on !PARPORT && X86 + default y + ---help--- + If you say Y here then a device will be created that the userspace + testlpp utility uses to measure IRQ latencies of a target system + from an independent measurement system. + + NOTE: this code assumes x86 PCs and that the parallel port is + bidirectional and is on IRQ 7. + + To use the device, both the target and the source system need to + run a kernel with CONFIG_LPPTEST enabled. To measure latencies, + use the scripts/testlpp utility in your kernel source directory, + and run it (as root) on the source system - it will start printing + out the latencies it took to get a response from the target system: + + Latency of response: 12.2 usecs (121265 cycles) + + Then generate various workloads on the target system to see how + (worst-case) latencies are impacted. + config SGI_DS1286 tristate "SGI DS1286 RTC support" depends on SGI_HAS_DS1286 @@ -1041,6 +1081,24 @@ config TELCLOCK /sys/devices/platform/telco_clock, with a number of files for controlling the behavior of this hardware. +config RMEM + tristate "Access to physical memory via /dev/rmem" + default m + help + The /dev/mem device only allows mmap() of I/O mapped memory; + it does not allow access to "real" physical memory. The /dev/rmem + device is a hack which does allow access to physical memory. We use + this instead of patching /dev/mem because we don't expect this + functionality to ever be accepted into mainline. + +config ALLOC_RTSJ_MEM + tristate "RTSJ-specific hack to reserve memory" + default m + help + The RTSJ TCK conformance test requires reserving some physical + memory for testing /dev/rmem.
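The blocker and lpptest devices introduced above are driven from userspace with plain ioctl()s. The sketch below is not part of the patch and only shows the shape of such a test; the command numbers are the ones defined in drivers/char/blocker.c further down, and /dev/blocker is assumed to have been created for the misc device (minor 221) that the driver registers.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define BLOCK_IOCTL		4245	/* take the locks, spin <arg> times, release */
#define BLOCK_SET_DEPTH		4246	/* nest <arg> of the driver's spinlocks */

int main(void)
{
	int fd = open("/dev/blocker", O_RDWR);

	if (fd < 0) {
		perror("open /dev/blocker");
		return 1;
	}

	ioctl(fd, BLOCK_SET_DEPTH, 3);		/* hold three nested locks */
	ioctl(fd, BLOCK_IOCTL, 100000);		/* spin while holding them */

	close(fd);
	return 0;
}

A priority-inheritance test such as pi_test issues calls like these from tasks of different priorities and checks that a low-priority holder of the (on -rt, sleeping and PI-aware) spinlocks is boosted while a high-priority task blocks on them.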
+ config DEVPORT bool depends on !M68K Index: linux-2.6.25.8-rt7/drivers/char/Makefile =================================================================== --- linux-2.6.25.8-rt7.orig/drivers/char/Makefile 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/drivers/char/Makefile 2008-06-23 19:14:01.000000000 -0400 @@ -86,6 +86,8 @@ obj-$(CONFIG_TOSHIBA) += toshiba.o obj-$(CONFIG_I8K) += i8k.o obj-$(CONFIG_DS1620) += ds1620.o obj-$(CONFIG_HW_RANDOM) += hw_random/ +obj-$(CONFIG_BLOCKER) += blocker.o +obj-$(CONFIG_LPPTEST) += lpptest.o obj-$(CONFIG_COBALT_LCD) += lcd.o obj-$(CONFIG_PPDEV) += ppdev.o obj-$(CONFIG_NWBUTTON) += nwbutton.o @@ -97,6 +99,7 @@ obj-$(CONFIG_CS5535_GPIO) += cs5535_gpio obj-$(CONFIG_GPIO_VR41XX) += vr41xx_giu.o obj-$(CONFIG_GPIO_TB0219) += tb0219.o obj-$(CONFIG_TELCLOCK) += tlclk.o +obj-$(CONFIG_RMEM) += rmem.o obj-$(CONFIG_MWAVE) += mwave/ obj-$(CONFIG_AGP) += agp/ @@ -112,6 +115,8 @@ obj-$(CONFIG_PS3_FLASH) += ps3flash.o obj-$(CONFIG_JS_RTC) += js-rtc.o js-rtc-y = rtc.o +obj-$(CONFIG_ALLOC_RTSJ_MEM) += alloc_rtsj_mem.o + # Files generated that shall be removed upon make clean clean-files := consolemap_deftbl.c defkeymap.c Index: linux-2.6.25.8-rt7/drivers/char/blocker.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/drivers/char/blocker.c 2008-06-23 19:13:17.000000000 -0400 @@ -0,0 +1,109 @@ +/* + * priority inheritance testing device + */ + +#include +#include +#include +#include + +#define BLOCKER_MINOR 221 + +#define BLOCK_IOCTL 4245 +#define BLOCK_SET_DEPTH 4246 + +#define BLOCKER_MAX_LOCK_DEPTH 10 + +void loop(int loops) +{ + int i; + + for (i = 0; i < loops; i++) + get_cycles(); +} + +static spinlock_t blocker_lock[BLOCKER_MAX_LOCK_DEPTH]; + +static unsigned int lock_depth = 1; + +void do_the_lock_and_loop(unsigned int args) +{ + int i, max; + + if (rt_task(current)) + max = lock_depth; + else if (lock_depth > 1) + max = (current->pid % lock_depth) + 1; + else + max = 1; + + /* Always lock from the top down */ + for (i = max-1; i >= 0; i--) + spin_lock(&blocker_lock[i]); + loop(args); + for (i = 0; i < max; i++) + spin_unlock(&blocker_lock[i]); +} + +static int blocker_open(struct inode *in, struct file *file) +{ + printk(KERN_INFO "blocker_open called\n"); + + return 0; +} + +static long blocker_ioctl(struct file *file, + unsigned int cmd, unsigned long args) +{ + switch(cmd) { + case BLOCK_IOCTL: + do_the_lock_and_loop(args); + return 0; + case BLOCK_SET_DEPTH: + if (args >= BLOCKER_MAX_LOCK_DEPTH) + return -EINVAL; + lock_depth = args; + return 0; + default: + return -EINVAL; + } +} + +static struct file_operations blocker_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .unlocked_ioctl = blocker_ioctl, + .open = blocker_open, +}; + +static struct miscdevice blocker_dev = +{ + BLOCKER_MINOR, + "blocker", + &blocker_fops +}; + +static int __init blocker_init(void) +{ + int i; + + if (misc_register(&blocker_dev)) + return -ENODEV; + + for (i = 0; i < BLOCKER_MAX_LOCK_DEPTH; i++) + spin_lock_init(blocker_lock + i); + + return 0; +} + +void __exit blocker_exit(void) +{ + printk(KERN_INFO "blocker device uninstalled\n"); + misc_deregister(&blocker_dev); +} + +module_init(blocker_init); +module_exit(blocker_exit); + +MODULE_LICENSE("GPL"); + Index: linux-2.6.25.8-rt7/drivers/char/lpptest.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/drivers/char/lpptest.c 
2008-06-23 19:13:17.000000000 -0400 @@ -0,0 +1,178 @@ +/* + * /dev/lpptest device: test IRQ handling latencies over parallel port + * + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar + * + * licensed under the GPL + * + * You need to have CONFIG_PARPORT disabled for this device, it is a + * completely self-contained device that assumes sole ownership of the + * parallel port. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * API wrappers so that the code can be shared with the -rt tree: + */ +#ifndef local_irq_disable +# define local_irq_disable local_irq_disable +# define local_irq_enable local_irq_enable +#endif + +#ifndef IRQ_NODELAY +# define IRQ_NODELAY 0 +# define IRQF_NODELAY 0 +#endif + +/* + * Driver: + */ +#define LPPTEST_CHAR_MAJOR 245 +#define LPPTEST_DEVICE_NAME "lpptest" + +#define LPPTEST_IRQ 7 + +#define LPPTEST_TEST _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long) +#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long) +#define LPPTEST_ENABLE _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long) + +static char dev_id[] = "lpptest"; + +#define INIT_PORT() outb(0x04, 0x37a) +#define ENABLE_IRQ() outb(0x10, 0x37a) +#define DISABLE_IRQ() outb(0, 0x37a) + +static unsigned char out = 0x5a; + +/** + * Interrupt handler. Flip a bit in the reply. + */ +static int lpptest_irq (int irq, void *dev_id) +{ + out ^= 0xff; + outb(out, 0x378); + + return IRQ_HANDLED; +} + +static cycles_t test_response(void) +{ + cycles_t now, end; + unsigned char in; + int timeout = 0; + + local_irq_disable(); + in = inb(0x379); + inb(0x378); + outb(0x08, 0x378); + now = get_cycles(); + while(1) { + if (inb(0x379) != in) + break; + if (timeout++ > 1000000) { + outb(0x00, 0x378); + local_irq_enable(); + + return 0; + } + } + end = get_cycles(); + outb(0x00, 0x378); + local_irq_enable(); + + return end - now; +} + +static int lpptest_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static int lpptest_close(struct inode *inode, struct file *file) +{ + return 0; +} + +int lpptest_ioctl(struct inode *inode, struct file *file, unsigned int ioctl_num, unsigned long ioctl_param) +{ + int retval = 0; + + switch (ioctl_num) { + + case LPPTEST_DISABLE: + DISABLE_IRQ(); + break; + + case LPPTEST_ENABLE: + ENABLE_IRQ(); + break; + + case LPPTEST_TEST: { + + cycles_t diff = test_response(); + if (copy_to_user((void *)ioctl_param, (void*) &diff, sizeof(diff))) + goto errcpy; + break; + } + default: retval = -EINVAL; + } + + return retval; + + errcpy: + return -EFAULT; +} + +static struct file_operations lpptest_dev_fops = { + .ioctl = lpptest_ioctl, + .open = lpptest_open, + .release = lpptest_close, +}; + +static int __init lpptest_init (void) +{ + if (register_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME, &lpptest_dev_fops)) + { + printk(KERN_NOTICE "Can't allocate major number %d for lpptest.\n", + LPPTEST_CHAR_MAJOR); + return -EAGAIN; + } + + if (request_irq (LPPTEST_IRQ, lpptest_irq, 0, "lpptest", dev_id)) { + printk (KERN_WARNING "lpptest: irq %d in use. 
Unload parport module!\n", LPPTEST_IRQ); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); + return -EAGAIN; + } + irq_desc[LPPTEST_IRQ].status |= IRQ_NODELAY; + irq_desc[LPPTEST_IRQ].action->flags |= IRQF_NODELAY | IRQF_DISABLED; + + INIT_PORT(); + ENABLE_IRQ(); + + return 0; +} +module_init (lpptest_init); + +static void __exit lpptest_exit (void) +{ + DISABLE_IRQ(); + + free_irq(LPPTEST_IRQ, dev_id); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); +} +module_exit (lpptest_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("lpp test module"); + Index: linux-2.6.25.8-rt7/drivers/char/rtc.c =================================================================== --- linux-2.6.25.8-rt7.orig/drivers/char/rtc.c 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/drivers/char/rtc.c 2008-06-23 19:13:51.000000000 -0400 @@ -94,6 +94,32 @@ static unsigned long rtc_port; static int rtc_irq = PCI_IRQ_NONE; #endif +#ifdef CONFIG_MIPS +# include +#endif + +#ifdef CONFIG_RTC_HISTOGRAM + +static cycles_t last_interrupt_time; + +#include + +#define CPU_MHZ (cpu_khz / 1000) + +#define HISTSIZE 10000 +static int histogram[HISTSIZE]; + +static int rtc_state; + +enum rtc_states { + S_STARTUP, /* First round - let the application start */ + S_IDLE, /* Waiting for an interrupt */ + S_WAITING_FOR_READ, /* Signal delivered. waiting for rtc_read() */ + S_READ_MISSED, /* Signal delivered, read() deadline missed */ +}; + +#endif + #ifdef CONFIG_HPET_RTC_IRQ #undef RTC_IRQ #endif @@ -224,7 +250,146 @@ static inline unsigned char rtc_is_updat return uip; } +#ifndef RTC_IRQ +# undef CONFIG_RTC_HISTOGRAM +#endif + +static inline void rtc_open_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i; + + last_interrupt_time = 0; + rtc_state = S_STARTUP; + rtc_irq_data = 0; + + for (i = 0; i < HISTSIZE; i++) + histogram[i] = 0; +#endif +} + +static inline void rtc_wake_event(void) +{ +#ifndef CONFIG_RTC_HISTOGRAM + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); +#else + if (!(rtc_status & RTC_IS_OPEN)) + return; + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + break; + /* Waiting for an interrupt */ + case S_IDLE: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + last_interrupt_time = get_cycles(); + rtc_state = S_WAITING_FOR_READ; + break; + + /* Signal has been delivered. waiting for rtc_read() */ + case S_WAITING_FOR_READ: + /* + * Well foo. The usermode application didn't + * schedule and read in time. + */ + last_interrupt_time = get_cycles(); + rtc_state = S_READ_MISSED; + printk("Read missed before next interrupt\n"); + break; + /* Signal has been delivered, read() deadline was missed */ + case S_READ_MISSED: + /* + * Not much we can do here. We're waiting for the usermode + * application to read the rtc + */ + last_interrupt_time = get_cycles(); + break; + } +#endif +} + +static inline void rtc_read_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + cycles_t now = get_cycles(); + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + rtc_state = S_IDLE; + break; + + /* Waiting for an interrupt */ + case S_IDLE: + printk("bug in rtc_read(): called in state S_IDLE!\n"); + break; + case S_WAITING_FOR_READ: /* + * Signal has been delivered. + * waiting for rtc_read() + */ + /* + * Well done + */ + case S_READ_MISSED: /* + * Signal has been delivered, read() + * deadline was missed + */ + /* + * So, you finally got here. 
+ */ + if (!last_interrupt_time) + printk("bug in rtc_read(): last_interrupt_time = 0\n"); + rtc_state = S_IDLE; + { + cycles_t latency = now - last_interrupt_time; + unsigned long delta; /* Microseconds */ + + delta = latency; + delta /= CPU_MHZ; + + if (delta > 1000 * 1000) { + printk("rtc: eek\n"); + } else { + unsigned long slot = delta; + if (slot >= HISTSIZE) + slot = HISTSIZE - 1; + histogram[slot]++; + if (delta > 2000) + printk("wow! That was a " + "%ld millisec bump\n", + delta / 1000); + } + } + rtc_state = S_IDLE; + break; + } +#endif +} + +static inline void rtc_close_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i = 0; + unsigned long total = 0; + + for (i = 0; i < HISTSIZE; i++) + total += histogram[i]; + if (!total) + return; + + printk("\nrtc latency histogram of {%s/%d, %lu samples}:\n", + current->comm, current->pid, total); + for (i = 0; i < HISTSIZE; i++) { + if (histogram[i]) + printk("%d %d\n", i, histogram[i]); + } +#endif +} + #ifdef RTC_IRQ + /* * A very tiny interrupt handler. It runs with IRQF_DISABLED set, * but there is possibility of conflicting with the set_rtc_mmss() @@ -268,9 +433,9 @@ irqreturn_t rtc_interrupt(int irq, void if (rtc_callback) rtc_callback->func(rtc_callback->private_data); spin_unlock(&rtc_task_lock); - wake_up_interruptible(&rtc_wait); - kill_fasync(&rtc_async_queue, SIGIO, POLL_IN); + rtc_wake_event(); + wake_up_interruptible(&rtc_wait); return IRQ_HANDLED; } @@ -380,6 +545,8 @@ static ssize_t rtc_read(struct file *fil schedule(); } while (1); + rtc_read_event(); + if (count == sizeof(unsigned int)) { retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int); @@ -627,6 +794,11 @@ static int rtc_do_ioctl(unsigned int cmd save_freq_select = CMOS_READ(RTC_FREQ_SELECT); CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + /* + * Make CMOS date writes nonpreemptible even on PREEMPT_RT. + * There's a limit to everything! 
=B-) + */ + preempt_disable(); #ifdef CONFIG_MACH_DECSTATION CMOS_WRITE(real_yrs, RTC_DEC_YEAR); #endif @@ -636,6 +808,7 @@ static int rtc_do_ioctl(unsigned int cmd CMOS_WRITE(hrs, RTC_HOURS); CMOS_WRITE(min, RTC_MINUTES); CMOS_WRITE(sec, RTC_SECONDS); + preempt_enable(); CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); @@ -737,6 +910,7 @@ static int rtc_open(struct inode *inode, if (rtc_status & RTC_IS_OPEN) goto out_busy; + rtc_open_event(); rtc_status |= RTC_IS_OPEN; rtc_irq_data = 0; @@ -790,6 +964,7 @@ no_irq: rtc_irq_data = 0; rtc_status &= ~RTC_IS_OPEN; spin_unlock_irq(&rtc_lock); + rtc_close_event(); return 0; } @@ -1199,10 +1374,12 @@ static void rtc_dropped_irq(unsigned lon spin_unlock_irq(&rtc_lock); +#ifndef CONFIG_PREEMPT_RT if (printk_ratelimit()) { printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq); } +#endif /* Now we have new data */ wake_up_interruptible(&rtc_wait); Index: linux-2.6.25.8-rt7/scripts/Makefile =================================================================== --- linux-2.6.25.8-rt7.orig/scripts/Makefile 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/scripts/Makefile 2008-06-23 19:14:05.000000000 -0400 @@ -12,6 +12,12 @@ hostprogs-$(CONFIG_LOGO) += pnmt hostprogs-$(CONFIG_VT) += conmakehash hostprogs-$(CONFIG_PROM_CONSOLE) += conmakehash hostprogs-$(CONFIG_IKCONFIG) += bin2c +HOST_OS := $(shell uname) +ifeq ($(HOST_OS),Linux) +ifdef CONFIG_LPPTEST +hostprogs-y += testlpp +endif +endif always := $(hostprogs-y) $(hostprogs-m) Index: linux-2.6.25.8-rt7/scripts/testlpp.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/scripts/testlpp.c 2008-06-23 19:13:17.000000000 -0400 @@ -0,0 +1,159 @@ +/* + * testlpp.c: use the /dev/lpptest device to test IRQ handling + * latencies over parallel port + * + * Copyright (C) 2005 Thomas Gleixner + * + * licensed under the GPL + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LPPTEST_CHAR_MAJOR 245 +#define LPPTEST_DEVICE_NAME "lpptest" + +#define LPPTEST_TEST _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long) +#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long) +#define LPPTEST_ENABLE _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long) + +#define HIST_SIZE 10000 + +static int hist_total; +static unsigned long hist[HIST_SIZE]; + +static void hist_hit(unsigned long usecs) +{ + hist_total++; + if (usecs >= HIST_SIZE-1) + hist[HIST_SIZE-1]++; + else + hist[usecs]++; +} + +static void print_hist(void) +{ + int i; + + printf("LPP latency histogram:\n"); + + for (i = 0; i < HIST_SIZE; i++) { + if (hist[i]) + printf("%3d usecs: %9ld\n", i, hist[i]); + } +} + +static inline unsigned long long int rdtsc(void) +{ + unsigned long long int x, y; + for (;;) { + __asm__ volatile ("rdtsc" : "=A" (x)); + __asm__ volatile ("rdtsc" : "=A" (y)); + if (y - x < 1000) + return y; + } +} + +static unsigned long long calibrate_loop(void) +{ + unsigned long long mytime1, mytime2; + + mytime1 = rdtsc(); + usleep(500000); + mytime2 = rdtsc(); + + return (mytime2 - mytime1) * 2; +} + +#define time_to_usecs(time) ((double)time*1000000.0/(double)cycles_per_sec) + +#define time_to_usecs_l(time) (long)(time*1000000/cycles_per_sec) + +int fd, total; +unsigned long long tim, sum_tim, min_tim = -1ULL, max_tim, cycles_per_sec; + +void cleanup(int sig) +{ + ioctl (fd, LPPTEST_ENABLE, &tim); + if (sig) + printf("[ 
interrupted - exiting ]\n"); + printf("\ntotal number of responses: %d\n", total); + printf("average response latency: %.2lf usecs\n", + time_to_usecs(sum_tim/total)); + printf("minimum latency: %.2lf usecs\n", + time_to_usecs(min_tim)); + printf("maximum latency: %.2lf usecs\n", + time_to_usecs(max_tim)); + print_hist(); + exit(0); +} + +#define HZ 3000 + +int main (int argc, char **argv) +{ + unsigned int nr_requests = 0; + + if (argc > 2) { + fprintf(stderr, "usage: testlpp [<nr_requests>]\n"); + exit(-1); + } + if (argc == 2) + nr_requests = atol(argv[1]); + + if (getuid() != 0) { + fprintf(stderr, "need to run as root!\n"); + exit(-1); + } + mknod("/dev/lpptest", S_IFCHR|0666, makedev(245, 1)); + + fd = open("/dev/lpptest", O_RDWR); + if (fd == -1) { + fprintf(stderr, "could not open /dev/lpptest, your kernel doesn't have CONFIG_LPPTEST enabled?\n"); + exit(-1); + } + + signal(SIGINT,&cleanup); + + ioctl (fd, LPPTEST_DISABLE, &tim); + + fprintf(stderr, "calibrating cycles to usecs: "); + cycles_per_sec = calibrate_loop(); + fprintf(stderr, "%lld cycles per usec\n", cycles_per_sec/1000000); + if (nr_requests) + fprintf(stderr, "[max # of requests: %u]\n", nr_requests); + fprintf(stderr, "starting %dHz test, hit Ctrl-C to stop:\n\n", HZ); + + while(1) { + ioctl (fd, LPPTEST_TEST, &tim); + if (tim == 0) + printf ("No response from target.\n"); + else { + hist_hit(time_to_usecs_l(tim)); + if (tim > max_tim) { + printf ("new max latency: %.2lf usecs (%Ld cycles)\n", time_to_usecs(tim), tim); + max_tim = tim; + } + if (tim < min_tim) + min_tim = tim; + total++; + if (total == nr_requests) + break; + sum_tim += tim; + } + usleep(1000000/HZ); + } + cleanup(0); + + return 0; +} + + Index: linux-2.6.25.8-rt7/include/linux/lockdep.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/lockdep.h 2008-06-23 19:12:56.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/lockdep.h 2008-06-23 19:14:25.000000000 -0400 @@ -304,6 +304,9 @@ extern void lock_acquire(struct lockdep_ extern void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); +extern void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass, + unsigned long ip); + # define INIT_LOCKDEP .lockdep_recursion = 0, #define lockdep_depth(tsk) (debug_locks ?
(tsk)->lockdep_depth : 0) @@ -320,6 +323,7 @@ static inline void lockdep_on(void) # define lock_acquire(l, s, t, r, c, i) do { } while (0) # define lock_release(l, n, i) do { } while (0) +# define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_info() do { } while (0) # define lockdep_init_map(lock, name, key, sub) do { (void)(key); } while (0) @@ -357,6 +361,38 @@ do { \ lock_acquired(&(_lock)->dep_map); \ } while (0) +#define LOCK_CONTENDED_RT(_lock, f_try, f_lock) \ +do { \ + if (!f_try(&(_lock)->lock)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + f_lock(&(_lock)->lock); \ + } \ + lock_acquired(&(_lock)->dep_map); \ +} while (0) + + +#define LOCK_CONTENDED_RT_RET(_lock, f_try, f_lock) \ +({ \ + int ret = 0; \ + if (!f_try(&(_lock)->lock)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + ret = f_lock(&(_lock)->lock); \ + } \ + if (!ret) \ + lock_acquired(&(_lock)->dep_map); \ + ret; \ +}) + +#define LOCK_CONTENDED_RT_RW(_lock, f_try, f_lock) \ +do { \ + if (!f_try(&(_lock)->owners)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + f_lock(&(_lock)->owners); \ + } \ + lock_acquired(&(_lock)->dep_map); \ +} while (0) + + #else /* CONFIG_LOCK_STAT */ #define lock_contended(lockdep_map, ip) do {} while (0) @@ -365,6 +401,15 @@ do { \ #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) +#define LOCK_CONTENDED_RT(_lock, f_try, f_lock) \ + f_lock(&(_lock)->lock) + +#define LOCK_CONTENDED_RT_RET(_lock, f_try, f_lock) \ + f_lock(&(_lock)->lock) + +#define LOCK_CONTENDED_RT_RW(_lock, f_try, f_lock) \ + f_lock(&(_lock)->owners) + #endif /* CONFIG_LOCK_STAT */ #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_GENERIC_HARDIRQS) Index: linux-2.6.25.8-rt7/kernel/lockdep_internals.h =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/lockdep_internals.h 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/lockdep_internals.h 2008-06-23 19:13:18.000000000 -0400 @@ -15,12 +15,12 @@ * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. 
*/ -#define MAX_LOCKDEP_ENTRIES 8192UL +#define MAX_LOCKDEP_ENTRIES 16384UL #define MAX_LOCKDEP_KEYS_BITS 11 #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) -#define MAX_LOCKDEP_CHAINS_BITS 14 +#define MAX_LOCKDEP_CHAINS_BITS 15 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) /* Index: linux-2.6.25.8-rt7/drivers/net/loopback.c =================================================================== --- linux-2.6.25.8-rt7.orig/drivers/net/loopback.c 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/drivers/net/loopback.c 2008-06-23 19:13:50.000000000 -0400 @@ -152,13 +152,13 @@ static int loopback_xmit(struct sk_buff #endif dev->last_rx = jiffies; - /* it's OK to use per_cpu_ptr() because BHs are off */ pcpu_lstats = netdev_priv(dev); - lb_stats = per_cpu_ptr(pcpu_lstats, smp_processor_id()); + lb_stats = per_cpu_ptr(pcpu_lstats, get_cpu()); lb_stats->bytes += skb->len; lb_stats->packets++; + put_cpu(); - netif_rx(skb); + netif_rx_ni(skb); return 0; } Index: linux-2.6.25.8-rt7/arch/powerpc/kernel/time.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/powerpc/kernel/time.c 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/powerpc/kernel/time.c 2008-06-23 19:13:19.000000000 -0400 @@ -796,7 +796,7 @@ static cycle_t rtc_read(void) return (cycle_t)get_rtc(); } -static cycle_t timebase_read(void) +static cycle_t notrace timebase_read(void) { return (cycle_t)get_tb(); } Index: linux-2.6.25.8-rt7/include/asm-arm/atomic.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-arm/atomic.h 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-arm/atomic.h 2008-06-23 19:14:34.000000000 -0400 @@ -114,6 +114,46 @@ static inline void atomic_clear_mask(uns : "cc"); } +/* + * Atomic compare and exchange. + */ +#define __HAVE_ARCH_CMPXCHG 1 + +extern unsigned long wrong_size_cmpxchg(volatile void *ptr); + +static inline unsigned long __cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + volatile unsigned long *p = ptr; + + if (size == 4) { + unsigned long oldval, res; + + do { + __asm__ __volatile__("@ atomic_cmpxchg\n" + "ldrex %1, [%2]\n" + "mov %0, #0\n" + "teq %1, %3\n" + "strexeq %0, %4, [%2]\n" + : "=&r" (res), "=&r" (oldval) + : "r" (p), "Ir" (old), "r" (new) + : "cc"); + } while (res); + + return oldval; + } else + return wrong_size_cmpxchg(ptr); +} + +#define cmpxchg__disabled(ptr,o,n) \ +({ \ + __typeof__(*(ptr)) _o_ = (o); \ + __typeof__(*(ptr)) _n_ = (n); \ + (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ + (unsigned long)_n_, sizeof(*(ptr))); \ +}) + #else /* ARM_ARCH_6 */ #include @@ -173,6 +213,41 @@ static inline void atomic_clear_mask(uns raw_local_irq_restore(flags); } +#ifndef CONFIG_SMP +/* + * Atomic compare and exchange. 
+ */ +#define __HAVE_ARCH_CMPXCHG 1 + +extern unsigned long wrong_size_cmpxchg(volatile void *ptr); + +static inline unsigned long __cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long flags, prev; + volatile unsigned long *p = ptr; + + if (size == 4) { + raw_local_irq_save(flags); + if ((prev = *p) == old) + *p = new; + raw_local_irq_restore(flags); + return(prev); + } else + return wrong_size_cmpxchg(ptr); +} + +#define cmpxchg(ptr,o,n) \ +({ \ + __typeof__(*(ptr)) _o_ = (o); \ + __typeof__(*(ptr)) _n_ = (n); \ + (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ + (unsigned long)_n_, sizeof(*(ptr))); \ +}) + +#endif + #endif /* __LINUX_ARM_ARCH__ */ #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) Index: linux-2.6.25.8-rt7/include/asm-arm/futex.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-arm/futex.h 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-arm/futex.h 2008-06-23 19:13:35.000000000 -0400 @@ -1,6 +1,125 @@ -#ifndef _ASM_FUTEX_H -#define _ASM_FUTEX_H +#ifndef _ASM_ARM_FUTEX_H +#define _ASM_ARM_FUTEX_H -#include +#ifdef __KERNEL__ +#include +#include +#include + +extern raw_spinlock_t futex_atomic_lock; + +#define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \ + __asm__ __volatile__ ( \ + "1: ldrt %1, [%2] \n" \ + insn \ + "2: strt %0, [%2] \n" \ + " mov %0, #0 \n" \ + "3: \n" \ + " .section __ex_table, \"a\" \n" \ + " .align 3 \n" \ + " .long 1b, 4f, 2b, 4f \n" \ + " .previous \n" \ + " .section .fixup,\"ax\" \n" \ + "4: mov %0, %4 \n" \ + " b 3b \n" \ + " .previous" \ + : "=&r" (ret), "=&r" (oldval) \ + : "r" (uaddr), "r" (oparg), "Ir" (-EFAULT) \ + : "cc", "memory") + +static inline int +futex_atomic_op_inuser (int encoded_op, int __user *uaddr) +{ + int op = (encoded_op >> 28) & 7; + int cmp = (encoded_op >> 24) & 15; + int oparg = (encoded_op << 8) >> 20; + int cmparg = (encoded_op << 20) >> 20; + int oldval = 0, ret; + if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) + oparg = 1 << oparg; + + if (!access_ok (VERIFY_WRITE, uaddr, sizeof(int))) + return -EFAULT; + + pagefault_disable(); + + spin_lock(&futex_atomic_lock); + + switch (op) { + case FUTEX_OP_SET: + __futex_atomic_op(" mov %0, %3\n", + ret, oldval, uaddr, oparg); + break; + case FUTEX_OP_ADD: + __futex_atomic_op(" add %0, %1, %3\n", + ret, oldval, uaddr, oparg); + break; + case FUTEX_OP_OR: + __futex_atomic_op(" orr %0, %1, %3\n", + ret, oldval, uaddr, oparg); + break; + case FUTEX_OP_ANDN: + __futex_atomic_op(" and %0, %1, %3\n", + ret, oldval, uaddr, oparg); + break; + case FUTEX_OP_XOR: + __futex_atomic_op(" eor %0, %1, %3\n", + ret, oldval, uaddr, oparg); + break; + default: + ret = -ENOSYS; + } + + spin_unlock(&futex_atomic_lock); + + pagefault_enable(); + + if (!ret) { + switch (cmp) { + case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; + case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; + case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; + case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; + case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; + case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; + default: ret = -ENOSYS; + } + } + return ret; +} + +static inline int +futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +{ + int val; + + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + return -EFAULT; + + spin_lock(&futex_atomic_lock); + + __asm__ __volatile__( "@futex_atomic_cmpxchg_inatomic \n" + "1: ldrt %0, 
[%3] \n" + " teq %0, %1 \n" + "2: streqt %2, [%3] \n" + "3: \n" + " .section __ex_table, \"a\" \n" + " .align 3 \n" + " .long 1b, 4f, 2b, 4f \n" + " .previous \n" + " .section .fixup,\"ax\" \n" + "4: mov %0, %4 \n" + " b 3b \n" + " .previous" + : "=&r" (val) + : "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT) + : "cc"); + + spin_unlock(&futex_atomic_lock); + + return val; +} + +#endif #endif Index: linux-2.6.25.8-rt7/arch/arm/kernel/process.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/arm/kernel/process.c 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/arm/kernel/process.c 2008-06-23 19:14:28.000000000 -0400 @@ -36,6 +36,8 @@ #include #include +DEFINE_RAW_SPINLOCK(futex_atomic_lock); + static const char *processor_modes[] = { "USER_26", "FIQ_26" , "IRQ_26" , "SVC_26" , "UK4_26" , "UK5_26" , "UK6_26" , "UK7_26" , "UK8_26" , "UK9_26" , "UK10_26", "UK11_26", "UK12_26", "UK13_26", "UK14_26", "UK15_26", @@ -133,7 +135,7 @@ static void default_idle(void) cpu_relax(); else { local_irq_disable(); - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { timer_dyn_reprogram(); arch_idle(); } @@ -165,13 +167,15 @@ void cpu_idle(void) idle = default_idle; leds_event(led_idle_start); tick_nohz_stop_sched_tick(); - while (!need_resched()) + while (!need_resched() && !need_resched_delayed()) idle(); leds_event(led_idle_end); + local_irq_disable(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } Index: linux-2.6.25.8-rt7/include/linux/bottom_half.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/bottom_half.h 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/bottom_half.h 2008-06-23 19:14:02.000000000 -0400 @@ -1,10 +1,17 @@ #ifndef _LINUX_BH_H #define _LINUX_BH_H +#ifdef CONFIG_PREEMPT_HARDIRQS +# define local_bh_disable() do { } while (0) +# define __local_bh_disable(ip) do { } while (0) +# define _local_bh_enable() do { } while (0) +# define local_bh_enable() do { } while (0) +# define local_bh_enable_ip(ip) do { } while (0) +#else extern void local_bh_disable(void); -extern void __local_bh_enable(void); extern void _local_bh_enable(void); extern void local_bh_enable(void); extern void local_bh_enable_ip(unsigned long ip); +#endif #endif /* _LINUX_BH_H */ Index: linux-2.6.25.8-rt7/include/linux/interrupt.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/interrupt.h 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/interrupt.h 2008-06-23 19:13:32.000000000 -0400 @@ -50,10 +50,12 @@ #define IRQF_SAMPLE_RANDOM 0x00000040 #define IRQF_SHARED 0x00000080 #define IRQF_PROBE_SHARED 0x00000100 -#define IRQF_TIMER 0x00000200 +#define __IRQF_TIMER 0x00000200 #define IRQF_PERCPU 0x00000400 #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 +#define IRQF_NODELAY 0x00002000 +#define IRQF_TIMER (__IRQF_TIMER | IRQF_NODELAY) typedef irqreturn_t (*irq_handler_t)(int, void *); @@ -65,7 +67,7 @@ struct irqaction { void *dev_id; struct irqaction *next; int irq; - struct proc_dir_entry *dir; + struct proc_dir_entry *dir, *threaded; }; extern irqreturn_t no_action(int cpl, void *dev_id); @@ -95,7 +97,7 @@ extern void devm_free_irq(struct device #ifdef CONFIG_LOCKDEP # define local_irq_enable_in_hardirq() do { } while 
(0) #else -# define local_irq_enable_in_hardirq() local_irq_enable() +# define local_irq_enable_in_hardirq() local_irq_enable_nort() #endif extern void disable_irq_nosync(unsigned int irq); @@ -196,6 +198,7 @@ static inline int disable_irq_wake(unsig #ifndef __ARCH_SET_SOFTIRQ_PENDING #define set_softirq_pending(x) (local_softirq_pending() = (x)) +// FIXME: PREEMPT_RT: set_bit()? #define or_softirq_pending(x) (local_softirq_pending() |= (x)) #endif @@ -257,6 +260,8 @@ enum HRTIMER_SOFTIRQ, #endif RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ + /* Entries after this are ignored in split softirq mode */ + MAX_SOFTIRQ, }; /* softirq mask and active fields moved to irq_cpustat_t in @@ -269,13 +274,26 @@ struct softirq_action void *data; }; +#ifdef CONFIG_PREEMPT_HARDIRQS +# define __raise_softirq_irqoff(nr) raise_softirq_irqoff(nr) +# define __do_raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) +#else +# define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) +# define __do_raise_softirq_irqoff(nr) __raise_softirq_irqoff(nr) +#endif + asmlinkage void do_softirq(void); extern void open_softirq(int nr, void (*action)(struct softirq_action*), void *data); extern void softirq_init(void); -#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); +extern void wakeup_irqd(void); +#ifdef CONFIG_PREEMPT_SOFTIRQS +extern void wait_for_softirq(int softirq); +#else +# define wait_for_softirq(x) do {} while(0) +#endif /* Tasklets --- multithreaded analogue of BHs. @@ -290,8 +308,9 @@ extern void raise_softirq(unsigned int n to be executed on some cpu at least once after this. * If the tasklet is already scheduled, but its excecution is still not started, it will be executed only once. - * If this tasklet is already running on another CPU (or schedule is called - from tasklet itself), it is rescheduled for later. + * If this tasklet is already running on another CPU, it is rescheduled + for later. + * Schedule must not be called from the tasklet itself (a lockup occurs) * Tasklet is strictly serialized wrt itself, but not wrt another tasklets. If client needs some intertask synchronization, he makes it with spinlocks. 
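The tasklet rules spelled out above translate into the usual driver pattern sketched below. This is illustrative only and not part of the patch; my_irq_handler(), my_tasklet_func() and my_reconfigure_path() are invented names, while DECLARE_TASKLET(), tasklet_schedule() and the disable/enable pair are the standard API whose behaviour the patch refines: a tasklet that gets scheduled while disabled is now parked with TASKLET_STATE_PENDING instead of being endlessly re-queued, and tasklet_enable() re-schedules it if that bit is set.

#include <linux/interrupt.h>

static void my_tasklet_func(unsigned long data);
static DECLARE_TASKLET(my_tasklet, my_tasklet_func, 0);

/* deferred work: serialized against itself, never re-entered */
static void my_tasklet_func(unsigned long data)
{
	/* ... bottom-half processing ... */
}

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	/* at most one pending instance, no matter how often this fires */
	tasklet_schedule(&my_tasklet);
	return IRQ_HANDLED;
}

static void my_reconfigure_path(void)
{
	tasklet_disable(&my_tasklet);	/* waits for a running instance */
	/* ... touch state the tasklet also uses ... */
	tasklet_enable(&my_tasklet);	/* re-queues it if it was parked */
}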
@@ -316,27 +335,36 @@ struct tasklet_struct name = { NULL, 0, enum { TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ - TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ + TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */ + TASKLET_STATE_PENDING /* Tasklet is pending */ }; -#ifdef CONFIG_SMP +#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED) +#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN) +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING) + +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) static inline int tasklet_trylock(struct tasklet_struct *t) { return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); } +static inline int tasklet_tryunlock(struct tasklet_struct *t) +{ + return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN; +} + static inline void tasklet_unlock(struct tasklet_struct *t) { smp_mb__before_clear_bit(); clear_bit(TASKLET_STATE_RUN, &(t)->state); } -static inline void tasklet_unlock_wait(struct tasklet_struct *t) -{ - while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); } -} +extern void tasklet_unlock_wait(struct tasklet_struct *t); + #else #define tasklet_trylock(t) 1 +#define tasklet_tryunlock(t) 1 #define tasklet_unlock_wait(t) do { } while (0) #define tasklet_unlock(t) do { } while (0) #endif @@ -371,22 +399,14 @@ static inline void tasklet_disable(struc smp_mb(); } -static inline void tasklet_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic_dec(); - atomic_dec(&t->count); -} - -static inline void tasklet_hi_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic_dec(); - atomic_dec(&t->count); -} +extern void tasklet_enable(struct tasklet_struct *t); +extern void tasklet_hi_enable(struct tasklet_struct *t); extern void tasklet_kill(struct tasklet_struct *t); extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu); extern void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data); +void takeover_tasklets(unsigned int cpu); /* * Autoprobing for irqs: @@ -446,4 +466,37 @@ static inline void init_irq_proc(void) int show_interrupts(struct seq_file *p, void *v); +#ifdef CONFIG_PREEMPT_RT +# define local_irq_disable_nort() do { } while (0) +# define local_irq_enable_nort() do { } while (0) +# define local_irq_enable_rt() local_irq_enable() +# define local_irq_save_nort(flags) do { local_save_flags(flags); } while (0) +# define local_irq_restore_nort(flags) do { (void)(flags); } while (0) +# define spin_lock_nort(lock) do { } while (0) +# define spin_unlock_nort(lock) do { } while (0) +# define spin_lock_bh_nort(lock) do { } while (0) +# define spin_unlock_bh_nort(lock) do { } while (0) +# define spin_lock_rt(lock) spin_lock(lock) +# define spin_unlock_rt(lock) spin_unlock(lock) +# define smp_processor_id_rt(cpu) (cpu) +# define in_atomic_rt() (!oops_in_progress && \ + (in_atomic() || irqs_disabled())) +# define read_trylock_rt(lock) ({read_lock(lock); 1; }) +#else +# define local_irq_disable_nort() local_irq_disable() +# define local_irq_enable_nort() local_irq_enable() +# define local_irq_enable_rt() do { } while (0) +# define local_irq_save_nort(flags) local_irq_save(flags) +# define local_irq_restore_nort(flags) local_irq_restore(flags) +# define spin_lock_rt(lock) do { } while (0) +# define spin_unlock_rt(lock) do { } while (0) +# define spin_lock_nort(lock) spin_lock(lock) +# define spin_unlock_nort(lock) spin_unlock(lock) +# define spin_lock_bh_nort(lock) spin_lock_bh(lock) +# define spin_unlock_bh_nort(lock) 
spin_unlock_bh(lock) +# define smp_processor_id_rt(cpu) smp_processor_id() +# define in_atomic_rt() 0 +# define read_trylock_rt(lock) read_trylock(lock) +#endif + #endif Index: linux-2.6.25.8-rt7/kernel/softirq.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/softirq.c 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/softirq.c 2008-06-23 19:14:24.000000000 -0400 @@ -6,15 +6,23 @@ * Distribute under GPLv2. * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + * + * Softirq-split implemetation by + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar */ #include +#include +#include +#include #include #include #include +#include #include #include #include +#include #include #include #include @@ -48,7 +56,41 @@ EXPORT_SYMBOL(irq_stat); static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +struct softirqdata { + int nr; + unsigned long cpu; + struct task_struct *tsk; +#ifdef CONFIG_PREEMPT_SOFTIRQS + wait_queue_head_t wait; + int running; +#endif +}; + +static DEFINE_PER_CPU(struct softirqdata [MAX_SOFTIRQ], ksoftirqd); + +#ifdef CONFIG_PREEMPT_SOFTIRQS +/* + * Preempting the softirq causes cases that would not be a + * problem when the softirq is not preempted. That is a + * process may have code to spin while waiting for a softirq + * to finish on another CPU. But if it happens that the + * process has preempted the softirq, this could cause a + * deadlock. + */ +void wait_for_softirq(int softirq) +{ + struct softirqdata *data = &__get_cpu_var(ksoftirqd)[softirq]; + if (data->running) { + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&data->wait, &wait); + if (data->running) + schedule(); + remove_wait_queue(&data->wait, &wait); + __set_current_state(TASK_RUNNING); + } +} +#endif /* * we cannot loop indefinitely here to avoid userspace starvation, @@ -56,15 +98,37 @@ static DEFINE_PER_CPU(struct task_struct * to the pending events, so lets the scheduler to balance * the softirq load for us. 
*/ -static inline void wakeup_softirqd(void) +static void wakeup_softirqd(int softirq) { /* Interrupts are disabled: no need to stop preemption */ - struct task_struct *tsk = __get_cpu_var(ksoftirqd); + struct task_struct *tsk = __get_cpu_var(ksoftirqd)[softirq].tsk; - if (tsk && tsk->state != TASK_RUNNING) - wake_up_process(tsk); + if (unlikely(!tsk)) + return; + /* + * Wake up the softirq task: + */ + wake_up_process(tsk); +} + +/* + * Wake up the softirq threads which have work + */ +static void trigger_softirqs(void) +{ + u32 pending = local_softirq_pending(); + int curr = 0; + + while (pending) { + if (pending & 1) + wakeup_softirqd(curr); + pending >>= 1; + curr++; + } } +#ifndef CONFIG_PREEMPT_HARDIRQS + /* * This one is for softirq.c-internal use, * where hardirqs are disabled legitimately: @@ -100,20 +164,6 @@ void local_bh_disable(void) EXPORT_SYMBOL(local_bh_disable); -void __local_bh_enable(void) -{ - WARN_ON_ONCE(in_irq()); - - /* - * softirqs should never be enabled by __local_bh_enable(), - * it always nests inside local_bh_enable() sections: - */ - WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET); - - sub_preempt_count(SOFTIRQ_OFFSET); -} -EXPORT_SYMBOL_GPL(__local_bh_enable); - /* * Special-case - softirqs can safely be enabled in * cond_resched_softirq(), or by __do_softirq(), @@ -138,7 +188,6 @@ void local_bh_enable(void) WARN_ON_ONCE(in_irq()); #endif - WARN_ON_ONCE(irqs_disabled()); #ifdef CONFIG_TRACE_IRQFLAGS local_irq_save(flags); @@ -196,6 +245,8 @@ void local_bh_enable_ip(unsigned long ip } EXPORT_SYMBOL(local_bh_enable_ip); +#endif + /* * We restart softirq processing MAX_SOFTIRQ_RESTART times, * and we fall back to softirqd after that. @@ -205,52 +256,130 @@ EXPORT_SYMBOL(local_bh_enable_ip); * we want to handle softirqs as soon as possible, but they * should not be able to lock up the box. */ -#define MAX_SOFTIRQ_RESTART 10 +#define MAX_SOFTIRQ_RESTART 20 -asmlinkage void __do_softirq(void) +static DEFINE_PER_CPU(u32, softirq_running); + +static void ___do_softirq(const int same_prio_only) { + int max_restart = MAX_SOFTIRQ_RESTART, max_loops = MAX_SOFTIRQ_RESTART; + __u32 pending, available_mask, same_prio_skipped; struct softirq_action *h; - __u32 pending; - int max_restart = MAX_SOFTIRQ_RESTART; - int cpu; + struct task_struct *tsk; + int cpu, softirq; pending = local_softirq_pending(); account_system_vtime(current); - __local_bh_disable((unsigned long)__builtin_return_address(0)); - trace_softirq_enter(); - cpu = smp_processor_id(); restart: + available_mask = -1; + softirq = 0; + same_prio_skipped = 0; /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); - local_irq_enable(); - h = softirq_vec; do { + u32 softirq_mask = 1 << softirq; + if (pending & 1) { + u32 preempt_count = preempt_count(); + +#if defined(CONFIG_PREEMPT_SOFTIRQS) && defined(CONFIG_PREEMPT_HARDIRQS) + /* + * If executed by a same-prio hardirq thread + * then skip pending softirqs that belong + * to softirq threads with different priority: + */ + if (same_prio_only) { + tsk = __get_cpu_var(ksoftirqd)[softirq].tsk; + if (tsk && tsk->normal_prio != + current->normal_prio) { + same_prio_skipped |= softirq_mask; + available_mask &= ~softirq_mask; + goto next; + } + } +#endif + /* + * Is this softirq already being processed? 
+ */ + if (per_cpu(softirq_running, cpu) & softirq_mask) { + available_mask &= ~softirq_mask; + goto next; + } + per_cpu(softirq_running, cpu) |= softirq_mask; + local_irq_enable(); + h->action(h); + if (preempt_count != preempt_count()) { + print_symbol("BUG: softirq exited %s with wrong preemption count!\n", (unsigned long) h->action); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + preempt_count() = preempt_count; + } rcu_bh_qsctr_inc(cpu); + cond_resched_softirq_context(); + local_irq_disable(); + per_cpu(softirq_running, cpu) &= ~softirq_mask; } +next: h++; + softirq++; pending >>= 1; } while (pending); - local_irq_disable(); - + or_softirq_pending(same_prio_skipped); pending = local_softirq_pending(); - if (pending && --max_restart) - goto restart; + if (pending & available_mask) { + if (--max_restart) + goto restart; + /* + * With softirq threading there's no reason not to + * finish the workload we have: + */ +#ifdef CONFIG_PREEMPT_SOFTIRQS + if (--max_loops) { + if (printk_ratelimit()) + printk("INFO: softirq overload: %08x\n", pending); + max_restart = MAX_SOFTIRQ_RESTART; + goto restart; + } + if (printk_ratelimit()) + printk("BUG: softirq loop! %08x\n", pending); +#endif + } if (pending) - wakeup_softirqd(); + trigger_softirqs(); +} + +asmlinkage void __do_softirq(void) +{ +#ifdef CONFIG_PREEMPT_SOFTIRQS + /* + * 'preempt harder'. Push all softirq processing off to ksoftirqd. + */ + if (softirq_preemption) { + if (local_softirq_pending()) + trigger_softirqs(); + return; + } +#endif + /* + * 'immediate' softirq execution: + */ + __local_bh_disable((unsigned long)__builtin_return_address(0)); + trace_softirq_enter(); + + ___do_softirq(0); trace_softirq_exit(); account_system_vtime(current); _local_bh_enable(); + } #ifndef __ARCH_HAS_DO_SOFTIRQ @@ -315,7 +444,7 @@ void irq_exit(void) tick_nohz_stop_sched_tick(); rcu_irq_exit(); #endif - preempt_enable_no_resched(); + __preempt_enable_no_resched(); } /* @@ -323,19 +452,11 @@ void irq_exit(void) */ inline void raise_softirq_irqoff(unsigned int nr) { - __raise_softirq_irqoff(nr); + __do_raise_softirq_irqoff(nr); - /* - * If we're in an interrupt or softirq, we're done - * (this also catches softirq-disabled code). We will - * actually run the softirq once we return from - * the irq or softirq. - * - * Otherwise we wake up ksoftirqd to make sure we - * schedule the softirq soon. - */ - if (!in_interrupt()) - wakeup_softirqd(); +#ifdef CONFIG_PREEMPT_SOFTIRQS + wakeup_softirqd(nr); +#endif } void raise_softirq(unsigned int nr) @@ -364,14 +485,45 @@ struct tasklet_head static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL }; static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL }; +static void inline +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr) +{ + if (tasklet_trylock(t)) { +again: + /* We may have been preempted before tasklet_trylock + * and __tasklet_action may have already run. + * So double check the sched bit while the takslet + * is locked before adding it to the list. + */ + if (test_bit(TASKLET_STATE_SCHED, &t->state)) { + WARN_ON(t->next != NULL); + t->next = head->list; + head->list = t; + raise_softirq_irqoff(nr); + tasklet_unlock(t); + } else { + /* This is subtle. If we hit the corner case above + * It is possible that we get preempted right here, + * and another task has successfully called + * tasklet_schedule(), then this function, and + * failed on the trylock. 
Thus we must be sure + * before releasing the tasklet lock, that the + * SCHED_BIT is clear. Otherwise the tasklet + * may get its SCHED_BIT set, but not added to the + * list + */ + if (!tasklet_tryunlock(t)) + goto again; + } + } +} + void __tasklet_schedule(struct tasklet_struct *t) { unsigned long flags; local_irq_save(flags); - t->next = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = t; - raise_softirq_irqoff(TASKLET_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_vec), TASKLET_SOFTIRQ); local_irq_restore(flags); } @@ -382,81 +534,130 @@ void __tasklet_hi_schedule(struct taskle unsigned long flags; local_irq_save(flags); - t->next = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = t; - raise_softirq_irqoff(HI_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_hi_vec), HI_SOFTIRQ); local_irq_restore(flags); } EXPORT_SYMBOL(__tasklet_hi_schedule); -static void tasklet_action(struct softirq_action *a) +void tasklet_enable(struct tasklet_struct *t) { - struct tasklet_struct *list; + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_schedule(t); +} - local_irq_disable(); - list = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = NULL; - local_irq_enable(); +EXPORT_SYMBOL(tasklet_enable); + +void tasklet_hi_enable(struct tasklet_struct *t) +{ + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_hi_schedule(t); +} + +EXPORT_SYMBOL(tasklet_hi_enable); + +static void +__tasklet_action(struct softirq_action *a, struct tasklet_struct *list) +{ + int loops = 1000000; while (list) { struct tasklet_struct *t = list; list = list->next; + /* + * Should always succeed - after a tasklist got on the + * list (after getting the SCHED bit set from 0 to 1), + * nothing but the tasklet softirq it got queued to can + * lock it: + */ + if (!tasklet_trylock(t)) { + WARN_ON(1); + continue; + } + + t->next = NULL; + + /* + * If we cannot handle the tasklet because it's disabled, + * mark it as pending. tasklet_enable() will later + * re-schedule the tasklet. + */ + if (unlikely(atomic_read(&t->count))) { +out_disabled: + /* implicit unlock: */ + wmb(); + t->state = TASKLET_STATEF_PENDING; + continue; + } - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); + /* + * After this point on the tasklet might be rescheduled + * on another CPU, but it can only be added to another + * CPU's tasklet list if we unlock the tasklet (which we + * dont do yet). + */ + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + WARN_ON(1); + +again: + t->func(t->data); + + /* + * Try to unlock the tasklet. We must use cmpxchg, because + * another CPU might have scheduled or disabled the tasklet. + * We only allow the STATE_RUN -> 0 transition here. 
+ */ + while (!tasklet_tryunlock(t)) { + /* + * If it got disabled meanwhile, bail out: + */ + if (atomic_read(&t->count)) + goto out_disabled; + /* + * If it got scheduled meanwhile, re-execute + * the tasklet function: + */ + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + goto again; + if (!--loops) { + printk("hm, tasklet state: %08lx\n", t->state); + WARN_ON(1); tasklet_unlock(t); - continue; + break; } - tasklet_unlock(t); } - - local_irq_disable(); - t->next = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = t; - __raise_softirq_irqoff(TASKLET_SOFTIRQ); - local_irq_enable(); } } -static void tasklet_hi_action(struct softirq_action *a) +static void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; local_irq_disable(); - list = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = NULL; + list = __get_cpu_var(tasklet_vec).list; + __get_cpu_var(tasklet_vec).list = NULL; local_irq_enable(); - while (list) { - struct tasklet_struct *t = list; + __tasklet_action(a, list); +} - list = list->next; +static void tasklet_hi_action(struct softirq_action *a) +{ + struct tasklet_struct *list; - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } + local_irq_disable(); + list = __get_cpu_var(tasklet_hi_vec).list; + __get_cpu_var(tasklet_hi_vec).list = NULL; + local_irq_enable(); - local_irq_disable(); - t->next = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = t; - __raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_enable(); - } + __tasklet_action(a, list); } - void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data) { @@ -476,7 +677,7 @@ void tasklet_kill(struct tasklet_struct while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { do - yield(); + msleep(1); while (test_bit(TASKLET_STATE_SCHED, &t->state)); } tasklet_unlock_wait(t); @@ -491,33 +692,98 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); } -static int ksoftirqd(void * __bind_cpu) +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) + +void tasklet_unlock_wait(struct tasklet_struct *t) { + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { + /* + * Hack for now to avoid this busy-loop: + */ +#ifdef CONFIG_PREEMPT_RT + msleep(1); +#else + barrier(); +#endif + } +} +EXPORT_SYMBOL(tasklet_unlock_wait); + +#endif + +static int ksoftirqd(void * __data) +{ + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2 }; + struct softirqdata *data = __data; + u32 softirq_mask = (1 << data->nr); + struct softirq_action *h; + int cpu = data->cpu; + +#ifdef CONFIG_PREEMPT_SOFTIRQS + init_waitqueue_head(&data->wait); +#endif + + sys_sched_setscheduler(current->pid, SCHED_FIFO, ¶m); + current->flags |= PF_SOFTIRQ; set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { preempt_disable(); - if (!local_softirq_pending()) { - preempt_enable_no_resched(); + if (!(local_softirq_pending() & softirq_mask)) { +sleep_more: + __preempt_enable_no_resched(); schedule(); preempt_disable(); } __set_current_state(TASK_RUNNING); - while (local_softirq_pending()) { +#ifdef CONFIG_PREEMPT_SOFTIRQS + data->running = 1; +#endif + + while (local_softirq_pending() & softirq_mask) { /* Preempt disable stops cpu going offline. 
If already offline, we'll be on wrong CPU: don't process */ - if (cpu_is_offline((long)__bind_cpu)) + if (cpu_is_offline(cpu)) goto wait_to_die; - do_softirq(); - preempt_enable_no_resched(); + + local_irq_disable(); + /* + * Is the softirq already being executed by + * a hardirq context? + */ + if (per_cpu(softirq_running, cpu) & softirq_mask) { + local_irq_enable(); + set_current_state(TASK_INTERRUPTIBLE); + goto sleep_more; + } + per_cpu(softirq_running, cpu) |= softirq_mask; + __preempt_enable_no_resched(); + set_softirq_pending(local_softirq_pending() & ~softirq_mask); + local_bh_disable(); + local_irq_enable(); + + h = &softirq_vec[data->nr]; + if (h) + h->action(h); + rcu_bh_qsctr_inc(data->cpu); + + local_irq_disable(); + per_cpu(softirq_running, cpu) &= ~softirq_mask; + _local_bh_enable(); + local_irq_enable(); + cond_resched(); preempt_disable(); } preempt_enable(); set_current_state(TASK_INTERRUPTIBLE); +#ifdef CONFIG_PREEMPT_SOFTIRQS + data->running = 0; + wake_up(&data->wait); +#endif } __set_current_state(TASK_RUNNING); return 0; @@ -564,7 +830,7 @@ void tasklet_kill_immediate(struct taskl BUG(); } -static void takeover_tasklets(unsigned int cpu) +void takeover_tasklets(unsigned int cpu) { struct tasklet_struct **i; @@ -586,49 +852,83 @@ static void takeover_tasklets(unsigned i } #endif /* CONFIG_HOTPLUG_CPU */ +static const char *softirq_names [] = +{ + [HI_SOFTIRQ] = "high", + [SCHED_SOFTIRQ] = "sched", + [TIMER_SOFTIRQ] = "timer", + [NET_TX_SOFTIRQ] = "net-tx", + [NET_RX_SOFTIRQ] = "net-rx", + [BLOCK_SOFTIRQ] = "block", + [TASKLET_SOFTIRQ] = "tasklet", +#ifdef CONFIG_HIGH_RES_TIMERS + [HRTIMER_SOFTIRQ] = "hrtimer", +#endif + [RCU_SOFTIRQ] = "rcu", +}; + static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int hotcpu = (unsigned long)hcpu; + int hotcpu = (unsigned long)hcpu, i; struct task_struct *p; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); - if (IS_ERR(p)) { - printk("ksoftirqd for %i failed\n", hotcpu); - return NOTIFY_BAD; - } - kthread_bind(p, hotcpu); - per_cpu(ksoftirqd, hotcpu) = p; - break; + for (i = 0; i < MAX_SOFTIRQ; i++) { + per_cpu(ksoftirqd, hotcpu)[i].nr = i; + per_cpu(ksoftirqd, hotcpu)[i].cpu = hotcpu; + per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL; + } + for (i = 0; i < MAX_SOFTIRQ; i++) { + p = kthread_create(ksoftirqd, + &per_cpu(ksoftirqd, hotcpu)[i], + "sirq-%s/%d", softirq_names[i], + hotcpu); + if (IS_ERR(p)) { + printk("ksoftirqd %d for %i failed\n", i, + hotcpu); + return NOTIFY_BAD; + } + kthread_bind(p, hotcpu); + per_cpu(ksoftirqd, hotcpu)[i].tsk = p; + } + break; + break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(ksoftirqd, hotcpu)); + for (i = 0; i < MAX_SOFTIRQ; i++) + wake_up_process(per_cpu(ksoftirqd, hotcpu)[i].tsk); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(ksoftirqd, hotcpu)) - break; - /* Unbind so it can run. Fall thru. 
*/ - kthread_bind(per_cpu(ksoftirqd, hotcpu), - any_online_cpu(cpu_online_map)); +#if 0 + for (i = 0; i < MAX_SOFTIRQ; i++) { + if (!per_cpu(ksoftirqd, hotcpu)[i].tsk) + continue; + kthread_bind(per_cpu(ksoftirqd, hotcpu)[i].tsk, + any_online_cpu(cpu_online_map)); + } +#endif case CPU_DEAD: case CPU_DEAD_FROZEN: { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct sched_param param; - p = per_cpu(ksoftirqd, hotcpu); - per_cpu(ksoftirqd, hotcpu) = NULL; - sched_setscheduler(p, SCHED_FIFO, ¶m); - kthread_stop(p); + for (i = 0; i < MAX_SOFTIRQ; i++) { + param.sched_priority = MAX_RT_PRIO-1; + p = per_cpu(ksoftirqd, hotcpu)[i].tsk; + sched_setscheduler(p, SCHED_FIFO, ¶m); + per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL; + kthread_stop(p); + } takeover_tasklets(hotcpu); break; - } -#endif /* CONFIG_HOTPLUG_CPU */ } +#endif /* CONFIG_HOTPLUG_CPU */ + } return NOTIFY_OK; } @@ -647,6 +947,34 @@ __init int spawn_ksoftirqd(void) return 0; } + +#ifdef CONFIG_PREEMPT_SOFTIRQS + +int softirq_preemption = 1; + +EXPORT_SYMBOL(softirq_preemption); + +/* + * Real-Time Preemption depends on softirq threading: + */ +#ifndef CONFIG_PREEMPT_RT + +static int __init softirq_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + softirq_preemption = 0; + else + get_option(&str, &softirq_preemption); + if (!softirq_preemption) + printk("turning off softirq preemption!\n"); + + return 1; +} + +__setup("softirq-preempt=", softirq_preempt_setup); +#endif +#endif + #ifdef CONFIG_SMP /* * Call a function on all processors Index: linux-2.6.25.8-rt7/include/linux/irq.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/irq.h 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/irq.h 2008-06-23 19:13:47.000000000 -0400 @@ -19,10 +19,12 @@ #include #include #include +#include #include #include #include +#include struct irq_desc; typedef void (*irq_flow_handler_t)(unsigned int irq, @@ -61,6 +63,7 @@ typedef void (*irq_flow_handler_t)(unsig #define IRQ_WAKEUP 0x00100000 /* IRQ triggers system wakeup */ #define IRQ_MOVE_PENDING 0x00200000 /* need to re-target IRQ destination */ #define IRQ_NO_BALANCING 0x00400000 /* IRQ is excluded from balancing */ +#define IRQ_NODELAY 0x40000000 /* IRQ must run immediately */ #ifdef CONFIG_IRQ_PER_CPU # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) @@ -141,6 +144,8 @@ struct irq_chip { * @irq_count: stats field to detect stalled irqs * @irqs_unhandled: stats field for spurious unhandled interrupts * @last_unhandled: aging timer for unhandled count + * @thread: Thread pointer for threaded preemptible irq handling + * @wait_for_handler: Waitqueue to wait for a running preemptible handler * @lock: locking for SMP * @affinity: IRQ affinity on SMP * @cpu: cpu index useful for balancing @@ -163,7 +168,10 @@ struct irq_desc { unsigned int irq_count; /* For detecting broken IRQs */ unsigned int irqs_unhandled; unsigned long last_unhandled; /* Aging timer for unhandled count */ - spinlock_t lock; + struct task_struct *thread; + wait_queue_head_t wait_for_handler; + cycles_t timestamp; + raw_spinlock_t lock; #ifdef CONFIG_SMP cpumask_t affinity; unsigned int cpu; @@ -396,7 +404,21 @@ extern int set_irq_msi(unsigned int irq, #define get_irq_data(irq) (irq_desc[irq].handler_data) #define get_irq_msi(irq) (irq_desc[irq].msi_desc) -#endif /* CONFIG_GENERIC_HARDIRQS */ +/* Early initialization of irqs */ +extern void early_init_hardirqs(void); + +#if defined(CONFIG_PREEMPT_HARDIRQS) +extern void 
init_hardirqs(void); +#else +static inline void init_hardirqs(void) { } +#endif + +#else /* end GENERIC HARDIRQS */ + +static inline void early_init_hardirqs(void) { } +static inline void init_hardirqs(void) { } + +#endif /* !CONFIG_GENERIC_HARDIRQS */ #endif /* !CONFIG_S390 */ Index: linux-2.6.25.8-rt7/kernel/irq/autoprobe.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/irq/autoprobe.c 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/irq/autoprobe.c 2008-06-23 19:13:21.000000000 -0400 @@ -7,6 +7,7 @@ */ #include +#include #include #include #include Index: linux-2.6.25.8-rt7/kernel/irq/chip.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/irq/chip.c 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/irq/chip.c 2008-06-23 19:14:07.000000000 -0400 @@ -287,8 +287,10 @@ static inline void mask_ack_irq(struct i if (desc->chip->mask_ack) desc->chip->mask_ack(irq); else { - desc->chip->mask(irq); - desc->chip->ack(irq); + if (desc->chip->mask) + desc->chip->mask(irq); + if (desc->chip->ack) + desc->chip->ack(irq); } } @@ -313,8 +315,10 @@ handle_simple_irq(unsigned int irq, stru spin_lock(&desc->lock); - if (unlikely(desc->status & IRQ_INPROGRESS)) + if (unlikely(desc->status & IRQ_INPROGRESS)) { + desc->status |= IRQ_PENDING; goto out_unlock; + } desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_cpu(cpu).irqs[irq]++; @@ -323,6 +327,11 @@ handle_simple_irq(unsigned int irq, stru goto out_unlock; desc->status |= IRQ_INPROGRESS; + /* + * hardirq redirection to the irqd process context: + */ + if (redirect_hardirq(desc)) + goto out_unlock; spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); @@ -331,6 +340,8 @@ handle_simple_irq(unsigned int irq, stru spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; + if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) + desc->chip->unmask(irq); out_unlock: spin_unlock(&desc->lock); } @@ -369,6 +380,13 @@ handle_level_irq(unsigned int irq, struc goto out_unlock; desc->status |= IRQ_INPROGRESS; + + /* + * hardirq redirection to the irqd process context: + */ + if (redirect_hardirq(desc)) + goto out_unlock; + spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); @@ -402,18 +420,16 @@ handle_fasteoi_irq(unsigned int irq, str spin_lock(&desc->lock); - if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_cpu(cpu).irqs[irq]++; /* - * If its disabled or no action available + * If it's running, disabled or no action available * then mask it and get out of here: */ action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) { + if (unlikely(!action || (desc->status & (IRQ_INPROGRESS | + IRQ_DISABLED)))) { desc->status |= IRQ_PENDING; if (desc->chip->mask) desc->chip->mask(irq); @@ -421,6 +437,15 @@ handle_fasteoi_irq(unsigned int irq, str } desc->status |= IRQ_INPROGRESS; + /* + * In the threaded case we fall back to a mask+eoi sequence: + */ + if (redirect_hardirq(desc)) { + if (desc->chip->mask) + desc->chip->mask(irq); + goto out; + } + desc->status &= ~IRQ_PENDING; spin_unlock(&desc->lock); @@ -430,9 +455,10 @@ handle_fasteoi_irq(unsigned int irq, str spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; + if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) + desc->chip->unmask(irq); out: desc->chip->eoi(irq); - spin_unlock(&desc->lock); } @@ -481,6 +507,12 @@ handle_edge_irq(unsigned int irq, 
struct /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; + /* + * hardirq redirection to the irqd process context: + */ + if (redirect_hardirq(desc)) + goto out_unlock; + do { struct irqaction *action = desc->action; irqreturn_t action_ret; Index: linux-2.6.25.8-rt7/kernel/irq/handle.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/irq/handle.c 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/irq/handle.c 2008-06-23 19:13:47.000000000 -0400 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -53,12 +54,13 @@ struct irq_desc irq_desc[NR_IRQS] __cach .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), + .lock = RAW_SPIN_LOCK_UNLOCKED(irq_desc), #ifdef CONFIG_SMP .affinity = CPU_MASK_ALL #endif } }; +EXPORT_SYMBOL_GPL(irq_desc); /* * What should we do if we get a hw irq event on an illegal vector? @@ -131,26 +133,87 @@ irqreturn_t handle_IRQ_event(unsigned in irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; +#ifdef __i386__ + if (debug_direct_keyboard && irq == 1) + lockdep_off(); +#endif + handle_dynamic_tick(action); - if (!(action->flags & IRQF_DISABLED)) - local_irq_enable_in_hardirq(); + /* + * Unconditionally enable interrupts for threaded + * IRQ handlers: + */ + if (!hardirq_count() || !(action->flags & IRQF_DISABLED)) + local_irq_enable(); do { + unsigned int preempt_count = preempt_count(); + ret = action->handler(irq, action->dev_id); + if (preempt_count() != preempt_count) { + print_symbol("BUG: unbalanced irq-handler preempt count in %s!\n", (unsigned long) action->handler); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + dump_stack(); + preempt_count() = preempt_count; + } if (ret == IRQ_HANDLED) status |= action->flags; retval |= ret; action = action->next; } while (action); - if (status & IRQF_SAMPLE_RANDOM) + if (status & IRQF_SAMPLE_RANDOM) { + local_irq_enable(); add_interrupt_randomness(irq); + } local_irq_disable(); +#ifdef __i386__ + if (debug_direct_keyboard && irq == 1) + lockdep_on(); +#endif return retval; } +/* + * Hack - used for development only. + */ +int __read_mostly debug_direct_keyboard = 0; + +int __init debug_direct_keyboard_setup(char *str) +{ + debug_direct_keyboard = 1; + printk(KERN_INFO "Switching IRQ 1 (keyboard) to to direct!\n"); +#ifdef CONFIG_PREEMPT_RT + printk(KERN_INFO "WARNING: kernel may easily crash this way!\n"); +#endif + return 1; +} + +__setup("debug_direct_keyboard", debug_direct_keyboard_setup); + +int redirect_hardirq(struct irq_desc *desc) +{ + /* + * Direct execution: + */ + if (!hardirq_preemption || (desc->status & IRQ_NODELAY) || + !desc->thread) + return 0; + +#ifdef __i386__ + if (debug_direct_keyboard && (desc - irq_desc == 1)) + return 0; +#endif + + BUG_ON(!irqs_disabled()); + if (desc->thread && desc->thread->state != TASK_RUNNING) + wake_up_process(desc->thread); + + return 1; +} + #ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ /** * __do_IRQ - original all in one highlevel IRQ handler @@ -186,6 +249,13 @@ unsigned int __do_IRQ(unsigned int irq) desc->chip->end(irq); return 1; } + /* + * If the task is currently running in user mode, don't + * detect soft lockups. If CONFIG_DETECT_SOFTLOCKUP is not + * configured, this should be optimized out. 
+ */ + if (user_mode(get_irq_regs())) + touch_softlockup_watchdog(); spin_lock(&desc->lock); if (desc->chip->ack) Index: linux-2.6.25.8-rt7/kernel/irq/internals.h =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/irq/internals.h 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/irq/internals.h 2008-06-23 19:13:21.000000000 -0400 @@ -10,6 +10,10 @@ extern void irq_chip_set_defaults(struct /* Set default handler: */ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); +extern int redirect_hardirq(struct irq_desc *desc); + +void recalculate_desc_flags(struct irq_desc *desc); + #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq); extern void register_handler_proc(unsigned int irq, struct irqaction *action); Index: linux-2.6.25.8-rt7/kernel/irq/manage.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/irq/manage.c 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/irq/manage.c 2008-06-23 19:14:17.000000000 -0400 @@ -8,8 +8,10 @@ */ #include -#include #include +#include +#include +#include #include #include "internals.h" @@ -41,8 +43,12 @@ void synchronize_irq(unsigned int irq) * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ - while (desc->status & IRQ_INPROGRESS) - cpu_relax(); + if (hardirq_preemption && !(desc->status & IRQ_NODELAY)) + wait_event(desc->wait_for_handler, + !(desc->status & IRQ_INPROGRESS)); + else + while (desc->status & IRQ_INPROGRESS) + cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ spin_lock_irqsave(&desc->lock, flags); @@ -185,6 +191,14 @@ void enable_irq(unsigned int irq) desc->depth--; } spin_unlock_irqrestore(&desc->lock, flags); +#ifdef CONFIG_HARDIRQS_SW_RESEND + /* + * Do a bh disable/enable pair to trigger any pending + * irq resend logic: + */ + local_bh_disable(); + local_bh_enable(); +#endif } EXPORT_SYMBOL(enable_irq); @@ -234,6 +248,21 @@ int set_irq_wake(unsigned int irq, unsig EXPORT_SYMBOL(set_irq_wake); /* + * If any action has IRQF_NODELAY then turn IRQ_NODELAY on: + */ +void recalculate_desc_flags(struct irq_desc *desc) +{ + struct irqaction *action; + + desc->status &= ~IRQ_NODELAY; + for (action = desc->action ; action; action = action->next) + if (action->flags & IRQF_NODELAY) + desc->status |= IRQ_NODELAY; +} + +static int start_irq_thread(int irq, struct irq_desc *desc); + +/* * Internal function that tells the architecture code whether a * particular irq has been exclusively allocated or is available * for driver use. 
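To illustrate how a driver opts out of hardirq threading with this machinery: passing IRQF_NODELAY at request time makes setup_irq() (next hunk) skip start_irq_thread(), recalculate_desc_flags() folds the flag into the descriptor's IRQ_NODELAY bit, and redirect_hardirq() then leaves the handler running directly in hard-irq context. A minimal sketch follows; the handler, device name and dev pointer are hypothetical, not taken from the patch:

static irqreturn_t fastpath_isr(int irq, void *dev_id)
{
	/* runs in raw hard-irq context, so keep it short and non-sleeping */
	return IRQ_HANDLED;
}

static int fastpath_request(unsigned int irq, void *dev)
{
	/* IRQF_NODELAY keeps this action off the per-IRQ irqd thread */
	return request_irq(irq, fastpath_isr,
			   IRQF_DISABLED | IRQF_NODELAY,
			   "fastpath-example", dev);
}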
@@ -298,6 +327,9 @@ int setup_irq(unsigned int irq, struct i rand_initialize_irq(irq); } + if (!(new->flags & IRQF_NODELAY)) + if (start_irq_thread(irq, desc)) + return -ENOMEM; /* * The following block of code has to be executed atomically */ @@ -338,6 +370,11 @@ int setup_irq(unsigned int irq, struct i if (new->flags & IRQF_NOBALANCING) desc->status |= IRQ_NO_BALANCING; + /* + * Propagate any possible IRQF_NODELAY flag into IRQ_NODELAY: + */ + recalculate_desc_flags(desc); + if (!shared) { irq_chip_set_defaults(desc->chip); @@ -384,7 +421,7 @@ int setup_irq(unsigned int irq, struct i new->irq = irq; register_irq_proc(irq); - new->dir = NULL; + new->dir = new->threaded = NULL; register_handler_proc(irq, new); return 0; @@ -455,6 +492,7 @@ void free_irq(unsigned int irq, void *de else desc->chip->disable(irq); } + recalculate_desc_flags(desc); spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -470,9 +508,9 @@ void free_irq(unsigned int irq, void *de * parallel with our fake */ if (action->flags & IRQF_SHARED) { - local_irq_save(flags); + local_irq_save_nort(flags); action->handler(irq, dev_id); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #endif kfree(action); @@ -567,9 +605,9 @@ int request_irq(unsigned int irq, irq_ha */ unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); handler(irq, dev_id); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #endif @@ -580,3 +618,264 @@ int request_irq(unsigned int irq, irq_ha return retval; } EXPORT_SYMBOL(request_irq); + +#ifdef CONFIG_PREEMPT_HARDIRQS + +int hardirq_preemption = 1; + +EXPORT_SYMBOL(hardirq_preemption); + +/* + * Real-Time Preemption depends on hardirq threading: + */ +#ifndef CONFIG_PREEMPT_RT + +static int __init hardirq_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + hardirq_preemption = 0; + else + get_option(&str, &hardirq_preemption); + if (!hardirq_preemption) + printk("turning off hardirq preemption!\n"); + + return 1; +} + +__setup("hardirq-preempt=", hardirq_preempt_setup); + +#endif + +/* + * threaded simple handler + */ +static void thread_simple_irq(irq_desc_t *desc) +{ + struct irqaction *action = desc->action; + unsigned int irq = desc - irq_desc; + irqreturn_t action_ret; + + do { + if (!action || desc->depth) + break; + desc->status &= ~IRQ_PENDING; + spin_unlock(&desc->lock); + action_ret = handle_IRQ_event(irq, action); + cond_resched_hardirq_context(); + spin_lock_irq(&desc->lock); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + } while (desc->status & IRQ_PENDING); + desc->status &= ~IRQ_INPROGRESS; +} + +/* + * threaded level type irq handler + */ +static void thread_level_irq(irq_desc_t *desc) +{ + unsigned int irq = desc - irq_desc; + + thread_simple_irq(desc); + if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) + desc->chip->unmask(irq); +} + +/* + * threaded fasteoi type irq handler + */ +static void thread_fasteoi_irq(irq_desc_t *desc) +{ + unsigned int irq = desc - irq_desc; + + thread_simple_irq(desc); + if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) + desc->chip->unmask(irq); +} + +/* + * threaded edge type IRQ handler + */ +static void thread_edge_irq(irq_desc_t *desc) +{ + unsigned int irq = desc - irq_desc; + + do { + struct irqaction *action = desc->action; + irqreturn_t action_ret; + + if (unlikely(!action)) { + desc->status &= ~IRQ_INPROGRESS; + desc->chip->mask(irq); + return; + } + + /* + * When another irq arrived while we were handling + * one, we could have 
masked the irq. + * Renable it, if it was not disabled in meantime. + */ + if (unlikely(((desc->status & (IRQ_PENDING | IRQ_MASKED)) == + (IRQ_PENDING | IRQ_MASKED)) && !desc->depth)) + desc->chip->unmask(irq); + + desc->status &= ~IRQ_PENDING; + spin_unlock(&desc->lock); + action_ret = handle_IRQ_event(irq, action); + spin_lock_irq(&desc->lock); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + } while ((desc->status & IRQ_PENDING) && !desc->depth); + + desc->status &= ~IRQ_INPROGRESS; +} + +/* + * threaded edge type IRQ handler + */ +static void thread_do_irq(irq_desc_t *desc) +{ + unsigned int irq = desc - irq_desc; + + do { + struct irqaction *action = desc->action; + irqreturn_t action_ret; + + if (unlikely(!action)) { + desc->status &= ~IRQ_INPROGRESS; + desc->chip->disable(irq); + return; + } + + desc->status &= ~IRQ_PENDING; + spin_unlock(&desc->lock); + action_ret = handle_IRQ_event(irq, action); + spin_lock_irq(&desc->lock); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + } while ((desc->status & IRQ_PENDING) && !desc->depth); + + desc->status &= ~IRQ_INPROGRESS; + desc->chip->end(irq); +} + +static void do_hardirq(struct irq_desc *desc) +{ + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + + if (!(desc->status & IRQ_INPROGRESS)) + goto out; + + if (desc->handle_irq == handle_simple_irq) + thread_simple_irq(desc); + else if (desc->handle_irq == handle_level_irq) + thread_level_irq(desc); + else if (desc->handle_irq == handle_fasteoi_irq) + thread_fasteoi_irq(desc); + else if (desc->handle_irq == handle_edge_irq) + thread_edge_irq(desc); + else + thread_do_irq(desc); + out: + spin_unlock_irqrestore(&desc->lock, flags); + + if (waitqueue_active(&desc->wait_for_handler)) + wake_up(&desc->wait_for_handler); +} + +static int do_irqd(void * __desc) +{ + struct sched_param param = { 0, }; + struct irq_desc *desc = __desc; + +#ifdef CONFIG_SMP + cpumask_t cpus_allowed; + + cpus_allowed = desc->affinity; +#endif + current->flags |= PF_NOFREEZE | PF_HARDIRQ; + + /* + * Set irq thread priority to SCHED_FIFO/50: + */ + param.sched_priority = MAX_USER_RT_PRIO/2; + + sys_sched_setscheduler(current->pid, SCHED_FIFO, ¶m); + + while (!kthread_should_stop()) { + local_irq_disable_nort(); + do { + set_current_state(TASK_INTERRUPTIBLE); + do_hardirq(desc); + } while (current->state == TASK_RUNNING); + + local_irq_enable_nort(); +#ifdef CONFIG_SMP + /* + * Did IRQ affinities change? 
+ */ + if (!cpus_equal(cpus_allowed, desc->affinity)) + cpus_allowed = desc->affinity; +#endif + schedule(); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +static int ok_to_create_irq_threads; + +static int start_irq_thread(int irq, struct irq_desc *desc) +{ + if (desc->thread || !ok_to_create_irq_threads) + return 0; + + desc->thread = kthread_create(do_irqd, desc, "IRQ-%d", irq); + if (!desc->thread) { + printk(KERN_ERR "irqd: could not create IRQ thread %d!\n", irq); + return -ENOMEM; + } + + /* + * An interrupt may have come in before the thread pointer was + * stored in desc->thread; make sure the thread gets woken up in + * such a case: + */ + smp_mb(); + wake_up_process(desc->thread); + + return 0; +} + +void __init init_hardirqs(void) +{ + int i; + ok_to_create_irq_threads = 1; + + for (i = 0; i < NR_IRQS; i++) { + irq_desc_t *desc = irq_desc + i; + + if (desc->action && !(desc->status & IRQ_NODELAY)) + start_irq_thread(i, desc); + } +} + +#else + +static int start_irq_thread(int irq, struct irq_desc *desc) +{ + return 0; +} + +#endif + +void __init early_init_hardirqs(void) +{ + int i; + + for (i = 0; i < NR_IRQS; i++) + init_waitqueue_head(&irq_desc[i].wait_for_handler); +} Index: linux-2.6.25.8-rt7/kernel/irq/proc.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/irq/proc.c 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/irq/proc.c 2008-06-23 19:13:21.000000000 -0400 @@ -7,6 +7,8 @@ */ #include +#include +#include #include #include @@ -87,44 +89,6 @@ static int irq_spurious_read(char *page, jiffies_to_msecs(d->last_unhandled)); } -#define MAX_NAMELEN 128 - -static int name_unique(unsigned int irq, struct irqaction *new_action) -{ - struct irq_desc *desc = irq_desc + irq; - struct irqaction *action; - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action ; action; action = action->next) { - if ((action != new_action) && action->name && - !strcmp(new_action->name, action->name)) { - ret = 0; - break; - } - } - spin_unlock_irqrestore(&desc->lock, flags); - return ret; -} - -void register_handler_proc(unsigned int irq, struct irqaction *action) -{ - char name [MAX_NAMELEN]; - - if (!irq_desc[irq].dir || action->dir || !action->name || - !name_unique(irq, action)) - return; - - memset(name, 0, MAX_NAMELEN); - snprintf(name, MAX_NAMELEN, "%s", action->name); - - /* create /proc/irq/1234/handler/ */ - action->dir = proc_mkdir(name, irq_desc[irq].dir); -} - -#undef MAX_NAMELEN - #define MAX_NAMELEN 10 void register_irq_proc(unsigned int irq) @@ -167,10 +131,96 @@ void register_irq_proc(unsigned int irq) void unregister_handler_proc(unsigned int irq, struct irqaction *action) { + if (action->threaded) + remove_proc_entry(action->threaded->name, action->dir); if (action->dir) remove_proc_entry(action->dir->name, irq_desc[irq].dir); } +#ifndef CONFIG_PREEMPT_RT + +static int threaded_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + return sprintf(page, "%c\n", + ((struct irqaction *)data)->flags & IRQF_NODELAY ? 
'0' : '1'); +} + +static int threaded_write_proc(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + int c; + struct irqaction *action = data; + irq_desc_t *desc = irq_desc + action->irq; + + if (get_user(c, buffer)) + return -EFAULT; + if (c != '0' && c != '1') + return -EINVAL; + + spin_lock_irq(&desc->lock); + + if (c == '0') + action->flags |= IRQF_NODELAY; + if (c == '1') + action->flags &= ~IRQF_NODELAY; + recalculate_desc_flags(desc); + + spin_unlock_irq(&desc->lock); + + return 1; +} + +#endif + +#define MAX_NAMELEN 128 + +static int name_unique(unsigned int irq, struct irqaction *new_action) +{ + struct irq_desc *desc = irq_desc + irq; + struct irqaction *action; + + for (action = desc->action ; action; action = action->next) + if ((action != new_action) && action->name && + !strcmp(new_action->name, action->name)) + return 0; + return 1; +} + +void register_handler_proc(unsigned int irq, struct irqaction *action) +{ + char name [MAX_NAMELEN]; + + if (!irq_desc[irq].dir || action->dir || !action->name || + !name_unique(irq, action)) + return; + + memset(name, 0, MAX_NAMELEN); + snprintf(name, MAX_NAMELEN, "%s", action->name); + + /* create /proc/irq/1234/handler/ */ + action->dir = proc_mkdir(name, irq_desc[irq].dir); + + if (!action->dir) + return; +#ifndef CONFIG_PREEMPT_RT + { + struct proc_dir_entry *entry; + /* create /proc/irq/1234/handler/threaded */ + entry = create_proc_entry("threaded", 0600, action->dir); + if (!entry) + return; + entry->nlink = 1; + entry->data = (void *)action; + entry->read_proc = threaded_read_proc; + entry->write_proc = threaded_write_proc; + action->threaded = entry; + } +#endif +} + +#undef MAX_NAMELEN + void init_irq_proc(void) { int i; @@ -180,6 +230,9 @@ void init_irq_proc(void) if (!root_irq_dir) return; + /* create /proc/irq/prof_cpu_mask */ + create_prof_cpu_mask(root_irq_dir); + /* * Create entries for all existing IRQs. 
*/ Index: linux-2.6.25.8-rt7/kernel/irq/spurious.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/irq/spurious.c 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/irq/spurious.c 2008-06-23 19:13:47.000000000 -0400 @@ -13,6 +13,11 @@ #include #include +#ifdef CONFIG_X86_IO_APIC +# include +# include +#endif + static int irqfixup __read_mostly; /* @@ -57,9 +62,8 @@ static int misrouted_irq(int irq) } action = action->next; } - local_irq_disable(); /* Now clean up the flags */ - spin_lock(&desc->lock); + spin_lock_irq(&desc->lock); action = desc->action; /* @@ -205,6 +209,12 @@ void note_interrupt(unsigned int irq, st * The interrupt is stuck */ __report_bad_irq(irq, desc, action_ret); +#ifdef CONFIG_X86_IO_APIC + if (!sis_apic_bug) { + sis_apic_bug = 1; + printk(KERN_ERR "turning off IO-APIC fast mode.\n"); + } +#else /* * Now kill the IRQ */ @@ -212,6 +222,7 @@ void note_interrupt(unsigned int irq, st desc->status |= IRQ_DISABLED; desc->depth = 1; desc->chip->disable(irq); +#endif } desc->irqs_unhandled = 0; } @@ -232,6 +243,11 @@ MODULE_PARM_DESC(noirqdebug, "Disable ir static int __init irqfixup_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT + printk(KERN_WARNING "irqfixup boot option not supported " + "w/ CONFIG_PREEMPT_RT\n"); + return 1; +#endif irqfixup = 1; printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); printk(KERN_WARNING "This may impact system performance.\n"); @@ -245,6 +261,11 @@ MODULE_PARM_DESC("irqfixup", "0: No fixu static int __init irqpoll_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT + printk(KERN_WARNING "irqpoll boot option not supported " + "w/ CONFIG_PREEMPT_RT\n"); + return 1; +#endif irqfixup = 2; printk(KERN_WARNING "Misrouted IRQ fixup and polling support " "enabled\n"); Index: linux-2.6.25.8-rt7/include/linux/timer.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/timer.h 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/timer.h 2008-06-23 19:13:21.000000000 -0400 @@ -144,10 +144,12 @@ static inline void add_timer(struct time __mod_timer(timer, timer->expires); } -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) + extern int timer_pending_sync(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); extern int del_timer_sync(struct timer_list *timer); #else +# define timer_pending_sync(t) timer_pending(t) # define try_to_del_timer_sync(t) del_timer(t) # define del_timer_sync(t) del_timer(t) #endif Index: linux-2.6.25.8-rt7/kernel/timer.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/timer.c 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/timer.c 2008-06-23 19:14:24.000000000 -0400 @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -69,6 +70,7 @@ struct tvec_root { struct tvec_base { spinlock_t lock; struct timer_list *running_timer; + wait_queue_head_t wait_for_running_timer; unsigned long timer_jiffies; struct tvec_root tv1; struct tvec tv2; @@ -247,9 +249,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_relative static inline void set_running_timer(struct tvec_base *base, struct timer_list *timer) { -#ifdef CONFIG_SMP base->running_timer = timer; -#endif } static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) @@ -393,7 +393,7 @@ int __mod_timer(struct timer_list *timer { struct tvec_base *base, *new_base; unsigned 
long flags; - int ret = 0; + int ret = 0, cpu; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -405,7 +405,8 @@ int __mod_timer(struct timer_list *timer ret = 1; } - new_base = __get_cpu_var(tvec_bases); + cpu = raw_smp_processor_id(); + new_base = per_cpu(tvec_bases, cpu); if (base != new_base) { /* @@ -463,6 +464,18 @@ void add_timer_on(struct timer_list *tim spin_unlock_irqrestore(&base->lock, flags); } +/* + * Wait for a running timer + */ +void wait_for_running_timer(struct timer_list *timer) +{ + struct tvec_base *base = timer->base; + + if (base->running_timer == timer) + wait_event(base->wait_for_running_timer, + base->running_timer != timer); +} + /** * mod_timer - modify a timer's timeout * @timer: the timer to be modified @@ -533,7 +546,35 @@ int del_timer(struct timer_list *timer) EXPORT_SYMBOL(del_timer); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) +/* + * This function checks whether a timer is active and not running on any + * CPU. Upon successful (ret >= 0) exit the timer is not queued and the + * handler is not running on any CPU. + * + * It must not be called from interrupt contexts. + */ +int timer_pending_sync(struct timer_list *timer) +{ + struct tvec_base *base; + unsigned long flags; + int ret = -1; + + base = lock_timer_base(timer, &flags); + + if (base->running_timer == timer) + goto out; + + ret = 0; + if (timer_pending(timer)) + ret = 1; +out: + spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} + + /** * try_to_del_timer_sync - Try to deactivate a timer * @timer: timer do del @@ -590,7 +631,7 @@ int del_timer_sync(struct timer_list *ti int ret = try_to_del_timer_sync(timer); if (ret >= 0) return ret; - cpu_relax(); + wait_for_running_timer(timer); } } @@ -636,6 +677,20 @@ static inline void __run_timers(struct t struct list_head *head = &work_list; int index = base->timer_jiffies & TVR_MASK; + if (softirq_need_resched()) { + spin_unlock_irq(&base->lock); + wake_up(&base->wait_for_running_timer); + cond_resched_softirq_context(); + cpu_relax(); + spin_lock_irq(&base->lock); + /* + * We can simply continue after preemption, nobody + * else can touch timer_jiffies so 'index' is still + * valid. Any new jiffy will be taken care of in + * subsequent loops: + */ + } + /* * Cascade timers: */ @@ -663,18 +718,17 @@ static inline void __run_timers(struct t int preempt_count = preempt_count(); fn(data); if (preempt_count != preempt_count()) { - printk(KERN_ERR "huh, entered %p " - "with preempt_count %08x, exited" - " with %08x?\n", - fn, preempt_count, - preempt_count()); - BUG(); + print_symbol("BUG: unbalanced timer-handler preempt count in %s!\n", (unsigned long) fn); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + preempt_count() = preempt_count; } } + set_running_timer(base, NULL); + cond_resched_softirq_context(); spin_lock_irq(&base->lock); } } - set_running_timer(base, NULL); + wake_up(&base->wait_for_running_timer); spin_unlock_irq(&base->lock); } @@ -804,9 +858,22 @@ unsigned long get_next_timer_interrupt(u struct tvec_base *base = __get_cpu_var(tvec_bases); unsigned long expires; +#ifdef CONFIG_PREEMPT_RT + /* + * On PREEMPT_RT we cannot sleep here. 
If the trylock does not + * succeed then we return the worst-case 'expires in 1 tick' + * value: + */ + if (spin_trylock(&base->lock)) { + expires = __next_timer_interrupt(base); + spin_unlock(&base->lock); + } else + expires = now + 1; +#else spin_lock(&base->lock); expires = __next_timer_interrupt(base); spin_unlock(&base->lock); +#endif if (time_before_eq(expires, now)) return now; @@ -849,10 +916,10 @@ void update_process_times(int user_tick) /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); + scheduler_tick(); run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); - scheduler_tick(); run_posix_cpu_timers(p); } @@ -861,8 +928,29 @@ void update_process_times(int user_tick) */ static unsigned long count_active_tasks(void) { + /* + * On PREEMPT_RT, we are running in the timer softirq thread, + * so consider 1 less running tasks: + */ +#ifdef CONFIG_PREEMPT_RT + return (nr_active() - 1) * FIXED_1; +#else return nr_active() * FIXED_1; +#endif +} + +#ifdef CONFIG_PREEMPT_RT +/* + * Nr of active tasks - counted in fixed-point numbers + */ +static unsigned long count_active_rt_tasks(void) +{ + extern unsigned long rt_nr_running(void); + extern unsigned long rt_nr_uninterruptible(void); + + return (rt_nr_running() + rt_nr_uninterruptible()) * FIXED_1; } +#endif /* * Hmm.. Changed this, as the GNU make sources (load.c) seems to @@ -876,6 +964,8 @@ unsigned long avenrun[3]; EXPORT_SYMBOL(avenrun); +unsigned long avenrun_rt[3]; + /* * calc_load - given tick count, update the avenrun load estimates. * This is called while holding a write_lock on xtime_lock. @@ -884,33 +974,32 @@ static inline void calc_load(unsigned lo { unsigned long active_tasks; /* fixed-point */ static int count = LOAD_FREQ; +#ifdef CONFIG_PREEMPT_RT + unsigned long active_rt_tasks; /* fixed-point */ +#endif count -= ticks; if (unlikely(count < 0)) { active_tasks = count_active_tasks(); +#ifdef CONFIG_PREEMPT_RT + active_rt_tasks = count_active_rt_tasks(); +#endif do { CALC_LOAD(avenrun[0], EXP_1, active_tasks); CALC_LOAD(avenrun[1], EXP_5, active_tasks); CALC_LOAD(avenrun[2], EXP_15, active_tasks); +#ifdef CONFIG_PREEMPT_RT + CALC_LOAD(avenrun_rt[0], EXP_1, active_rt_tasks); + CALC_LOAD(avenrun_rt[1], EXP_5, active_rt_tasks); + CALC_LOAD(avenrun_rt[2], EXP_15, active_rt_tasks); +#endif count += LOAD_FREQ; + } while (count < 0); } } /* - * This function runs timers and the timer-tq in bottom half context. - */ -static void run_timer_softirq(struct softirq_action *h) -{ - struct tvec_base *base = __get_cpu_var(tvec_bases); - - hrtimer_run_pending(); - - if (time_after_eq(jiffies, base->timer_jiffies)) - __run_timers(base); -} - -/* * Called by the local, per-CPU timer interrupt on SMP. */ void run_local_timers(void) @@ -921,13 +1010,42 @@ void run_local_timers(void) } /* - * Called by the timer interrupt. xtime_lock must already be taken - * by the timer IRQ! 
+ * Time of day handling: */ -static inline void update_times(unsigned long ticks) +static inline void update_times(void) { - update_wall_time(); - calc_load(ticks); + static unsigned long last_tick = INITIAL_JIFFIES; + unsigned long ticks, flags; + + /* + * Dont take the xtime_lock from every CPU in + * every tick - only when needed: + */ + if (jiffies == last_tick) + return; + + write_seqlock_irqsave(&xtime_lock, flags); + ticks = jiffies - last_tick; + if (ticks) { + last_tick += ticks; + calc_load(ticks); + } + write_sequnlock_irqrestore(&xtime_lock, flags); +} + + +/* + * This function runs timers and the timer-tq in bottom half context. + */ +static void run_timer_softirq(struct softirq_action *h) +{ + struct tvec_base *base = per_cpu(tvec_bases, raw_smp_processor_id()); + + update_times(); + hrtimer_run_pending(); + + if (time_after_eq(jiffies, base->timer_jiffies)) + __run_timers(base); } /* @@ -939,7 +1057,7 @@ static inline void update_times(unsigned void do_timer(unsigned long ticks) { jiffies_64 += ticks; - update_times(ticks); + update_wall_time(); } #ifdef __ARCH_WANT_SYS_ALARM @@ -1278,6 +1396,7 @@ static int __cpuinit init_timers_cpu(int spin_lock_init(&base->lock); lockdep_set_class(&base->lock, base_lock_keys + cpu); + init_waitqueue_head(&base->wait_for_running_timer); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); @@ -1315,7 +1434,7 @@ static void __cpuinit migrate_timers(int old_base = per_cpu(tvec_bases, cpu); new_base = get_cpu_var(tvec_bases); - local_irq_disable(); + local_irq_disable_nort(); double_spin_lock(&new_base->lock, &old_base->lock, smp_processor_id() < cpu); @@ -1332,7 +1451,7 @@ static void __cpuinit migrate_timers(int double_spin_unlock(&new_base->lock, &old_base->lock, smp_processor_id() < cpu); - local_irq_enable(); + local_irq_enable_nort(); put_cpu_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU */ Index: linux-2.6.25.8-rt7/kernel/itimer.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/itimer.c 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/itimer.c 2008-06-23 19:13:21.000000000 -0400 @@ -170,6 +170,7 @@ again: /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { spin_unlock_irq(&tsk->sighand->siglock); + hrtimer_wait_for_timer(&tsk->signal->real_timer); goto again; } expires = timeval_to_ktime(value->it_value); Index: linux-2.6.25.8-rt7/kernel/posix-timers.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/posix-timers.c 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/posix-timers.c 2008-06-23 19:13:21.000000000 -0400 @@ -810,6 +810,7 @@ retry: unlock_timer(timr, flag); if (error == TIMER_RETRY) { + hrtimer_wait_for_timer(&timr->it.real.timer); rtn = NULL; // We already got the old time... 
goto retry; } @@ -849,6 +850,7 @@ retry_delete: if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); + hrtimer_wait_for_timer(&timer->it.real.timer); goto retry_delete; } @@ -881,6 +883,7 @@ retry_delete: if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); + hrtimer_wait_for_timer(&timer->it.real.timer); goto retry_delete; } list_del(&timer->list); Index: linux-2.6.25.8-rt7/arch/x86/kernel/i8259_32.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/i8259_32.c 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/i8259_32.c 2008-06-23 19:13:42.000000000 -0400 @@ -29,7 +29,7 @@ */ static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); static struct irq_chip i8259A_chip = { @@ -165,6 +165,8 @@ static void mask_and_ack_8259A(unsigned */ if (cached_irq_mask & irqmask) goto spurious_8259A_irq; + if (irq & 8) + outb(0x60+(irq&7),PIC_SLAVE_CMD); /* 'Specific EOI' to slave */ cached_irq_mask |= irqmask; handle_real_irq: @@ -292,10 +294,10 @@ void init_8259A(int auto_eoi) outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ - if (auto_eoi) /* master does Auto EOI */ - outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ - outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + if (!auto_eoi) /* master expects normal EOI */ + outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + else /* master does Auto EOI */ + outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ @@ -347,6 +349,7 @@ static irqreturn_t math_error_irq(int cp */ static struct irqaction fpu_irq = { .handler = math_error_irq, + .flags = IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "fpu", }; Index: linux-2.6.25.8-rt7/arch/x86/mach-default/setup.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/mach-default/setup.c 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/mach-default/setup.c 2008-06-23 19:13:22.000000000 -0400 @@ -37,6 +37,7 @@ void __init pre_intr_init_hook(void) */ static struct irqaction irq2 = { .handler = no_action, + .flags = IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "cascade", }; @@ -85,7 +86,7 @@ void __init trap_init_hook(void) static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "timer" }; Index: linux-2.6.25.8-rt7/arch/x86/mach-visws/visws_apic.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/mach-visws/visws_apic.c 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/mach-visws/visws_apic.c 2008-06-23 19:13:22.000000000 -0400 @@ -257,11 +257,13 @@ out_unlock: static struct irqaction master_action = { .handler = piix4_master_intr, .name = "PIIX4-8259", + .flags = IRQF_NODELAY, }; static struct irqaction cascade_action = { .handler = no_action, .name = "cascade", + .flags = IRQF_NODELAY, }; Index: 
linux-2.6.25.8-rt7/arch/x86/mach-voyager/setup.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/mach-voyager/setup.c 2008-06-23 19:12:55.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/mach-voyager/setup.c 2008-06-23 19:13:22.000000000 -0400 @@ -20,6 +20,7 @@ void __init pre_intr_init_hook(void) */ static struct irqaction irq2 = { .handler = no_action, + .flags = IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "cascade", }; @@ -46,7 +47,7 @@ void __init trap_init_hook(void) static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "timer" }; Index: linux-2.6.25.8-rt7/arch/mips/kernel/i8253.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/mips/kernel/i8253.c 2008-06-23 19:12:54.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/mips/kernel/i8253.c 2008-06-23 19:13:22.000000000 -0400 @@ -97,7 +97,7 @@ static irqreturn_t timer_interrupt(int i static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "timer" }; Index: linux-2.6.25.8-rt7/arch/x86/kernel/i8259_64.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/i8259_64.c 2008-06-23 19:12:54.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/i8259_64.c 2008-06-23 19:13:38.000000000 -0400 @@ -97,8 +97,8 @@ static void (*__initdata interrupt[NR_VE */ static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); +DEFINE_RAW_SPINLOCK(i8259A_lock); static struct irq_chip i8259A_chip = { .name = "XT-PIC", @@ -405,6 +405,7 @@ void init_8259A(int auto_eoi) static struct irqaction irq2 = { .handler = no_action, + .flags = IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "cascade", }; Index: linux-2.6.25.8-rt7/arch/x86/kernel/time_64.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/time_64.c 2008-06-23 19:12:54.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/time_64.c 2008-06-23 19:13:22.000000000 -0400 @@ -101,7 +101,8 @@ unsigned long __init native_calculate_cp static struct irqaction irq0 = { .handler = timer_event_interrupt, - .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, + .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | + IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "timer" }; Index: linux-2.6.25.8-rt7/arch/arm/common/time-acorn.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/arm/common/time-acorn.c 2008-06-23 19:12:54.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/arm/common/time-acorn.c 2008-06-23 19:13:23.000000000 -0400 @@ -75,7 +75,7 @@ ioc_timer_interrupt(int irq, void *dev_i static struct irqaction ioc_timer_irq = { .name = "timer", - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NODELAY, .handler = ioc_timer_interrupt }; Index: linux-2.6.25.8-rt7/arch/arm/oprofile/op_model_xscale.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/arm/oprofile/op_model_xscale.c 2008-06-23 19:12:54.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/arm/oprofile/op_model_xscale.c 2008-06-23 19:13:23.000000000 -0400 @@ -381,8 +381,9 @@ static int 
xscale_pmu_start(void) { int ret; u32 pmnc = read_pmnc(); + int irq_flags = IRQF_DISABLED | IRQF_NODELAY; - ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, IRQF_DISABLED, + ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, irq_flags, "XScale PMU", (void *)results); if (ret < 0) { Index: linux-2.6.25.8-rt7/arch/powerpc/kernel/ppc_ksyms.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/powerpc/kernel/ppc_ksyms.c 2008-06-23 19:12:54.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/powerpc/kernel/ppc_ksyms.c 2008-06-23 19:13:40.000000000 -0400 @@ -15,7 +15,6 @@ #include #include -#include #include #include #include @@ -46,7 +45,7 @@ #include #ifdef CONFIG_PPC64 -EXPORT_SYMBOL(local_irq_restore); +EXPORT_SYMBOL(raw_local_irq_restore); #endif #ifdef CONFIG_PPC32 @@ -166,7 +165,6 @@ EXPORT_SYMBOL(screen_info); #ifdef CONFIG_PPC32 EXPORT_SYMBOL(timer_interrupt); -EXPORT_SYMBOL(irq_desc); EXPORT_SYMBOL(tb_ticks_per_jiffy); EXPORT_SYMBOL(console_drivers); EXPORT_SYMBOL(cacheable_memcpy); Index: linux-2.6.25.8-rt7/arch/powerpc/platforms/iseries/setup.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/powerpc/platforms/iseries/setup.c 2008-06-23 19:12:54.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/powerpc/platforms/iseries/setup.c 2008-06-23 19:13:23.000000000 -0400 @@ -562,12 +562,14 @@ static void iseries_shared_idle(void) { while (1) { tick_nohz_stop_sched_tick(); - while (!need_resched() && !hvlpevent_is_pending()) { + while (!need_resched() && !need_resched_delayed() + && !hvlpevent_is_pending()) { local_irq_disable(); ppc64_runlatch_off(); /* Recheck with irqs off */ - if (!need_resched() && !hvlpevent_is_pending()) + if (!need_resched() && !need_resched_delayed() + && !hvlpevent_is_pending()) yield_shared_processor(); HMT_medium(); Index: linux-2.6.25.8-rt7/arch/powerpc/platforms/pseries/setup.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/powerpc/platforms/pseries/setup.c 2008-06-23 19:12:54.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/powerpc/platforms/pseries/setup.c 2008-06-23 19:13:23.000000000 -0400 @@ -413,7 +413,8 @@ static void pseries_dedicated_idle_sleep set_thread_flag(TIF_POLLING_NRFLAG); while (get_tb() < start_snooze) { - if (need_resched() || cpu_is_offline(cpu)) + if (need_resched() || need_resched_delayed() || + cpu_is_offline(cpu)) goto out; ppc64_runlatch_off(); HMT_low(); @@ -424,7 +425,8 @@ static void pseries_dedicated_idle_sleep clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb(); local_irq_disable(); - if (need_resched() || cpu_is_offline(cpu)) + if (need_resched() || need_resched_delayed() || + cpu_is_offline(cpu)) goto out; } Index: linux-2.6.25.8-rt7/include/asm-powerpc/thread_info.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-powerpc/thread_info.h 2008-06-23 19:12:54.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-powerpc/thread_info.h 2008-06-23 19:14:07.000000000 -0400 @@ -121,9 +121,12 @@ static inline struct thread_info *curren #define TIF_RESTOREALL 11 /* Restore all regs (implies NOERROR) */ #define TIF_NOERROR 12 /* Force successful syscall return */ #define TIF_RESTORE_SIGMASK 13 /* Restore signal mask in do_signal */ -#define TIF_FREEZE 14 /* Freezing for suspend */ -#define TIF_RUNLATCH 15 /* Is the runlatch enabled? 
*/ -#define TIF_ABI_PENDING 16 /* 32/64 bit switch needed */ +#define TIF_NEED_RESCHED_DELAYED \ + 14 /* reschedule on return to userspace */ +#define TIF_FREEZE 15 /* Freezing for suspend */ +#define TIF_RUNLATCH 16 /* Is the runlatch enabled? */ +#define TIF_ABI_PENDING 17 /* 32/64 bit switch needed */ + /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1< /* + * On !PREEMPT_RT, raw irq flags are used as a per-CPU locking + * mechanism. + * + * On PREEMPT_RT, we use per-CPU locks for this. That's why the + * calling convention is changed slightly: a new 'flags' argument + * is passed to 'irq disable/enable' - the PREEMPT_RT code stores + * the CPU number of the lock there. + */ +#ifndef CONFIG_PREEMPT_RT +# define slab_irq_disable(cpu) \ + do { local_irq_disable(); (cpu) = smp_processor_id(); } while (0) +# define slab_irq_enable(cpu) local_irq_enable() +# define slab_irq_save(flags, cpu) \ + do { local_irq_save(flags); (cpu) = smp_processor_id(); } while (0) +# define slab_irq_restore(flags, cpu) local_irq_restore(flags) +/* + * In the __GFP_WAIT case we enable/disable interrupts on !PREEMPT_RT, + * which has no per-CPU locking effect since we are holding the cache + * lock in that case already. + * + * (On PREEMPT_RT, these are NOPs, but we have to drop/get the irq locks.) + */ +# define slab_irq_disable_nort(cpu) slab_irq_disable(cpu) +# define slab_irq_enable_nort(cpu) slab_irq_enable(cpu) +# define slab_irq_disable_rt(flags) do { (void)(flags); } while (0) +# define slab_irq_enable_rt(flags) do { (void)(flags); } while (0) +# define slab_spin_lock_irq(lock, cpu) \ + do { spin_lock_irq(lock); (cpu) = smp_processor_id(); } while (0) +# define slab_spin_unlock_irq(lock, cpu) \ + spin_unlock_irq(lock) +# define slab_spin_lock_irqsave(lock, flags, cpu) \ + do { spin_lock_irqsave(lock, flags); (cpu) = smp_processor_id(); } while (0) +# define slab_spin_unlock_irqrestore(lock, flags, cpu) \ + do { spin_unlock_irqrestore(lock, flags); } while (0) +#else +DEFINE_PER_CPU_LOCKED(int, slab_irq_locks) = { 0, }; +# define slab_irq_disable(cpu) (void)get_cpu_var_locked(slab_irq_locks, &(cpu)) +# define slab_irq_enable(cpu) put_cpu_var_locked(slab_irq_locks, cpu) +# define slab_irq_save(flags, cpu) \ + do { slab_irq_disable(cpu); (void) (flags); } while (0) +# define slab_irq_restore(flags, cpu) \ + do { slab_irq_enable(cpu); (void) (flags); } while (0) +# define slab_irq_disable_rt(cpu) slab_irq_disable(cpu) +# define slab_irq_enable_rt(cpu) slab_irq_enable(cpu) +# define slab_irq_disable_nort(cpu) do { } while (0) +# define slab_irq_enable_nort(cpu) do { } while (0) +# define slab_spin_lock_irq(lock, cpu) \ + do { slab_irq_disable(cpu); spin_lock(lock); } while (0) +# define slab_spin_unlock_irq(lock, cpu) \ + do { spin_unlock(lock); slab_irq_enable(cpu); } while (0) +# define slab_spin_lock_irqsave(lock, flags, cpu) \ + do { slab_irq_disable(cpu); spin_lock_irqsave(lock, flags); } while (0) +# define slab_spin_unlock_irqrestore(lock, flags, cpu) \ + do { spin_unlock_irqrestore(lock, flags); slab_irq_enable(cpu); } while (0) +#endif + +/* * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). 
* @@ -313,7 +370,7 @@ struct kmem_list3 __initdata initkmem_li static int drain_freelist(struct kmem_cache *cache, struct kmem_list3 *l3, int tofree); static void free_block(struct kmem_cache *cachep, void **objpp, int len, - int node); + int node, int *this_cpu); static int enable_cpucache(struct kmem_cache *cachep); static void cache_reap(struct work_struct *unused); @@ -756,9 +813,10 @@ int slab_is_available(void) static DEFINE_PER_CPU(struct delayed_work, reap_work); -static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) +static inline struct array_cache * +cpu_cache_get(struct kmem_cache *cachep, int this_cpu) { - return cachep->array[smp_processor_id()]; + return cachep->array[this_cpu]; } static inline struct kmem_cache *__find_general_cachep(size_t size, @@ -992,7 +1050,7 @@ static int transfer_objects(struct array #ifndef CONFIG_NUMA #define drain_alien_cache(cachep, alien) do { } while (0) -#define reap_alien(cachep, l3) do { } while (0) +#define reap_alien(cachep, l3, this_cpu) 0 static inline struct array_cache **alloc_alien_cache(int node, int limit) { @@ -1003,27 +1061,29 @@ static inline void free_alien_cache(stru { } -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +static inline int +cache_free_alien(struct kmem_cache *cachep, void *objp, int *this_cpu) { return 0; } static inline void *alternate_node_alloc(struct kmem_cache *cachep, - gfp_t flags) + gfp_t flags, int *this_cpu) { return NULL; } static inline void *____cache_alloc_node(struct kmem_cache *cachep, - gfp_t flags, int nodeid) + gfp_t flags, int nodeid, int *this_cpu) { return NULL; } #else /* CONFIG_NUMA */ -static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); -static void *alternate_node_alloc(struct kmem_cache *, gfp_t); +static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid, int *this_cpu); +static void *alternate_node_alloc(struct kmem_cache *, gfp_t, int *); static struct array_cache **alloc_alien_cache(int node, int limit) { @@ -1064,7 +1124,8 @@ static void free_alien_cache(struct arra } static void __drain_alien_cache(struct kmem_cache *cachep, - struct array_cache *ac, int node) + struct array_cache *ac, int node, + int *this_cpu) { struct kmem_list3 *rl3 = cachep->nodelists[node]; @@ -1078,7 +1139,7 @@ static void __drain_alien_cache(struct k if (rl3->shared) transfer_objects(rl3->shared, ac, ac->limit); - free_block(cachep, ac->entry, ac->avail, node); + free_block(cachep, ac->entry, ac->avail, node, this_cpu); ac->avail = 0; spin_unlock(&rl3->list_lock); } @@ -1087,38 +1148,42 @@ static void __drain_alien_cache(struct k /* * Called from cache_reap() to regularly drain alien caches round robin. 
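The slab_irq_*() wrappers defined above give the slab hot paths one calling convention for both configurations: plain local_irq_*() plus smp_processor_id() on !PREEMPT_RT, and the per-CPU slab_irq_locks lock (get_cpu_var_locked()/put_cpu_var_locked()) on PREEMPT_RT. That is why a 'this_cpu' parameter is threaded through the helpers in the hunks that follow: the CPU index comes from the lock, never from a fresh smp_processor_id() call. A minimal caller sketch, mirroring the kmem_cache_free() hunk further down:

static void slab_irq_usage_sketch(struct kmem_cache *cachep, void *objp)
{
        unsigned long flags;
        int this_cpu;

        slab_irq_save(flags, this_cpu);         /* irqs off on !RT, per-CPU lock on RT */
        __cache_free(cachep, objp, &this_cpu);  /* operate on the CPU the lock handed us */
        slab_irq_restore(flags, this_cpu);
}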
*/ -static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) +static int +reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3, int *this_cpu) { - int node = __get_cpu_var(reap_node); + int node = per_cpu(reap_node, *this_cpu); if (l3->alien) { struct array_cache *ac = l3->alien[node]; if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { - __drain_alien_cache(cachep, ac, node); + __drain_alien_cache(cachep, ac, node, this_cpu); spin_unlock_irq(&ac->lock); + return 1; } } + return 0; } static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) { - int i = 0; + int i = 0, this_cpu; struct array_cache *ac; unsigned long flags; for_each_online_node(i) { ac = alien[i]; if (ac) { - spin_lock_irqsave(&ac->lock, flags); - __drain_alien_cache(cachep, ac, i); - spin_unlock_irqrestore(&ac->lock, flags); + slab_spin_lock_irqsave(&ac->lock, flags, this_cpu); + __drain_alien_cache(cachep, ac, i, &this_cpu); + slab_spin_unlock_irqrestore(&ac->lock, flags, this_cpu); } } } -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +static inline int +cache_free_alien(struct kmem_cache *cachep, void *objp, int *this_cpu) { struct slab *slabp = virt_to_slab(objp); int nodeid = slabp->nodeid; @@ -1126,7 +1191,7 @@ static inline int cache_free_alien(struc struct array_cache *alien = NULL; int node; - node = numa_node_id(); + node = cpu_to_node(*this_cpu); /* * Make sure we are not freeing a object from another node to the array @@ -1142,13 +1207,13 @@ static inline int cache_free_alien(struc spin_lock(&alien->lock); if (unlikely(alien->avail == alien->limit)) { STATS_INC_ACOVERFLOW(cachep); - __drain_alien_cache(cachep, alien, nodeid); + __drain_alien_cache(cachep, alien, nodeid, this_cpu); } alien->entry[alien->avail++] = objp; spin_unlock(&alien->lock); } else { spin_lock(&(cachep->nodelists[nodeid])->list_lock); - free_block(cachep, &objp, 1, nodeid); + free_block(cachep, &objp, 1, nodeid, this_cpu); spin_unlock(&(cachep->nodelists[nodeid])->list_lock); } return 1; @@ -1165,6 +1230,7 @@ static void __cpuinit cpuup_canceled(lon struct array_cache *nc; struct array_cache *shared; struct array_cache **alien; + int this_cpu; cpumask_t mask; mask = node_to_cpumask(node); @@ -1176,29 +1242,31 @@ static void __cpuinit cpuup_canceled(lon if (!l3) goto free_array_cache; - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); /* Free limit for this kmem_list3 */ l3->free_limit -= cachep->batchcount; if (nc) - free_block(cachep, nc->entry, nc->avail, node); + free_block(cachep, nc->entry, nc->avail, node, + &this_cpu); if (!cpus_empty(mask)) { - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, + this_cpu); goto free_array_cache; } shared = l3->shared; if (shared) { free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &this_cpu); l3->shared = NULL; } alien = l3->alien; l3->alien = NULL; - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); kfree(shared); if (alien) { @@ -1227,6 +1295,7 @@ static int __cpuinit cpuup_prepare(long struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); const int memsize = sizeof(struct kmem_list3); + int this_cpu; /* * We need to do this right in the beginning since @@ -1257,11 +1326,11 @@ static int __cpuinit cpuup_prepare(long cachep->nodelists[node] = l3; } - spin_lock_irq(&cachep->nodelists[node]->list_lock); + slab_spin_lock_irq(&cachep->nodelists[node]->list_lock, this_cpu); 
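These hunks replace numa_node_id() with cpu_to_node(*this_cpu) because, once the fast path is protected by a sleeping per-CPU lock rather than disabled interrupts, the task is not guaranteed to still be executing on the CPU whose slab data it holds; the node has to be derived from the locked CPU. Condensed from the cache_free_alien() hunk above (struct slab and cpu_to_node() as in mainline; the helper name is illustrative):

static int obj_is_on_local_node(struct slab *slabp, int *this_cpu)
{
        /* compare against the node of the locked CPU, not numa_node_id() */
        return slabp->nodeid == cpu_to_node(*this_cpu);
}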
cachep->nodelists[node]->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->nodelists[node]->list_lock); + slab_spin_unlock_irq(&cachep->nodelists[node]->list_lock, this_cpu); } /* @@ -1298,7 +1367,7 @@ static int __cpuinit cpuup_prepare(long l3 = cachep->nodelists[node]; BUG_ON(!l3); - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); if (!l3->shared) { /* * We are serialised from CPU_DEAD or @@ -1313,7 +1382,7 @@ static int __cpuinit cpuup_prepare(long alien = NULL; } #endif - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); kfree(shared); free_alien_cache(alien); } @@ -1390,11 +1459,13 @@ static void init_list(struct kmem_cache int nodeid) { struct kmem_list3 *ptr; + int this_cpu; ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); BUG_ON(!ptr); - local_irq_disable(); + WARN_ON(spin_is_locked(&list->list_lock)); + slab_irq_disable(this_cpu); memcpy(ptr, list, sizeof(struct kmem_list3)); /* * Do not assume that spinlocks can be initialized via memcpy: @@ -1403,7 +1474,7 @@ static void init_list(struct kmem_cache MAKE_ALL_LISTS(cachep, ptr, nodeid); cachep->nodelists[nodeid] = ptr; - local_irq_enable(); + slab_irq_enable(this_cpu); } /* @@ -1566,36 +1637,34 @@ void __init kmem_cache_init(void) /* 4) Replace the bootstrap head arrays */ { struct array_cache *ptr; + int this_cpu; ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); - local_irq_disable(); - BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); - memcpy(ptr, cpu_cache_get(&cache_cache), - sizeof(struct arraycache_init)); + slab_irq_disable(this_cpu); + BUG_ON(cpu_cache_get(&cache_cache, this_cpu) != &initarray_cache.cache); + memcpy(ptr, cpu_cache_get(&cache_cache, this_cpu), + sizeof(struct arraycache_init)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->lock); - - cache_cache.array[smp_processor_id()] = ptr; - local_irq_enable(); + cache_cache.array[this_cpu] = ptr; + slab_irq_enable(this_cpu); ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); - local_irq_disable(); - BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) - != &initarray_generic.cache); - memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), - sizeof(struct arraycache_init)); + slab_irq_disable(this_cpu); + BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, this_cpu) + != &initarray_generic.cache); + memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, this_cpu), + sizeof(struct arraycache_init)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->lock); - - malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = - ptr; - local_irq_enable(); + malloc_sizes[INDEX_AC].cs_cachep->array[this_cpu] = ptr; + slab_irq_enable(this_cpu); } /* 5) Replace the bootstrap kmem_list3's */ { @@ -1747,7 +1816,7 @@ static void store_stackinfo(struct kmem_ *addr++ = 0x12345678; *addr++ = caller; - *addr++ = smp_processor_id(); + *addr++ = raw_smp_processor_id(); size -= 3 * sizeof(unsigned long); { unsigned long *sptr = &caller; @@ -1902,7 +1971,11 @@ static void check_poison_obj(struct kmem } #endif +static void +__cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu); + #if DEBUG + /** * slab_destroy_objs - destroy a slab and its objects * @cachep: cache pointer being destroyed @@ -1911,7 +1984,8 @@ static void check_poison_obj(struct kmem * Call the registered destructor for each object in a slab that is being 
* destroyed. */ -static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) +static void +slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) { int i; for (i = 0; i < cachep->num; i++) { @@ -1954,7 +2028,8 @@ static void slab_destroy_objs(struct kme * Before calling the slab must have been unlinked from the cache. The * cache-lock is not held/needed. */ -static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) +static void +slab_destroy(struct kmem_cache *cachep, struct slab *slabp, int *this_cpu) { void *addr = slabp->s_mem - slabp->colouroff; @@ -1968,8 +2043,12 @@ static void slab_destroy(struct kmem_cac call_rcu(&slab_rcu->head, kmem_rcu_free); } else { kmem_freepages(cachep, addr); - if (OFF_SLAB(cachep)) - kmem_cache_free(cachep->slabp_cache, slabp); + if (OFF_SLAB(cachep)) { + if (this_cpu) + __cache_free(cachep->slabp_cache, slabp, this_cpu); + else + kmem_cache_free(cachep->slabp_cache, slabp); + } } } @@ -2066,6 +2145,8 @@ static size_t calculate_slab_order(struc static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) { + int this_cpu; + if (g_cpucache_up == FULL) return enable_cpucache(cachep); @@ -2109,10 +2190,12 @@ static int __init_refok setup_cpu_cache( jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - cpu_cache_get(cachep)->avail = 0; - cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; - cpu_cache_get(cachep)->batchcount = 1; - cpu_cache_get(cachep)->touched = 0; + this_cpu = raw_smp_processor_id(); + + cpu_cache_get(cachep, this_cpu)->avail = 0; + cpu_cache_get(cachep, this_cpu)->limit = BOOT_CPUCACHE_ENTRIES; + cpu_cache_get(cachep, this_cpu)->batchcount = 1; + cpu_cache_get(cachep, this_cpu)->touched = 0; cachep->batchcount = 1; cachep->limit = BOOT_CPUCACHE_ENTRIES; return 0; @@ -2402,19 +2485,19 @@ EXPORT_SYMBOL(kmem_cache_create); #if DEBUG static void check_irq_off(void) { +/* + * On PREEMPT_RT we use locks to protect the per-CPU lists, + * and keep interrupts enabled. 
+ */ +#ifndef CONFIG_PREEMPT_RT BUG_ON(!irqs_disabled()); +#endif } static void check_irq_on(void) { +#ifndef CONFIG_PREEMPT_RT BUG_ON(irqs_disabled()); -} - -static void check_spinlock_acquired(struct kmem_cache *cachep) -{ -#ifdef CONFIG_SMP - check_irq_off(); - assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); #endif } @@ -2429,34 +2512,67 @@ static void check_spinlock_acquired_node #else #define check_irq_off() do { } while(0) #define check_irq_on() do { } while(0) -#define check_spinlock_acquired(x) do { } while(0) #define check_spinlock_acquired_node(x, y) do { } while(0) #endif -static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +static int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, struct array_cache *ac, int force, int node); -static void do_drain(void *arg) +static void __do_drain(void *arg, int this_cpu) { struct kmem_cache *cachep = arg; + int node = cpu_to_node(this_cpu); struct array_cache *ac; - int node = numa_node_id(); check_irq_off(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, this_cpu); spin_lock(&cachep->nodelists[node]->list_lock); - free_block(cachep, ac->entry, ac->avail, node); + free_block(cachep, ac->entry, ac->avail, node, &this_cpu); spin_unlock(&cachep->nodelists[node]->list_lock); ac->avail = 0; } +#ifdef CONFIG_PREEMPT_RT +static void do_drain(void *arg, int this_cpu) +{ + __do_drain(arg, this_cpu); +} +#else +static void do_drain(void *arg) +{ + __do_drain(arg, smp_processor_id()); +} +#endif + +#ifdef CONFIG_PREEMPT_RT +/* + * execute func() for all CPUs. On PREEMPT_RT we dont actually have + * to run on the remote CPUs - we only have to take their CPU-locks. + * (This is a rare operation, so cacheline bouncing is not an issue.) + */ +static void +slab_on_each_cpu(void (*func)(void *arg, int this_cpu), void *arg) +{ + unsigned int i; + + check_irq_on(); + for_each_online_cpu(i) { + spin_lock(&__get_cpu_lock(slab_irq_locks, i)); + func(arg, i); + spin_unlock(&__get_cpu_lock(slab_irq_locks, i)); + } +} +#else +# define slab_on_each_cpu(func, cachep) on_each_cpu(func, cachep, 1, 1) +#endif + static void drain_cpu_caches(struct kmem_cache *cachep) { struct kmem_list3 *l3; int node; - on_each_cpu(do_drain, cachep, 1, 1); + slab_on_each_cpu(do_drain, cachep); check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; @@ -2481,16 +2597,16 @@ static int drain_freelist(struct kmem_ca struct kmem_list3 *l3, int tofree) { struct list_head *p; - int nr_freed; + int nr_freed, this_cpu; struct slab *slabp; nr_freed = 0; while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); p = l3->slabs_free.prev; if (p == &l3->slabs_free) { - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); goto out; } @@ -2499,13 +2615,9 @@ static int drain_freelist(struct kmem_ca BUG_ON(slabp->inuse); #endif list_del(&slabp->list); - /* - * Safe to drop the lock. The slab is no longer linked - * to the cache. - */ l3->free_objects -= cache->num; - spin_unlock_irq(&l3->list_lock); - slab_destroy(cache, slabp); + slab_destroy(cache, slabp, &this_cpu); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); nr_freed++; } out: @@ -2761,8 +2873,8 @@ static void slab_map_pages(struct kmem_c * Grow (by 1) the number of slabs within a cache. This is called by * kmem_cache_alloc() when there are no active objs left in a cache. 
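slab_on_each_cpu() above avoids IPIs on PREEMPT_RT: it takes each CPU's slab_irq_locks lock in turn and runs the callback locally, so the callback receives the target CPU as an explicit argument, while the !PREEMPT_RT build keeps the one-argument on_each_cpu() signature. A hypothetical new per-CPU operation would follow the same split as do_drain()/__do_drain(); do_flush/__do_flush below are illustrative only, not part of the patch:

static void __do_flush(void *arg, int this_cpu)
{
        struct kmem_cache *cachep = arg;
        struct array_cache *ac = cpu_cache_get(cachep, this_cpu);

        ac->touched = 0;                /* work on the CPU we were handed */
}

#ifdef CONFIG_PREEMPT_RT
static void do_flush(void *arg, int this_cpu)
{
        __do_flush(arg, this_cpu);
}
#else
static void do_flush(void *arg)
{
        __do_flush(arg, smp_processor_id());
}
#endif

/* callers then use: slab_on_each_cpu(do_flush, cachep); */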
*/ -static int cache_grow(struct kmem_cache *cachep, - gfp_t flags, int nodeid, void *objp) +static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid, + void *objp, int *this_cpu) { struct slab *slabp; size_t offset; @@ -2791,7 +2903,8 @@ static int cache_grow(struct kmem_cache offset *= cachep->colour_off; if (local_flags & __GFP_WAIT) - local_irq_enable(); + slab_irq_enable_nort(*this_cpu); + slab_irq_enable_rt(*this_cpu); /* * The test for missing atomic flag is performed here, rather than @@ -2820,8 +2933,10 @@ static int cache_grow(struct kmem_cache cache_init_objs(cachep, slabp); + slab_irq_disable_rt(*this_cpu); if (local_flags & __GFP_WAIT) - local_irq_disable(); + slab_irq_disable_nort(*this_cpu); + check_irq_off(); spin_lock(&l3->list_lock); @@ -2834,8 +2949,9 @@ static int cache_grow(struct kmem_cache opps1: kmem_freepages(cachep, objp); failed: + slab_irq_disable_rt(*this_cpu); if (local_flags & __GFP_WAIT) - local_irq_disable(); + slab_irq_disable_nort(*this_cpu); return 0; } @@ -2957,7 +3073,8 @@ bad: #define check_slabp(x,y) do { } while(0) #endif -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) +static void * +cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) { int batchcount; struct kmem_list3 *l3; @@ -2967,7 +3084,7 @@ static void *cache_alloc_refill(struct k retry: check_irq_off(); node = numa_node_id(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, *this_cpu); batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { /* @@ -2977,7 +3094,7 @@ retry: */ batchcount = BATCHREFILL_LIMIT; } - l3 = cachep->nodelists[node]; + l3 = cachep->nodelists[cpu_to_node(*this_cpu)]; BUG_ON(ac->avail > 0 || !l3); spin_lock(&l3->list_lock); @@ -3000,7 +3117,7 @@ retry: slabp = list_entry(entry, struct slab, list); check_slabp(cachep, slabp); - check_spinlock_acquired(cachep); + check_spinlock_acquired_node(cachep, cpu_to_node(*this_cpu)); /* * The slab was either on partial or free list so @@ -3014,8 +3131,9 @@ retry: STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, - node); + ac->entry[ac->avail++] = + slab_get_obj(cachep, slabp, + cpu_to_node(*this_cpu)); } check_slabp(cachep, slabp); @@ -3034,10 +3152,10 @@ alloc_done: if (unlikely(!ac->avail)) { int x; - x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); + x = cache_grow(cachep, flags | GFP_THISNODE, cpu_to_node(*this_cpu), NULL, this_cpu); /* cache_grow can reenable interrupts, then ac could change. */ - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, *this_cpu); if (!x && ac->avail == 0) /* no objects in sight? 
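cache_grow() above drops the slab protection around the page allocation: on !PREEMPT_RT interrupts are re-enabled only for __GFP_WAIT callers (slab_irq_enable_nort()), on PREEMPT_RT the per-CPU lock is dropped unconditionally (slab_irq_enable_rt()) because the allocation may sleep either way. The critical section reduces to the following shape (sketch; error paths omitted, kmem_getpages() as used in the fallback_alloc() hunk below):

static void *grow_alloc_pages_sketch(struct kmem_cache *cachep, gfp_t local_flags,
                                     int nodeid, int *this_cpu)
{
        void *objp;

        if (local_flags & __GFP_WAIT)
                slab_irq_enable_nort(*this_cpu);        /* !RT: irqs on for a sleeping alloc */
        slab_irq_enable_rt(*this_cpu);                  /* RT: always drop the per-CPU lock */

        objp = kmem_getpages(cachep, local_flags, nodeid);

        slab_irq_disable_rt(*this_cpu);                 /* RT: retake the per-CPU lock */
        if (local_flags & __GFP_WAIT)
                slab_irq_disable_nort(*this_cpu);

        return objp;
}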
abort */ return NULL; @@ -3189,21 +3307,22 @@ static inline int should_failslab(struct #endif /* CONFIG_FAILSLAB */ -static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) +static inline void * +____cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) { void *objp; struct array_cache *ac; check_irq_off(); - ac = cpu_cache_get(cachep); + ac = cpu_cache_get(cachep, *this_cpu); if (likely(ac->avail)) { STATS_INC_ALLOCHIT(cachep); ac->touched = 1; objp = ac->entry[--ac->avail]; } else { STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags); + objp = cache_alloc_refill(cachep, flags, this_cpu); } return objp; } @@ -3215,7 +3334,8 @@ static inline void *____cache_alloc(stru * If we are in_interrupt, then process context, including cpusets and * mempolicy, may not apply and should not be used for allocation policy. */ -static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) +static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags, + int *this_cpu) { int nid_alloc, nid_here; @@ -3227,7 +3347,7 @@ static void *alternate_node_alloc(struct else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); if (nid_alloc != nid_here) - return ____cache_alloc_node(cachep, flags, nid_alloc); + return ____cache_alloc_node(cachep, flags, nid_alloc, this_cpu); return NULL; } @@ -3239,7 +3359,7 @@ static void *alternate_node_alloc(struct * allocator to do its reclaim / fallback magic. We then insert the * slab into the proper nodelist and then allocate from it. */ -static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) +static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu) { struct zonelist *zonelist; gfp_t local_flags; @@ -3265,8 +3385,10 @@ retry: if (cpuset_zone_allowed_hardwall(*z, flags) && cache->nodelists[nid] && cache->nodelists[nid]->free_objects) - obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + + obj = ____cache_alloc_node(cache, + flags | GFP_THISNODE, nid, + this_cpu); } if (!obj) { @@ -3277,19 +3399,24 @@ retry: * set and go into memory reserves if necessary. 
*/ if (local_flags & __GFP_WAIT) - local_irq_enable(); + slab_irq_enable_nort(*this_cpu); + slab_irq_enable_rt(*this_cpu); + kmem_flagcheck(cache, flags); obj = kmem_getpages(cache, local_flags, -1); + + slab_irq_disable_rt(*this_cpu); if (local_flags & __GFP_WAIT) - local_irq_disable(); + slab_irq_disable_nort(*this_cpu); + if (obj) { /* * Insert into the appropriate per node queues */ nid = page_to_nid(virt_to_page(obj)); - if (cache_grow(cache, flags, nid, obj)) { + if (cache_grow(cache, flags, nid, obj, this_cpu)) { obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + flags | GFP_THISNODE, nid, this_cpu); if (!obj) /* * Another processor may allocate the @@ -3310,7 +3437,7 @@ retry: * A interface to enable slab creation on nodeid */ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, - int nodeid) + int nodeid, int *this_cpu) { struct list_head *entry; struct slab *slabp; @@ -3358,11 +3485,11 @@ retry: must_grow: spin_unlock(&l3->list_lock); - x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); + x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL, this_cpu); if (x) goto retry; - return fallback_alloc(cachep, flags); + return fallback_alloc(cachep, flags, this_cpu); done: return obj; @@ -3384,39 +3511,41 @@ static __always_inline void * __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, void *caller) { - unsigned long save_flags; + unsigned long irqflags; + int this_cpu; void *ptr; if (should_failslab(cachep, flags)) return NULL; cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); + + slab_irq_save(irqflags, this_cpu); if (unlikely(nodeid == -1)) - nodeid = numa_node_id(); + nodeid = cpu_to_node(this_cpu); if (unlikely(!cachep->nodelists[nodeid])) { /* Node not bootstrapped yet */ - ptr = fallback_alloc(cachep, flags); + ptr = fallback_alloc(cachep, flags, &this_cpu); goto out; } - if (nodeid == numa_node_id()) { + if (nodeid == cpu_to_node(this_cpu)) { /* * Use the locally cached objects if possible. * However ____cache_alloc does not allow fallback * to other nodes. It may fail while we still have * objects on other nodes available. */ - ptr = ____cache_alloc(cachep, flags); + ptr = ____cache_alloc(cachep, flags, &this_cpu); if (ptr) goto out; } /* ___cache_alloc_node can fall back to other nodes */ - ptr = ____cache_alloc_node(cachep, flags, nodeid); + ptr = ____cache_alloc_node(cachep, flags, nodeid, &this_cpu); out: - local_irq_restore(save_flags); + slab_irq_restore(irqflags, this_cpu); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); if (unlikely((flags & __GFP_ZERO) && ptr)) @@ -3426,33 +3555,33 @@ __cache_alloc_node(struct kmem_cache *ca } static __always_inline void * -__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) +__do_cache_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu) { void *objp; if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { - objp = alternate_node_alloc(cache, flags); + objp = alternate_node_alloc(cache, flags, this_cpu); if (objp) goto out; } - objp = ____cache_alloc(cache, flags); + objp = ____cache_alloc(cache, flags, this_cpu); /* * We may just have run out of memory on the local node. 
* ____cache_alloc_node() knows how to locate memory on other nodes */ - if (!objp) - objp = ____cache_alloc_node(cache, flags, numa_node_id()); - + if (!objp) + objp = ____cache_alloc_node(cache, flags, + cpu_to_node(*this_cpu), this_cpu); out: return objp; } #else static __always_inline void * -__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) { - return ____cache_alloc(cachep, flags); + return ____cache_alloc(cachep, flags, this_cpu); } #endif /* CONFIG_NUMA */ @@ -3461,15 +3590,16 @@ static __always_inline void * __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) { unsigned long save_flags; + int this_cpu; void *objp; if (should_failslab(cachep, flags)) return NULL; cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); - objp = __do_cache_alloc(cachep, flags); - local_irq_restore(save_flags); + slab_irq_save(save_flags, this_cpu); + objp = __do_cache_alloc(cachep, flags, &this_cpu); + slab_irq_restore(save_flags, this_cpu); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); @@ -3483,7 +3613,7 @@ __cache_alloc(struct kmem_cache *cachep, * Caller needs to acquire correct kmem_list's list_lock */ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, - int node) + int node, int *this_cpu) { int i; struct kmem_list3 *l3; @@ -3512,7 +3642,7 @@ static void free_block(struct kmem_cache * a different cache, refer to comments before * alloc_slabmgmt. */ - slab_destroy(cachep, slabp); + slab_destroy(cachep, slabp, this_cpu); } else { list_add(&slabp->list, &l3->slabs_free); } @@ -3526,11 +3656,12 @@ static void free_block(struct kmem_cache } } -static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) +static void +cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac, int *this_cpu) { int batchcount; struct kmem_list3 *l3; - int node = numa_node_id(); + int node = cpu_to_node(*this_cpu); batchcount = ac->batchcount; #if DEBUG @@ -3552,7 +3683,7 @@ static void cache_flusharray(struct kmem } } - free_block(cachep, ac->entry, batchcount, node); + free_block(cachep, ac->entry, batchcount, node, this_cpu); free_done: #if STATS { @@ -3581,9 +3712,9 @@ free_done: * Release an obj back to its cache. If the obj has a constructed state, it must * be in this state _before_ it is released. Called with disabled ints. */ -static inline void __cache_free(struct kmem_cache *cachep, void *objp) +static void __cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu) { - struct array_cache *ac = cpu_cache_get(cachep); + struct array_cache *ac = cpu_cache_get(cachep, *this_cpu); check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); @@ -3595,7 +3726,7 @@ static inline void __cache_free(struct k * variable to skip the call, which is mostly likely to be present in * the cache. 
*/ - if (numa_platform && cache_free_alien(cachep, objp)) + if (numa_platform && cache_free_alien(cachep, objp, this_cpu)) return; if (likely(ac->avail < ac->limit)) { @@ -3604,7 +3735,7 @@ static inline void __cache_free(struct k return; } else { STATS_INC_FREEMISS(cachep); - cache_flusharray(cachep, ac); + cache_flusharray(cachep, ac, this_cpu); ac->entry[ac->avail++] = objp; } } @@ -3761,11 +3892,12 @@ EXPORT_SYMBOL(__kmalloc); void kmem_cache_free(struct kmem_cache *cachep, void *objp) { unsigned long flags; + int this_cpu; - local_irq_save(flags); + slab_irq_save(flags, this_cpu); debug_check_no_locks_freed(objp, obj_size(cachep)); - __cache_free(cachep, objp); - local_irq_restore(flags); + __cache_free(cachep, objp, &this_cpu); + slab_irq_restore(flags, this_cpu); } EXPORT_SYMBOL(kmem_cache_free); @@ -3782,15 +3914,16 @@ void kfree(const void *objp) { struct kmem_cache *c; unsigned long flags; + int this_cpu; if (unlikely(ZERO_OR_NULL_PTR(objp))) return; - local_irq_save(flags); + slab_irq_save(flags, this_cpu); kfree_debugcheck(objp); c = virt_to_cache(objp); debug_check_no_locks_freed(objp, obj_size(c)); - __cache_free(c, (void *)objp); - local_irq_restore(flags); + __cache_free(c, (void *)objp, &this_cpu); + slab_irq_restore(flags, this_cpu); } EXPORT_SYMBOL(kfree); @@ -3811,7 +3944,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name); */ static int alloc_kmemlist(struct kmem_cache *cachep) { - int node; + int node, this_cpu; struct kmem_list3 *l3; struct array_cache *new_shared; struct array_cache **new_alien = NULL; @@ -3839,11 +3972,11 @@ static int alloc_kmemlist(struct kmem_ca if (l3) { struct array_cache *shared = l3->shared; - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); if (shared) free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &this_cpu); l3->shared = new_shared; if (!l3->alien) { @@ -3852,7 +3985,7 @@ static int alloc_kmemlist(struct kmem_ca } l3->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); kfree(shared); free_alien_cache(new_alien); continue; @@ -3899,42 +4032,50 @@ struct ccupdate_struct { struct array_cache *new[NR_CPUS]; }; -static void do_ccupdate_local(void *info) +static void __do_ccupdate_local(void *info, int this_cpu) { struct ccupdate_struct *new = info; struct array_cache *old; check_irq_off(); - old = cpu_cache_get(new->cachep); + old = cpu_cache_get(new->cachep, this_cpu); + + new->cachep->array[this_cpu] = new->new[this_cpu]; + new->new[this_cpu] = old; +} - new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; - new->new[smp_processor_id()] = old; +#ifdef CONFIG_PREEMPT_RT +static void do_ccupdate_local(void *arg, int this_cpu) +{ + __do_ccupdate_local(arg, this_cpu); } +#else +static void do_ccupdate_local(void *arg) +{ + __do_ccupdate_local(arg, smp_processor_id()); +} +#endif /* Always called with the cache_chain_mutex held */ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared) { - struct ccupdate_struct *new; - int i; - - new = kzalloc(sizeof(*new), GFP_KERNEL); - if (!new) - return -ENOMEM; + struct ccupdate_struct new; + int i, this_cpu; + memset(&new.new, 0, sizeof(new.new)); for_each_online_cpu(i) { - new->new[i] = alloc_arraycache(cpu_to_node(i), limit, + new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount); - if (!new->new[i]) { + if (!new.new[i]) { for (i--; i >= 0; i--) - kfree(new->new[i]); - kfree(new); + 
kfree(new.new[i]); return -ENOMEM; } } - new->cachep = cachep; + new.cachep = cachep; - on_each_cpu(do_ccupdate_local, (void *)new, 1, 1); + slab_on_each_cpu(do_ccupdate_local, (void *)&new); check_irq_on(); cachep->batchcount = batchcount; @@ -3942,15 +4083,15 @@ static int do_tune_cpucache(struct kmem_ cachep->shared = shared; for_each_online_cpu(i) { - struct array_cache *ccold = new->new[i]; + struct array_cache *ccold = new.new[i]; if (!ccold) continue; - spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); - free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); - spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); + slab_spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock, this_cpu); + free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i), &this_cpu); + slab_spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock, this_cpu); kfree(ccold); } - kfree(new); + return alloc_kmemlist(cachep); } @@ -4012,29 +4153,31 @@ static int enable_cpucache(struct kmem_c * Drain an array if it contains any elements taking the l3 lock only if * necessary. Note that the l3 listlock also protects the array_cache * if drain_array() is used on the shared array. + * returns non-zero if some work is done */ -void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, - struct array_cache *ac, int force, int node) +int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, + struct array_cache *ac, int force, int node) { - int tofree; + int tofree, this_cpu; if (!ac || !ac->avail) - return; + return 0; if (ac->touched && !force) { ac->touched = 0; } else { - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); if (ac->avail) { tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) tofree = (ac->avail + 1) / 2; - free_block(cachep, ac->entry, tofree, node); + free_block(cachep, ac->entry, tofree, node, &this_cpu); ac->avail -= tofree; memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); } - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); } + return 1; } /** @@ -4051,11 +4194,12 @@ void drain_array(struct kmem_cache *cach */ static void cache_reap(struct work_struct *w) { + int this_cpu = raw_smp_processor_id(), node = cpu_to_node(this_cpu); struct kmem_cache *searchp; struct kmem_list3 *l3; - int node = numa_node_id(); struct delayed_work *work = container_of(w, struct delayed_work, work); + int work_done = 0; if (!mutex_trylock(&cache_chain_mutex)) /* Give up. Setup the next iteration. 
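drain_array() and reap_alien() now report whether they actually freed anything; the cache_reap() hunk here sums that into work_done and backs off when a pass was idle. The rescheduling arithmetic reduces to:

static unsigned long reap_delay_sketch(int work_done)
{
        /* work_done > 0: REAPTIMEOUT_CPUC; work_done == 0: 2 * REAPTIMEOUT_CPUC */
        return round_jiffies_relative((1 + !work_done) * REAPTIMEOUT_CPUC);
}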
*/ @@ -4071,9 +4215,12 @@ static void cache_reap(struct work_struc */ l3 = searchp->nodelists[node]; - reap_alien(searchp, l3); + work_done += reap_alien(searchp, l3, &this_cpu); + + node = cpu_to_node(this_cpu); - drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); + work_done += drain_array(searchp, l3, + cpu_cache_get(searchp, this_cpu), 0, node); /* * These are racy checks but it does not matter @@ -4084,7 +4231,7 @@ static void cache_reap(struct work_struc l3->next_reap = jiffies + REAPTIMEOUT_LIST3; - drain_array(searchp, l3, l3->shared, 0, node); + work_done += drain_array(searchp, l3, l3->shared, 0, node); if (l3->free_touched) l3->free_touched = 0; @@ -4103,7 +4250,8 @@ next: next_reap_node(); out: /* Set up the next iteration */ - schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); + schedule_delayed_work(work, + round_jiffies_relative((1+!work_done) * REAPTIMEOUT_CPUC)); } #ifdef CONFIG_SLABINFO @@ -4162,7 +4310,7 @@ static int s_show(struct seq_file *m, vo unsigned long num_slabs, free_objects = 0, shared_avail = 0; const char *name; char *error = NULL; - int node; + int this_cpu, node; struct kmem_list3 *l3; active_objs = 0; @@ -4173,7 +4321,7 @@ static int s_show(struct seq_file *m, vo continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); list_for_each_entry(slabp, &l3->slabs_full, list) { if (slabp->inuse != cachep->num && !error) @@ -4198,7 +4346,7 @@ static int s_show(struct seq_file *m, vo if (l3->shared) shared_avail += l3->shared->avail; - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); } num_slabs += active_slabs; num_objs = num_slabs * cachep->num; @@ -4394,7 +4542,7 @@ static int leaks_show(struct seq_file *m struct kmem_list3 *l3; const char *name; unsigned long *n = m->private; - int node; + int node, this_cpu; int i; if (!(cachep->flags & SLAB_STORE_USER)) @@ -4412,13 +4560,13 @@ static int leaks_show(struct seq_file *m continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + slab_spin_lock_irq(&l3->list_lock, this_cpu); list_for_each_entry(slabp, &l3->slabs_full, list) handle_slab(n, cachep, slabp); list_for_each_entry(slabp, &l3->slabs_partial, list) handle_slab(n, cachep, slabp); - spin_unlock_irq(&l3->list_lock); + slab_spin_unlock_irq(&l3->list_lock, this_cpu); } name = cachep->name; if (n[0] == n[1]) { Index: linux-2.6.25.8-rt7/mm/page_alloc.c =================================================================== --- linux-2.6.25.8-rt7.orig/mm/page_alloc.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/mm/page_alloc.c 2008-06-23 19:14:14.000000000 -0400 @@ -161,6 +161,53 @@ static unsigned long __meminitdata dma_r EXPORT_SYMBOL(movable_zone); #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#ifdef CONFIG_PREEMPT_RT +static DEFINE_PER_CPU_LOCKED(int, pcp_locks); +#endif + +static inline void __lock_cpu_pcp(unsigned long *flags, int cpu) +{ +#ifdef CONFIG_PREEMPT_RT + spin_lock(&__get_cpu_lock(pcp_locks, cpu)); + flags = 0; +#else + local_irq_save(*flags); +#endif +} + +static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu) +{ +#ifdef CONFIG_PREEMPT_RT + (void)get_cpu_var_locked(pcp_locks, this_cpu); + flags = 0; +#else + local_irq_save(*flags); + *this_cpu = smp_processor_id(); +#endif +} + +static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu) +{ +#ifdef CONFIG_PREEMPT_RT + put_cpu_var_locked(pcp_locks, this_cpu); +#else + local_irq_restore(flags); +#endif +} + +static struct per_cpu_pageset * 
+get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu) +{ + lock_cpu_pcp(flags, this_cpu); + return zone_pcp(zone, *this_cpu); +} + +static void +put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu) +{ + unlock_cpu_pcp(flags, this_cpu); +} + #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; EXPORT_SYMBOL(nr_node_ids); @@ -524,8 +571,9 @@ static void free_one_page(struct zone *z static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; - int i; int reserved = 0; + int this_cpu; + int i; for (i = 0 ; i < (1 << order) ; ++i) reserved += free_pages_check(page + i); @@ -537,10 +585,10 @@ static void __free_pages_ok(struct page arch_free_page(page, order); kernel_map_pages(page, 1 << order, 0); - local_irq_save(flags); - __count_vm_events(PGFREE, 1 << order); + lock_cpu_pcp(&flags, &this_cpu); + count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, order); - local_irq_restore(flags); + unlock_cpu_pcp(flags, this_cpu); } /* @@ -888,15 +936,16 @@ void drain_zone_pages(struct zone *zone, { unsigned long flags; int to_drain; + int this_cpu; - local_irq_save(flags); + lock_cpu_pcp(&flags, &this_cpu); if (pcp->count >= pcp->batch) to_drain = pcp->batch; else to_drain = pcp->count; free_pages_bulk(zone, to_drain, &pcp->list, 0); pcp->count -= to_drain; - local_irq_restore(flags); + unlock_cpu_pcp(flags, this_cpu); } #endif @@ -920,12 +969,15 @@ static void drain_pages(unsigned int cpu continue; pset = zone_pcp(zone, cpu); - + if (!pset) { + WARN_ON(1); + continue; + } pcp = &pset->pcp; - local_irq_save(flags); + lock_cpu_pcp(&flags, &cpu); free_pages_bulk(zone, pcp->count, &pcp->list, 0); pcp->count = 0; - local_irq_restore(flags); + unlock_cpu_pcp(flags, cpu); } } @@ -942,7 +994,40 @@ void drain_local_pages(void *arg) */ void drain_all_pages(void) { +#ifdef CONFIG_PREEMPT_RT + /* + * HACK!!!!! + * For RT we can't use IPIs to run drain_local_pages, since + * that code will call spin_locks that will now sleep. + * But, schedule_on_each_cpu will call kzalloc, which will + * call page_alloc which was what calls this. + * + * Luckily, there's a condition to get here, and that is if + * the order passed in to alloc_pages is greater than 0 + * (alloced more than a page size). The slabs only allocate + * what is needed, and the allocation made by schedule_on_each_cpu + * does an alloc of "sizeof(void *)*nr_cpu_ids". + * + * So we can safely call schedule_on_each_cpu if that number + * is less than a page. Otherwise don't bother. At least warn of + * this issue. + * + * And yes, this is one big hack. 
Please fix ;-) + */ + if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE) + schedule_on_each_cpu(drain_local_pages, NULL, 0, 1); + else { + static int once; + if (!once) { + printk(KERN_ERR "Can't drain all CPUS due to possible recursion\n"); + once = 1; + } + drain_local_pages(NULL); + } + +#else on_each_cpu(drain_local_pages, NULL, 0, 1); +#endif } #ifdef CONFIG_HIBERNATION @@ -987,8 +1072,10 @@ void mark_free_pages(struct zone *zone) static void free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); + struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; unsigned long flags; + int this_cpu; if (PageAnon(page)) page->mapping = NULL; @@ -1000,9 +1087,11 @@ static void free_hot_cold_page(struct pa arch_free_page(page, 0); kernel_map_pages(page, 1, 0); - pcp = &zone_pcp(zone, get_cpu())->pcp; - local_irq_save(flags); - __count_vm_event(PGFREE); + pset = get_zone_pcp(zone, &flags, &this_cpu); + pcp = &pset->pcp; + + count_vm_event(PGFREE); + if (cold) list_add_tail(&page->lru, &pcp->list); else @@ -1013,8 +1102,7 @@ static void free_hot_cold_page(struct pa free_pages_bulk(zone, pcp->batch, &pcp->list, 0); pcp->count -= pcp->batch; } - local_irq_restore(flags); - put_cpu(); + put_zone_pcp(zone, flags, this_cpu); } void free_hot_page(struct page *page) @@ -1056,16 +1144,15 @@ static struct page *buffered_rmqueue(str unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); - int cpu; + struct per_cpu_pageset *pset; int migratetype = allocflags_to_migratetype(gfp_flags); + int this_cpu; again: - cpu = get_cpu(); + pset = get_zone_pcp(zone, &flags, &this_cpu); if (likely(order == 0)) { - struct per_cpu_pages *pcp; + struct per_cpu_pages *pcp = &pset->pcp; - pcp = &zone_pcp(zone, cpu)->pcp; - local_irq_save(flags); if (!pcp->count) { pcp->count = rmqueue_bulk(zone, 0, pcp->batch, &pcp->list, migratetype); @@ -1094,7 +1181,7 @@ again: list_del(&page->lru); pcp->count--; } else { - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); page = __rmqueue(zone, order, migratetype); spin_unlock(&zone->lock); if (!page) @@ -1103,8 +1190,7 @@ again: __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(zonelist, zone); - local_irq_restore(flags); - put_cpu(); + put_zone_pcp(zone, flags, this_cpu); VM_BUG_ON(bad_range(zone, page)); if (prep_new_page(page, order, gfp_flags)) @@ -1112,8 +1198,7 @@ again: return page; failed: - local_irq_restore(flags); - put_cpu(); + put_zone_pcp(zone, flags, this_cpu); return NULL; } Index: linux-2.6.25.8-rt7/include/linux/smp.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/smp.h 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/smp.h 2008-06-23 19:14:06.000000000 -0400 @@ -7,6 +7,7 @@ */ #include +#include extern void cpu_idle(void); @@ -33,6 +34,24 @@ extern void smp_send_stop(void); */ extern void smp_send_reschedule(int cpu); +/* + * trigger a reschedule on all other CPUs: + */ +extern void smp_send_reschedule_allbutself(void); + +/* + * trigger a reschedule on all other CPUs: + */ +extern void smp_send_reschedule_allbutself(void); + +#ifdef HAVE_RESCHEDULE_ALLBUTSELF_CPUMASK +extern void smp_send_reschedule_allbutself_cpumask(cpumask_t); +#else +static inline void smp_send_reschedule_allbutself_cpumask(cpumask_t mask) { + smp_send_reschedule_allbutself(); +} +#endif + /* * Prepare machine for booting other CPUs. 
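The page_alloc.c hunks above mirror the slab change: lock_cpu_pcp()/unlock_cpu_pcp() hide either local_irq_save() (!PREEMPT_RT) or the per-CPU pcp_locks spinlock (PREEMPT_RT), and get_zone_pcp()/put_zone_pcp() bundle that with the zone_pcp() lookup so the hot paths no longer need get_cpu()/put_cpu(). Condensed from the free_hot_cold_page() hunk above (sketch; debug checks and the batch flush omitted):

static void free_page_sketch(struct page *page, int cold)
{
        struct zone *zone = page_zone(page);
        struct per_cpu_pageset *pset;
        struct per_cpu_pages *pcp;
        unsigned long flags;
        int this_cpu;

        pset = get_zone_pcp(zone, &flags, &this_cpu);   /* irq-off or per-CPU lock */
        pcp = &pset->pcp;

        if (cold)
                list_add_tail(&page->lru, &pcp->list);
        else
                list_add(&page->lru, &pcp->list);
        pcp->count++;

        put_zone_pcp(zone, flags, this_cpu);
}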
@@ -100,6 +119,8 @@ static inline int up_smp_call_function(v 0; \ }) static inline void smp_send_reschedule(int cpu) { } +static inline void smp_send_reschedule_allbutself(void) { } +static inline void smp_send_reschedule_allbutself_cpumask(cpumask_t mask) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) #define smp_call_function_single(cpuid, func, info, retry, wait) \ @@ -139,7 +160,7 @@ static inline void smp_send_reschedule(i #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) #define put_cpu() preempt_enable() -#define put_cpu_no_resched() preempt_enable_no_resched() +#define put_cpu_no_resched() __preempt_enable_no_resched() void smp_setup_processor_id(void); Index: linux-2.6.25.8-rt7/kernel/stop_machine.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/stop_machine.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/stop_machine.c 2008-06-23 19:13:44.000000000 -0400 @@ -62,7 +62,7 @@ static int stopmachine(void *cpu) /* Yield in first stage: migration threads need to * help our sisters onto their CPUs. */ if (!prepared && !irqs_disabled) - yield(); + __yield(); else cpu_relax(); } @@ -108,7 +108,7 @@ static int stop_machine(void) /* Wait for them all to come to life. */ while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) - yield(); + __yield(); /* If some failed, kill them all. */ if (ret < 0) { @@ -132,7 +132,7 @@ static void restart_machine(void) { stopmachine_set_state(STOPMACHINE_EXIT); local_irq_enable(); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); } struct stop_machine_data Index: linux-2.6.25.8-rt7/net/ipv4/tcp.c =================================================================== --- linux-2.6.25.8-rt7.orig/net/ipv4/tcp.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/net/ipv4/tcp.c 2008-06-23 19:13:26.000000000 -0400 @@ -1307,11 +1307,11 @@ int tcp_recvmsg(struct kiocb *iocb, stru (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && __get_cpu_var(softnet_data).net_dma) { - preempt_enable_no_resched(); + preempt_enable(); tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); } else { - preempt_enable_no_resched(); + preempt_enable(); } } #endif Index: linux-2.6.25.8-rt7/net/ipv4/route.c =================================================================== --- linux-2.6.25.8-rt7.orig/net/ipv4/route.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/net/ipv4/route.c 2008-06-23 19:13:50.000000000 -0400 @@ -207,13 +207,13 @@ struct rt_hash_bucket { struct rtable *chain; }; #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ - defined(CONFIG_PROVE_LOCKING) + defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_PREEMPT_RT) /* * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks * The size of this table is a power of two and depends on the number of CPUS. 
* (on lockdep we have a quite big spinlock_t, so keep the size down there) */ -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT) # define RT_HASH_LOCK_SZ 256 #else # if NR_CPUS >= 32 @@ -245,7 +245,7 @@ static __init void rt_hash_lock_init(voi spin_lock_init(&rt_hash_locks[i]); } #else -# define rt_hash_lock_addr(slot) NULL +# define rt_hash_lock_addr(slot) ((spinlock_t *)NULL) static inline void rt_hash_lock_init(void) { Index: linux-2.6.25.8-rt7/drivers/acpi/processor_idle.c =================================================================== --- linux-2.6.25.8-rt7.orig/drivers/acpi/processor_idle.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/drivers/acpi/processor_idle.c 2008-06-23 19:13:45.000000000 -0400 @@ -216,7 +216,7 @@ static void acpi_safe_halt(void) * test NEED_RESCHED: */ smp_mb(); - if (!need_resched()) { + if (!need_resched() || !need_resched_delayed()) { safe_halt(); local_irq_disable(); } @@ -1485,7 +1485,7 @@ static int acpi_idle_enter_simple(struct */ smp_mb(); - if (unlikely(need_resched())) { + if (unlikely(need_resched() || need_resched_delayed())) { current_thread_info()->status |= TS_POLLING; local_irq_enable(); return 0; @@ -1530,7 +1530,7 @@ static int acpi_idle_enter_simple(struct } static int c3_cpu_count; -static DEFINE_SPINLOCK(c3_lock); +static DEFINE_RAW_SPINLOCK(c3_lock); /** * acpi_idle_enter_bm - enters C3 with proper BM handling @@ -1574,7 +1574,7 @@ static int acpi_idle_enter_bm(struct cpu */ smp_mb(); - if (unlikely(need_resched())) { + if (unlikely(need_resched() || need_resched_delayed())) { current_thread_info()->status |= TS_POLLING; local_irq_enable(); return 0; Index: linux-2.6.25.8-rt7/drivers/input/ff-memless.c =================================================================== --- linux-2.6.25.8-rt7.orig/drivers/input/ff-memless.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/drivers/input/ff-memless.c 2008-06-23 19:13:27.000000000 -0400 @@ -28,6 +28,7 @@ #include #include #include +#include #include #include Index: linux-2.6.25.8-rt7/fs/proc/array.c =================================================================== --- linux-2.6.25.8-rt7.orig/fs/proc/array.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/fs/proc/array.c 2008-06-23 19:13:27.000000000 -0400 @@ -136,12 +136,13 @@ static inline void task_name(struct seq_ */ static const char *task_state_array[] = { "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "T (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)" /* 32 */ + "M (running-mutex)", /* 1 */ + "S (sleeping)", /* 2 */ + "D (disk sleep)", /* 4 */ + "T (stopped)", /* 8 */ + "T (tracing stop)", /* 16 */ + "Z (zombie)", /* 32 */ + "X (dead)" /* 64 */ }; static inline const char *get_task_state(struct task_struct *tsk) @@ -308,6 +309,19 @@ static inline void task_context_switch_c p->nivcsw); } +#define get_blocked_on(t) (-1) + +static inline void show_blocked_on(struct seq_file *m, struct task_struct *p) +{ + pid_t pid = get_blocked_on(p); + + if (pid < 0) + return; + + seq_printf(m, "BlckOn: %d\n", pid); +} + + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -327,6 +341,7 @@ int proc_pid_status(struct seq_file *m, task_show_regs(m, task); #endif task_context_switch_counts(m, task); + show_blocked_on(m, task); return 0; } Index: linux-2.6.25.8-rt7/include/linux/bit_spinlock.h 
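The fs/proc/array.c hunk above adds an "M (running-mutex)" task state and a "BlckOn:" line to /proc/<pid>/status; with the get_blocked_on() stub shown there the value is always -1 and the line is suppressed, so the field only appears once a real helper is wired in. A user-space sketch for reading it, assuming that final form:

#include <stdio.h>
#include <unistd.h>

static int read_blocked_on(int pid)
{
        char path[64], line[256];
        int blocker = -1;
        FILE *f;

        snprintf(path, sizeof(path), "/proc/%d/status", pid);
        f = fopen(path, "r");
        if (!f)
                return -1;
        while (fgets(line, sizeof(line), f)) {
                if (sscanf(line, "BlckOn: %d", &blocker) == 1)
                        break;
        }
        fclose(f);
        return blocker; /* -1: not blocked on a lock owner, or field absent */
}

int main(void)
{
        printf("pid %d blocked on: %d\n", (int)getpid(), read_blocked_on((int)getpid()));
        return 0;
}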
=================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/bit_spinlock.h 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/bit_spinlock.h 2008-06-23 19:13:27.000000000 -0400 @@ -1,6 +1,8 @@ #ifndef __LINUX_BIT_SPINLOCK_H #define __LINUX_BIT_SPINLOCK_H +#if 0 + /* * bit-based spin_lock() * @@ -91,5 +93,7 @@ static inline int bit_spin_is_locked(int #endif } +#endif + #endif /* __LINUX_BIT_SPINLOCK_H */ Index: linux-2.6.25.8-rt7/include/linux/mutex.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/mutex.h 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/mutex.h 2008-06-23 19:14:04.000000000 -0400 @@ -12,11 +12,82 @@ #include #include +#include #include #include #include +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ + , .dep_map = { .name = #lockname } +#else +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) +#endif + +#ifdef CONFIG_PREEMPT_RT + +#include + +struct mutex { + struct rt_mutex lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + + +#define __MUTEX_INITIALIZER(mutexname) \ + { \ + .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \ + __DEP_MAP_MUTEX_INITIALIZER(mutexname) \ + } + +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) + +extern void +_mutex_init(struct mutex *lock, char *name, struct lock_class_key *key); + +extern void __lockfunc _mutex_lock(struct mutex *lock); +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock); +extern int __lockfunc _mutex_lock_killable(struct mutex *lock); +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_trylock(struct mutex *lock); +extern void __lockfunc _mutex_unlock(struct mutex *lock); + +#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock) +#define mutex_lock(l) _mutex_lock(l) +#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l) +#define mutex_lock_killable(l) _mutex_lock_killable(l) +#define mutex_trylock(l) _mutex_trylock(l) +#define mutex_unlock(l) _mutex_unlock(l) +#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible_nested(l, s) +# define mutex_lock_killable_nested(l, s) \ + _mutex_lock_killable_nested(l, s) +#else +# define mutex_lock_nested(l, s) _mutex_lock(l) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible(l) +# define mutex_lock_killable_nested(l, s) \ + _mutex_lock_killable(l) +#endif + +# define mutex_init(mutex) \ +do { \ + static struct lock_class_key __key; \ + \ + _mutex_init((mutex), #mutex, &__key); \ +} while (0) + +#else /* * Simple, straightforward mutexes with strict semantics: * @@ -86,13 +157,6 @@ do { \ # define mutex_destroy(mutex) do { } while (0) #endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ - , .dep_map = { .name = #lockname } -#else -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) -#endif - #define __MUTEX_INITIALIZER(lockname) \ { .count = ATOMIC_INIT(1) \ , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ @@ -149,3 +213,4 @@ extern int 
mutex_trylock(struct mutex *l extern void mutex_unlock(struct mutex *lock); #endif +#endif Index: linux-2.6.25.8-rt7/include/linux/plist.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/plist.h 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/plist.h 2008-06-23 19:14:12.000000000 -0400 @@ -81,7 +81,7 @@ struct plist_head { struct list_head prio_list; struct list_head node_list; #ifdef CONFIG_DEBUG_PI_LIST - spinlock_t *lock; + raw_spinlock_t *lock; #endif }; @@ -99,13 +99,13 @@ struct plist_node { /** * PLIST_HEAD_INIT - static struct plist_head initializer * @head: struct plist_head variable name - * @_lock: lock to initialize for this list + * @_lock: lock * to initialize for this list */ #define PLIST_HEAD_INIT(head, _lock) \ { \ .prio_list = LIST_HEAD_INIT((head).prio_list), \ .node_list = LIST_HEAD_INIT((head).node_list), \ - PLIST_HEAD_LOCK_INIT(&(_lock)) \ + PLIST_HEAD_LOCK_INIT(_lock) \ } /** @@ -125,7 +125,7 @@ struct plist_node { * @lock: list spinlock, remembered for debugging */ static inline void -plist_head_init(struct plist_head *head, spinlock_t *lock) +plist_head_init(struct plist_head *head, raw_spinlock_t *lock) { INIT_LIST_HEAD(&head->prio_list); INIT_LIST_HEAD(&head->node_list); @@ -148,6 +148,8 @@ static inline void plist_node_init(struc extern void plist_add(struct plist_node *node, struct plist_head *head); extern void plist_del(struct plist_node *node, struct plist_head *head); +extern void plist_head_splice(struct plist_head *src, struct plist_head *dst); + /** * plist_for_each - iterate over the plist * @pos: the type * to use as a loop counter Index: linux-2.6.25.8-rt7/include/linux/rt_lock.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/include/linux/rt_lock.h 2008-06-23 19:14:27.000000000 -0400 @@ -0,0 +1,282 @@ +#ifndef __LINUX_RT_LOCK_H +#define __LINUX_RT_LOCK_H + +/* + * Real-Time Preemption Support + * + * started by Ingo Molnar: + * + * Copyright (C) 2004, 2005 Red Hat, Inc., Ingo Molnar + * + * This file contains the main data structure definitions. 
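Under PREEMPT_RT the mutex.h hunk above turns struct mutex into a thin wrapper around rt_mutex (which provides priority inheritance) while keeping the API identical, so ordinary users compile unchanged in both configurations; a brief usage sketch (names illustrative):

static DEFINE_MUTEX(example_lock);      /* rt_mutex-backed on RT, classic mutex otherwise */
static int example_counter;

static void example_update(void)
{
        mutex_lock(&example_lock);      /* on RT this may priority-boost the lock owner */
        example_counter++;
        mutex_unlock(&example_lock);
}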
+ */ +#include +#include +#include +#include + +#ifdef CONFIG_PREEMPT_RT +/* + * spinlocks - an RT mutex plus lock-break field: + */ +typedef struct { + struct rt_mutex lock; + unsigned int break_lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} spinlock_t; + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# define __RT_SPIN_INITIALIZER(name) \ + { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + .save_state = 1, \ + .file = __FILE__, \ + .line = __LINE__, } +#else +# define __RT_SPIN_INITIALIZER(name) \ + { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name.wait_lock) } +#endif + +#define __SPIN_LOCK_UNLOCKED(name) (spinlock_t) \ + { .lock = __RT_SPIN_INITIALIZER(name), \ + SPIN_DEP_MAP_INIT(name) } + +#else /* !PREEMPT_RT */ + +typedef raw_spinlock_t spinlock_t; + +#define __SPIN_LOCK_UNLOCKED _RAW_SPIN_LOCK_UNLOCKED + +#endif + +#define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(spin_old_style) + + +#define __DEFINE_SPINLOCK(name) \ + spinlock_t name = __SPIN_LOCK_UNLOCKED(name) + +#define DEFINE_SPINLOCK(name) \ + spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name) + +#ifdef CONFIG_PREEMPT_RT + +struct rw_mutex { + struct task_struct *owner; + struct rt_mutex mutex; + atomic_t count; /* number of times held for read */ + atomic_t owners; /* number of owners as readers */ + struct list_head readers; + int prio; +}; + +/* + * RW-semaphores are a spinlock plus a reader-depth count. + * + * Note that the semantics are different from the usual + * Linux rw-sems, in PREEMPT_RT mode we do not allow + * multiple readers to hold the lock at once, we only allow + * a read-lock owner to read-lock recursively. This is + * better for latency, makes the implementation inherently + * fair and makes it simpler as well: + */ +struct rw_semaphore { + struct rw_mutex owners; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +/* + * rwlocks - an RW semaphore plus lock-break field: + */ +typedef struct { + struct rw_mutex owners; + unsigned int break_lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} rwlock_t; + +#define __RW_LOCK_UNLOCKED(name) (rwlock_t) \ + { .owners.mutex = __RT_SPIN_INITIALIZER(name.owners.mutex), \ + .owners.prio = MAX_PRIO, \ + RW_DEP_MAP_INIT(name) } +#else /* !PREEMPT_RT */ + +typedef raw_rwlock_t rwlock_t; + +#define __RW_LOCK_UNLOCKED _RAW_RW_LOCK_UNLOCKED + +#endif + +#define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(rw_old_style) + + +#define DEFINE_RWLOCK(name) \ + rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name) + +#ifdef CONFIG_PREEMPT_RT + +/* + * Semaphores - a spinlock plus the semaphore count: + */ +struct semaphore { + atomic_t count; + struct rt_mutex lock; +}; + +#define DECLARE_MUTEX(name) \ +struct semaphore name = \ + { .count = { 1 }, .lock = __RT_MUTEX_INITIALIZER(name.lock) } + +extern void +__sema_init(struct semaphore *sem, int val, char *name, char *file, int line); + +#define rt_sema_init(sem, val) \ + __sema_init(sem, val, #sem, __FILE__, __LINE__) + +extern void +__init_MUTEX(struct semaphore *sem, char *name, char *file, int line); +#define rt_init_MUTEX(sem) \ + __init_MUTEX(sem, #sem, __FILE__, __LINE__) + +extern void there_is_no_init_MUTEX_LOCKED_for_RT_semaphores(void); + +/* + * No locked initialization for RT semaphores + */ +#define rt_init_MUTEX_LOCKED(sem) \ + there_is_no_init_MUTEX_LOCKED_for_RT_semaphores() +extern void rt_down(struct semaphore *sem); +extern int rt_down_interruptible(struct semaphore *sem); +extern int rt_down_trylock(struct 
semaphore *sem); +extern void rt_up(struct semaphore *sem); + +#define rt_sem_is_locked(s) rt_mutex_is_locked(&(s)->lock) +#define rt_sema_count(s) atomic_read(&(s)->count) + +extern int __bad_func_type(void); + +#include + +/* + * PICK_SEM_OP() is a small redirector to allow less typing of the lock + * types struct compat_semaphore, struct semaphore, at the front of the + * PICK_FUNCTION macro. + */ +#define PICK_SEM_OP(...) PICK_FUNCTION(struct compat_semaphore *, \ + struct semaphore *, ##__VA_ARGS__) +#define PICK_SEM_OP_RET(...) PICK_FUNCTION_RET(struct compat_semaphore *,\ + struct semaphore *, ##__VA_ARGS__) + +#define sema_init(sem, val) \ + PICK_SEM_OP(compat_sema_init, rt_sema_init, sem, val) + +#define init_MUTEX(sem) PICK_SEM_OP(compat_init_MUTEX, rt_init_MUTEX, sem) + +#define init_MUTEX_LOCKED(sem) \ + PICK_SEM_OP(compat_init_MUTEX_LOCKED, rt_init_MUTEX_LOCKED, sem) + +#define down(sem) PICK_SEM_OP(compat_down, rt_down, sem) + +#define down_interruptible(sem) \ + PICK_SEM_OP_RET(compat_down_interruptible, rt_down_interruptible, sem) + +#define down_trylock(sem) \ + PICK_SEM_OP_RET(compat_down_trylock, rt_down_trylock, sem) + +#define up(sem) PICK_SEM_OP(compat_up, rt_up, sem) + +#define sem_is_locked(sem) \ + PICK_SEM_OP_RET(compat_sem_is_locked, rt_sem_is_locked, sem) + +#define sema_count(sem) PICK_SEM_OP_RET(compat_sema_count, rt_sema_count, sem) + +/* + * rwsems: + */ + +#define __RWSEM_INITIALIZER(name) \ + { .owners.mutex = __RT_MUTEX_INITIALIZER(name.owners.mutex), \ + .owners.prio = MAX_PRIO, \ + RW_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(lockname) \ + struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) + +extern void __rt_rwsem_init(struct rw_semaphore *rwsem, char *name, + struct lock_class_key *key); + +# define rt_init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_rwsem_init((sem), #sem, &__key); \ +} while (0) + +extern void __dont_do_this_in_rt(struct rw_semaphore *rwsem); + +#define rt_down_read_non_owner(rwsem) __dont_do_this_in_rt(rwsem) +#define rt_up_read_non_owner(rwsem) __dont_do_this_in_rt(rwsem) + +extern void rt_down_write(struct rw_semaphore *rwsem); +extern void +rt_down_read_nested(struct rw_semaphore *rwsem, int subclass); +extern void +rt_down_write_nested(struct rw_semaphore *rwsem, int subclass); +extern void rt_down_read(struct rw_semaphore *rwsem); +extern int rt_down_write_trylock(struct rw_semaphore *rwsem); +extern int rt_down_read_trylock(struct rw_semaphore *rwsem); +extern void rt_up_read(struct rw_semaphore *rwsem); +extern void rt_up_write(struct rw_semaphore *rwsem); +extern void rt_downgrade_write(struct rw_semaphore *rwsem); + +# define rt_rwsem_is_locked(rws) ((rws)->owners.owner != NULL) + +#define PICK_RWSEM_OP(...) PICK_FUNCTION(struct compat_rw_semaphore *, \ + struct rw_semaphore *, ##__VA_ARGS__) +#define PICK_RWSEM_OP_RET(...) 
PICK_FUNCTION_RET(struct compat_rw_semaphore *,\ + struct rw_semaphore *, ##__VA_ARGS__) + +#define init_rwsem(rwsem) PICK_RWSEM_OP(compat_init_rwsem, rt_init_rwsem, rwsem) + +#define down_read(rwsem) PICK_RWSEM_OP(compat_down_read, rt_down_read, rwsem) + +#define down_read_non_owner(rwsem) \ + PICK_RWSEM_OP(compat_down_read_non_owner, rt_down_read_non_owner, rwsem) + +#define down_read_trylock(rwsem) \ + PICK_RWSEM_OP_RET(compat_down_read_trylock, rt_down_read_trylock, rwsem) + +#define down_write(rwsem) PICK_RWSEM_OP(compat_down_write, rt_down_write, rwsem) + +#define down_read_nested(rwsem, subclass) \ + PICK_RWSEM_OP(compat_down_read_nested, rt_down_read_nested, \ + rwsem, subclass) + +#define down_write_nested(rwsem, subclass) \ + PICK_RWSEM_OP(compat_down_write_nested, rt_down_write_nested, \ + rwsem, subclass) + +#define down_write_trylock(rwsem) \ + PICK_RWSEM_OP_RET(compat_down_write_trylock, rt_down_write_trylock,\ + rwsem) + +#define up_read(rwsem) PICK_RWSEM_OP(compat_up_read, rt_up_read, rwsem) + +#define up_read_non_owner(rwsem) \ + PICK_RWSEM_OP(compat_up_read_non_owner, rt_up_read_non_owner, rwsem) + +#define up_write(rwsem) PICK_RWSEM_OP(compat_up_write, rt_up_write, rwsem) + +#define downgrade_write(rwsem) \ + PICK_RWSEM_OP(compat_downgrade_write, rt_downgrade_write, rwsem) + +#define rwsem_is_locked(rwsem) \ + PICK_RWSEM_OP_RET(compat_rwsem_is_locked, rt_rwsem_is_locked, rwsem) + +#endif /* CONFIG_PREEMPT_RT */ + +#endif + Index: linux-2.6.25.8-rt7/include/linux/rtmutex.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/rtmutex.h 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/rtmutex.h 2008-06-23 19:14:12.000000000 -0400 @@ -24,7 +24,7 @@ * @owner: the mutex owner */ struct rt_mutex { - spinlock_t wait_lock; + raw_spinlock_t wait_lock; struct plist_head wait_list; struct task_struct *owner; #ifdef CONFIG_DEBUG_RT_MUTEXES @@ -63,8 +63,8 @@ struct hrtimer_sleeper; #endif #define __RT_MUTEX_INITIALIZER(mutexname) \ - { .wait_lock = __SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ - , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, mutexname.wait_lock) \ + { .wait_lock = RAW_SPIN_LOCK_UNLOCKED(mutexname) \ + , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, &mutexname.wait_lock) \ , .owner = NULL \ __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} @@ -88,6 +88,8 @@ extern void rt_mutex_destroy(struct rt_m extern void rt_mutex_lock(struct rt_mutex *lock); extern int rt_mutex_lock_interruptible(struct rt_mutex *lock, int detect_deadlock); +extern int rt_mutex_lock_killable(struct rt_mutex *lock, + int detect_deadlock); extern int rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, int detect_deadlock); @@ -98,7 +100,7 @@ extern void rt_mutex_unlock(struct rt_mu #ifdef CONFIG_RT_MUTEXES # define INIT_RT_MUTEXES(tsk) \ - .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters, tsk.pi_lock), \ + .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters, &tsk.pi_lock), \ INIT_RT_MUTEX_DEBUG(tsk) #else # define INIT_RT_MUTEXES(tsk) Index: linux-2.6.25.8-rt7/include/linux/rwsem-spinlock.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/rwsem-spinlock.h 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/rwsem-spinlock.h 2008-06-23 19:13:27.000000000 -0400 @@ -28,7 +28,7 @@ struct rwsem_waiter; * - if activity is -1 then there is one active writer * - if wait_list is not empty, then there are processes waiting for the 
semaphore */ -struct rw_semaphore { +struct compat_rw_semaphore { __s32 activity; spinlock_t wait_lock; struct list_head wait_list; @@ -43,33 +43,32 @@ struct rw_semaphore { # define __RWSEM_DEP_MAP_INIT(lockname) #endif -#define __RWSEM_INITIALIZER(name) \ -{ 0, __SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEP_MAP_INIT(name) } +#define __COMPAT_RWSEM_INITIALIZER(name) \ +{ 0, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define COMPAT_DECLARE_RWSEM(name) \ + struct compat_rw_semaphore name = __COMPAT_RWSEM_INITIALIZER(name) -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, +extern void __compat_init_rwsem(struct compat_rw_semaphore *sem, const char *name, struct lock_class_key *key); -#define init_rwsem(sem) \ +#define compat_init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __compat_init_rwsem((sem), #sem, &__key); \ } while (0) -extern void __down_read(struct rw_semaphore *sem); -extern int __down_read_trylock(struct rw_semaphore *sem); -extern void __down_write(struct rw_semaphore *sem); -extern void __down_write_nested(struct rw_semaphore *sem, int subclass); -extern int __down_write_trylock(struct rw_semaphore *sem); -extern void __up_read(struct rw_semaphore *sem); -extern void __up_write(struct rw_semaphore *sem); -extern void __downgrade_write(struct rw_semaphore *sem); +extern void __down_read(struct compat_rw_semaphore *sem); +extern int __down_read_trylock(struct compat_rw_semaphore *sem); +extern void __down_write(struct compat_rw_semaphore *sem); +extern void __down_write_nested(struct compat_rw_semaphore *sem, int subclass); +extern int __down_write_trylock(struct compat_rw_semaphore *sem); +extern void __up_read(struct compat_rw_semaphore *sem); +extern void __up_write(struct compat_rw_semaphore *sem); +extern void __downgrade_write(struct compat_rw_semaphore *sem); -static inline int rwsem_is_locked(struct rw_semaphore *sem) +static inline int compat_rwsem_is_locked(struct compat_rw_semaphore *sem) { return (sem->activity != 0); } Index: linux-2.6.25.8-rt7/include/linux/rwsem.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/rwsem.h 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/rwsem.h 2008-06-23 19:13:27.000000000 -0400 @@ -9,6 +9,10 @@ #include +#ifdef CONFIG_PREEMPT_RT +# include +#endif + #ifdef __KERNEL__ #include @@ -16,48 +20,59 @@ #include #include -struct rw_semaphore; +#ifndef CONFIG_PREEMPT_RT +/* + * On !PREEMPT_RT all rw-semaphores are compat: + */ +#define compat_rw_semaphore rw_semaphore +#endif + +struct compat_rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK -#include /* use a generic implementation */ +# include /* use a generic implementation */ +# ifndef CONFIG_PREEMPT_RT +# define __RWSEM_INITIALIZER __COMPAT_RWSEM_INITIALIZER +# define DECLARE_RWSEM COMPAT_DECLARE_RWSEM +# endif #else -#include /* use an arch-specific implementation */ +# include /* use an arch-specific implementation */ #endif /* * lock for reading */ -extern void down_read(struct rw_semaphore *sem); +extern void compat_down_read(struct compat_rw_semaphore *sem); /* * trylock for reading -- returns 1 if successful, 0 if contention */ -extern int down_read_trylock(struct rw_semaphore *sem); +extern int compat_down_read_trylock(struct compat_rw_semaphore *sem); 
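For illustration, not part of the patch: the sema_init()/down()/up() and rwsem wrappers above funnel both lock flavours through PICK_SEM_OP()/PICK_RWSEM_OP(), which rely on PICK_FUNCTION() from the -rt pickop header (not shown in this series). The standalone sketch below, with invented type and function names, only demonstrates the underlying idea of compile-time dispatch on the pointer type via GCC's __builtin_types_compatible_p(); it is not the real implementation.

#include <stdio.h>

struct compat_sem { int count; };	/* stands in for struct compat_semaphore */
struct rt_sem     { int count; };	/* stands in for the RT struct semaphore */

static void compat_down(struct compat_sem *s) { s->count--; puts("compat_down"); }
static void rt_down(struct rt_sem *s)         { s->count--; puts("rt_down"); }

/*
 * Toy PICK_FUNCTION(): inspect the static type of the argument and call the
 * matching implementation.  Both branches must still type-check, hence the
 * casts through unsigned long.
 */
#define PICK_DOWN(sem)							\
do {									\
	if (__builtin_types_compatible_p(typeof(sem), struct compat_sem *)) \
		compat_down((struct compat_sem *)(unsigned long)(sem));	\
	else								\
		rt_down((struct rt_sem *)(unsigned long)(sem));		\
} while (0)

int main(void)
{
	struct compat_sem c = { 1 };
	struct rt_sem r = { 1 };

	PICK_DOWN(&c);		/* resolves to compat_down() */
	PICK_DOWN(&r);		/* resolves to rt_down()     */
	return 0;
}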
/* * lock for writing */ -extern void down_write(struct rw_semaphore *sem); +extern void compat_down_write(struct compat_rw_semaphore *sem); /* * trylock for writing -- returns 1 if successful, 0 if contention */ -extern int down_write_trylock(struct rw_semaphore *sem); +extern int compat_down_write_trylock(struct compat_rw_semaphore *sem); /* * release a read lock */ -extern void up_read(struct rw_semaphore *sem); +extern void compat_up_read(struct compat_rw_semaphore *sem); /* * release a write lock */ -extern void up_write(struct rw_semaphore *sem); +extern void compat_up_write(struct compat_rw_semaphore *sem); /* * downgrade write lock to read lock */ -extern void downgrade_write(struct rw_semaphore *sem); +extern void compat_downgrade_write(struct compat_rw_semaphore *sem); #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -73,22 +88,79 @@ extern void downgrade_write(struct rw_se * lockdep_set_class() at lock initialization time. * See Documentation/lockdep-design.txt for more details.) */ -extern void down_read_nested(struct rw_semaphore *sem, int subclass); -extern void down_write_nested(struct rw_semaphore *sem, int subclass); +extern void +compat_down_read_nested(struct compat_rw_semaphore *sem, int subclass); +extern void +compat_down_write_nested(struct compat_rw_semaphore *sem, int subclass); /* * Take/release a lock when not the owner will release it. * * [ This API should be avoided as much as possible - the * proper abstraction for this case is completions. ] */ -extern void down_read_non_owner(struct rw_semaphore *sem); -extern void up_read_non_owner(struct rw_semaphore *sem); +extern void +compat_down_read_non_owner(struct compat_rw_semaphore *sem); +extern void +compat_up_read_non_owner(struct compat_rw_semaphore *sem); #else -# define down_read_nested(sem, subclass) down_read(sem) -# define down_write_nested(sem, subclass) down_write(sem) -# define down_read_non_owner(sem) down_read(sem) -# define up_read_non_owner(sem) up_read(sem) +# define compat_down_read_nested(sem, subclass) compat_down_read(sem) +# define compat_down_write_nested(sem, subclass) compat_down_write(sem) +# define compat_down_read_non_owner(sem) compat_down_read(sem) +# define compat_up_read_non_owner(sem) compat_up_read(sem) #endif +#ifndef CONFIG_PREEMPT_RT + +#define DECLARE_RWSEM COMPAT_DECLARE_RWSEM + +/* + * NOTE, lockdep: this has to be a macro, so that separate class-keys + * get generated by the compiler, if the same function does multiple + * init_rwsem() calls to different rwsems. 
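For illustration, not part of the patch: the lockdep note above (init_rwsem() has to stay a macro so every call site gets its own class key) can be seen in isolation. The userspace mock-up below, with invented names, shows why a block-scope static inside a macro does the trick where an inline function would hand out one shared key:

#include <stdio.h>

struct lock_class_key { int dummy; };
struct mock_rwsem { const struct lock_class_key *key; };

/* Same trick as compat_init_rwsem(): every expansion owns a fresh static key. */
#define INIT_RWSEM_SKETCH(sem)				\
do {							\
	static struct lock_class_key __key;		\
	(sem)->key = &__key;				\
} while (0)

int main(void)
{
	struct mock_rwsem a, b;

	INIT_RWSEM_SKETCH(&a);	/* expansion #1: one static __key  */
	INIT_RWSEM_SKETCH(&b);	/* expansion #2: a different __key */
	printf("distinct lockdep classes: %s\n", a.key != b.key ? "yes" : "no");
	return 0;
}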
+ */ +#define init_rwsem(rwsem) compat_init_rwsem(rwsem) + +static inline void down_read(struct compat_rw_semaphore *rwsem) +{ + compat_down_read(rwsem); +} +static inline int down_read_trylock(struct compat_rw_semaphore *rwsem) +{ + return compat_down_read_trylock(rwsem); +} +static inline void down_write(struct compat_rw_semaphore *rwsem) +{ + compat_down_write(rwsem); +} +static inline int down_write_trylock(struct compat_rw_semaphore *rwsem) +{ + return compat_down_write_trylock(rwsem); +} +static inline void up_read(struct compat_rw_semaphore *rwsem) +{ + compat_up_read(rwsem); +} +static inline void up_write(struct compat_rw_semaphore *rwsem) +{ + compat_up_write(rwsem); +} +static inline void downgrade_write(struct compat_rw_semaphore *rwsem) +{ + compat_downgrade_write(rwsem); +} +static inline int rwsem_is_locked(struct compat_rw_semaphore *sem) +{ + return compat_rwsem_is_locked(sem); +} +# define down_read_nested(sem, subclass) \ + compat_down_read_nested(sem, subclass) +# define down_write_nested(sem, subclass) \ + compat_down_write_nested(sem, subclass) +# define down_read_non_owner(sem) \ + compat_down_read_non_owner(sem) +# define up_read_non_owner(sem) \ + compat_up_read_non_owner(sem) +#endif /* !CONFIG_PREEMPT_RT */ + #endif /* __KERNEL__ */ #endif /* _LINUX_RWSEM_H */ Index: linux-2.6.25.8-rt7/include/linux/semaphore.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/include/linux/semaphore.h 2008-06-23 19:13:27.000000000 -0400 @@ -0,0 +1,49 @@ +#ifndef _LINUX_SEMAPHORE_H +#define _LINUX_SEMAPHORE_H + +#ifdef CONFIG_PREEMPT_RT +# include +#else + +#define DECLARE_MUTEX COMPAT_DECLARE_MUTEX + +static inline void sema_init(struct compat_semaphore *sem, int val) +{ + compat_sema_init(sem, val); +} +static inline void init_MUTEX(struct compat_semaphore *sem) +{ + compat_init_MUTEX(sem); +} +static inline void init_MUTEX_LOCKED(struct compat_semaphore *sem) +{ + compat_init_MUTEX_LOCKED(sem); +} +static inline void down(struct compat_semaphore *sem) +{ + compat_down(sem); +} +static inline int down_interruptible(struct compat_semaphore *sem) +{ + return compat_down_interruptible(sem); +} +static inline int down_trylock(struct compat_semaphore *sem) +{ + return compat_down_trylock(sem); +} +static inline void up(struct compat_semaphore *sem) +{ + compat_up(sem); +} +static inline int sem_is_locked(struct compat_semaphore *sem) +{ + return compat_sem_is_locked(sem); +} +static inline int sema_count(struct compat_semaphore *sem) +{ + return compat_sema_count(sem); +} + +#endif /* CONFIG_PREEMPT_RT */ + +#endif /* _LINUX_SEMAPHORE_H */ Index: linux-2.6.25.8-rt7/include/linux/seqlock.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/seqlock.h 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/seqlock.h 2008-06-23 19:14:10.000000000 -0400 @@ -32,46 +32,83 @@ typedef struct { unsigned sequence; spinlock_t lock; -} seqlock_t; +} __seqlock_t; + +typedef struct { + unsigned sequence; + raw_spinlock_t lock; +} __raw_seqlock_t; + +#define seqlock_need_resched(seq) lock_need_resched(&(seq)->lock) + +#ifdef CONFIG_PREEMPT_RT +typedef __seqlock_t seqlock_t; +#else +typedef __raw_seqlock_t seqlock_t; +#endif + +typedef __raw_seqlock_t raw_seqlock_t; /* * These macros triggered gcc-3.x compile-time problems. We think these are * OK now. Be cautious. 
*/ -#define __SEQLOCK_UNLOCKED(lockname) \ - { 0, __SPIN_LOCK_UNLOCKED(lockname) } +#define __RAW_SEQLOCK_UNLOCKED(lockname) \ + { 0, RAW_SPIN_LOCK_UNLOCKED(lockname) } + +#ifdef CONFIG_PREEMPT_RT +# define __SEQLOCK_UNLOCKED(lockname) { 0, __SPIN_LOCK_UNLOCKED(lockname) } +#else +# define __SEQLOCK_UNLOCKED(lockname) __RAW_SEQLOCK_UNLOCKED(lockname) +#endif #define SEQLOCK_UNLOCKED \ __SEQLOCK_UNLOCKED(old_style_seqlock_init) -#define seqlock_init(x) \ - do { \ - (x)->sequence = 0; \ - spin_lock_init(&(x)->lock); \ - } while (0) +#define raw_seqlock_init(x) \ + do { *(x) = (raw_seqlock_t) __RAW_SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0) + +#define seqlock_init(x) \ + do { *(x) = (seqlock_t) __SEQLOCK_UNLOCKED(x); spin_lock_init(&(x)->lock); } while (0) #define DEFINE_SEQLOCK(x) \ seqlock_t x = __SEQLOCK_UNLOCKED(x) +#define DEFINE_RAW_SEQLOCK(name) \ + raw_seqlock_t name __cacheline_aligned_in_smp = \ + __RAW_SEQLOCK_UNLOCKED(name) + + /* Lock out other writers and update the count. * Acts like a normal spin_lock/unlock. * Don't need preempt_disable() because that is in the spin_lock already. */ -static inline void write_seqlock(seqlock_t *sl) +static inline void __write_seqlock(seqlock_t *sl) { spin_lock(&sl->lock); ++sl->sequence; smp_wmb(); } -static inline void write_sequnlock(seqlock_t *sl) +static __always_inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) +{ + unsigned long flags; + + local_save_flags(flags); + __write_seqlock(sl); + return flags; +} + +static inline void __write_sequnlock(seqlock_t *sl) { smp_wmb(); sl->sequence++; spin_unlock(&sl->lock); } -static inline int write_tryseqlock(seqlock_t *sl) +#define __write_sequnlock_irqrestore(sl, flags) __write_sequnlock(sl) + +static inline int __write_tryseqlock(seqlock_t *sl) { int ret = spin_trylock(&sl->lock); @@ -83,7 +120,7 @@ static inline int write_tryseqlock(seqlo } /* Start of read calculation -- fetch last complete writer token */ -static __always_inline unsigned read_seqbegin(const seqlock_t *sl) +static __always_inline unsigned __read_seqbegin(const seqlock_t *sl) { unsigned ret = sl->sequence; smp_rmb(); @@ -98,12 +135,195 @@ static __always_inline unsigned read_seq * * Using xor saves one conditional branch. 
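For reference, not part of the patch: the read side that read_seqbegin()/read_seqretry() serve is the usual retry loop sketched below (kernel-style snippet, invented names, assumes <linux/seqlock.h>). Under PREEMPT_RT the retry path additionally bounces through the writer's lock (see __read_seqretry() further down), so a re-reading task cannot livelock a writer that now holds a sleeping lock and may itself be preempted.

/* Illustrative usage only. */
static unsigned long long sample_counter(seqlock_t *sl,
					 const unsigned long long *counter)
{
	unsigned seq;
	unsigned long long val;

	do {
		seq = read_seqbegin(sl);	/* snapshot the sequence count  */
		val = *counter;			/* speculative read of the data */
	} while (read_seqretry(sl, seq));	/* retry if a writer intervened */

	return val;
}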
*/ -static __always_inline int read_seqretry(const seqlock_t *sl, unsigned iv) +static inline int __read_seqretry(seqlock_t *sl, unsigned iv) +{ + int ret; + + smp_rmb(); + ret = (iv & 1) | (sl->sequence ^ iv); + /* + * If invalid then serialize with the writer, to make sure we + * are not livelocking it: + */ + if (unlikely(ret)) { + unsigned long flags; + spin_lock_irqsave(&sl->lock, flags); + spin_unlock_irqrestore(&sl->lock, flags); + } + return ret; +} + +static __always_inline void __write_seqlock_raw(raw_seqlock_t *sl) +{ + spin_lock(&sl->lock); + ++sl->sequence; + smp_wmb(); +} + +static __always_inline unsigned long +__write_seqlock_irqsave_raw(raw_seqlock_t *sl) +{ + unsigned long flags; + + local_irq_save(flags); + __write_seqlock_raw(sl); + return flags; +} + +static __always_inline void __write_seqlock_irq_raw(raw_seqlock_t *sl) +{ + local_irq_disable(); + __write_seqlock_raw(sl); +} + +static __always_inline void __write_seqlock_bh_raw(raw_seqlock_t *sl) +{ + local_bh_disable(); + __write_seqlock_raw(sl); +} + +static __always_inline void __write_sequnlock_raw(raw_seqlock_t *sl) +{ + smp_wmb(); + sl->sequence++; + spin_unlock(&sl->lock); +} + +static __always_inline void +__write_sequnlock_irqrestore_raw(raw_seqlock_t *sl, unsigned long flags) +{ + __write_sequnlock_raw(sl); + local_irq_restore(flags); + preempt_check_resched(); +} + +static __always_inline void __write_sequnlock_irq_raw(raw_seqlock_t *sl) +{ + __write_sequnlock_raw(sl); + local_irq_enable(); + preempt_check_resched(); +} + +static __always_inline void __write_sequnlock_bh_raw(raw_seqlock_t *sl) +{ + __write_sequnlock_raw(sl); + local_bh_enable(); +} + +static __always_inline int __write_tryseqlock_raw(raw_seqlock_t *sl) +{ + int ret = spin_trylock(&sl->lock); + + if (ret) { + ++sl->sequence; + smp_wmb(); + } + return ret; +} + +static __always_inline unsigned __read_seqbegin_raw(const raw_seqlock_t *sl) +{ + unsigned ret = sl->sequence; + smp_rmb(); + return ret; +} + +static __always_inline int __read_seqretry_raw(const raw_seqlock_t *sl, unsigned iv) { smp_rmb(); return (iv & 1) | (sl->sequence ^ iv); } +extern int __bad_seqlock_type(void); + +/* + * PICK_SEQ_OP() is a small redirector to allow less typing of the lock + * types raw_seqlock_t, seqlock_t, at the front of the PICK_FUNCTION + * macro. + */ +#define PICK_SEQ_OP(...) \ + PICK_FUNCTION(raw_seqlock_t *, seqlock_t *, ##__VA_ARGS__) +#define PICK_SEQ_OP_RET(...) 
\ + PICK_FUNCTION_RET(raw_seqlock_t *, seqlock_t *, ##__VA_ARGS__) + +#define write_seqlock(sl) PICK_SEQ_OP(__write_seqlock_raw, __write_seqlock, sl) + +#define write_sequnlock(sl) \ + PICK_SEQ_OP(__write_sequnlock_raw, __write_sequnlock, sl) + +#define write_tryseqlock(sl) \ + PICK_SEQ_OP_RET(__write_tryseqlock_raw, __write_tryseqlock, sl) + +#define read_seqbegin(sl) \ + PICK_SEQ_OP_RET(__read_seqbegin_raw, __read_seqbegin, sl) + +#define read_seqretry(sl, iv) \ + PICK_SEQ_OP_RET(__read_seqretry_raw, __read_seqretry, sl, iv) + +#define write_seqlock_irqsave(lock, flags) \ +do { \ + flags = PICK_SEQ_OP_RET(__write_seqlock_irqsave_raw, \ + __write_seqlock_irqsave, lock); \ +} while (0) + +#define write_seqlock_irq(lock) \ + PICK_SEQ_OP(__write_seqlock_irq_raw, __write_seqlock, lock) + +#define write_seqlock_bh(lock) \ + PICK_SEQ_OP(__write_seqlock_bh_raw, __write_seqlock, lock) + +#define write_sequnlock_irqrestore(lock, flags) \ + PICK_SEQ_OP(__write_sequnlock_irqrestore_raw, \ + __write_sequnlock_irqrestore, lock, flags) + +#define write_sequnlock_bh(lock) \ + PICK_SEQ_OP(__write_sequnlock_bh_raw, __write_sequnlock, lock) + +#define write_sequnlock_irq(lock) \ + PICK_SEQ_OP(__write_sequnlock_irq_raw, __write_sequnlock, lock) + +static __always_inline +unsigned long __seq_irqsave_raw(raw_seqlock_t *sl) +{ + unsigned long flags; + + local_irq_save(flags); + return flags; +} + +static __always_inline unsigned long __seq_irqsave(seqlock_t *sl) +{ + unsigned long flags; + + local_save_flags(flags); + return flags; +} + +#define read_seqbegin_irqsave(lock, flags) \ +({ \ + flags = PICK_SEQ_OP_RET(__seq_irqsave_raw, __seq_irqsave, lock);\ + read_seqbegin(lock); \ +}) + +static __always_inline int +__read_seqretry_irqrestore(seqlock_t *sl, unsigned iv, unsigned long flags) +{ + return __read_seqretry(sl, iv); +} + +static __always_inline int +__read_seqretry_irqrestore_raw(raw_seqlock_t *sl, unsigned iv, + unsigned long flags) +{ + int ret = read_seqretry(sl, iv); + local_irq_restore(flags); + preempt_check_resched(); + return ret; +} + +#define read_seqretry_irqrestore(lock, iv, flags) \ + PICK_SEQ_OP_RET(__read_seqretry_irqrestore_raw, \ + __read_seqretry_irqrestore, lock, iv, flags) /* * Version using sequence counter only. @@ -154,32 +374,4 @@ static inline void write_seqcount_end(se smp_wmb(); s->sequence++; } - -/* - * Possible sw/hw IRQ protected versions of the interfaces. 
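For reference, not part of the patch: a matching write-side sketch (kernel-style, invented names). The point of the PICK_SEQ_OP() table above is that the same write_seqlock_irqsave() spelling either really disables interrupts (raw_seqlock_t) or, for an RT seqlock_t backed by a sleeping lock, merely saves the flags and takes the rt-mutex based lock.

/* Illustrative usage only; assumes <linux/seqlock.h>. */
static DEFINE_RAW_SEQLOCK(hw_seq);	/* raw flavour: IRQs really are disabled */
static DEFINE_SEQLOCK(stats_seq);	/* RT flavour: preemptible, flags saved  */

static void publish_stat(unsigned long long *slot, unsigned long long val)
{
	unsigned long flags;

	write_seqlock_irqsave(&stats_seq, flags);
	*slot = val;
	write_sequnlock_irqrestore(&stats_seq, flags);
}

static void publish_hw(unsigned long long *slot, unsigned long long val)
{
	unsigned long flags;

	/* Same API, but here interrupts are genuinely off. */
	write_seqlock_irqsave(&hw_seq, flags);
	*slot = val;
	write_sequnlock_irqrestore(&hw_seq, flags);
}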
- */ -#define write_seqlock_irqsave(lock, flags) \ - do { local_irq_save(flags); write_seqlock(lock); } while (0) -#define write_seqlock_irq(lock) \ - do { local_irq_disable(); write_seqlock(lock); } while (0) -#define write_seqlock_bh(lock) \ - do { local_bh_disable(); write_seqlock(lock); } while (0) - -#define write_sequnlock_irqrestore(lock, flags) \ - do { write_sequnlock(lock); local_irq_restore(flags); } while(0) -#define write_sequnlock_irq(lock) \ - do { write_sequnlock(lock); local_irq_enable(); } while(0) -#define write_sequnlock_bh(lock) \ - do { write_sequnlock(lock); local_bh_enable(); } while(0) - -#define read_seqbegin_irqsave(lock, flags) \ - ({ local_irq_save(flags); read_seqbegin(lock); }) - -#define read_seqretry_irqrestore(lock, iv, flags) \ - ({ \ - int ret = read_seqretry(lock, iv); \ - local_irq_restore(flags); \ - ret; \ - }) - #endif /* __LINUX_SEQLOCK_H */ Index: linux-2.6.25.8-rt7/include/linux/spinlock_api_smp.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/spinlock_api_smp.h 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/spinlock_api_smp.h 2008-06-23 19:13:27.000000000 -0400 @@ -19,43 +19,58 @@ int in_lock_functions(unsigned long addr #define assert_spin_locked(x) BUG_ON(!spin_is_locked(x)) -void __lockfunc _spin_lock(spinlock_t *lock) __acquires(lock); -void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) - __acquires(lock); -void __lockfunc _read_lock(rwlock_t *lock) __acquires(lock); -void __lockfunc _write_lock(rwlock_t *lock) __acquires(lock); -void __lockfunc _spin_lock_bh(spinlock_t *lock) __acquires(lock); -void __lockfunc _read_lock_bh(rwlock_t *lock) __acquires(lock); -void __lockfunc _write_lock_bh(rwlock_t *lock) __acquires(lock); -void __lockfunc _spin_lock_irq(spinlock_t *lock) __acquires(lock); -void __lockfunc _read_lock_irq(rwlock_t *lock) __acquires(lock); -void __lockfunc _write_lock_irq(rwlock_t *lock) __acquires(lock); -unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) - __acquires(lock); -unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) - __acquires(lock); -unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) - __acquires(lock); -unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) - __acquires(lock); -int __lockfunc _spin_trylock(spinlock_t *lock); -int __lockfunc _read_trylock(rwlock_t *lock); -int __lockfunc _write_trylock(rwlock_t *lock); -int __lockfunc _spin_trylock_bh(spinlock_t *lock); -void __lockfunc _spin_unlock(spinlock_t *lock) __releases(lock); -void __lockfunc _read_unlock(rwlock_t *lock) __releases(lock); -void __lockfunc _write_unlock(rwlock_t *lock) __releases(lock); -void __lockfunc _spin_unlock_bh(spinlock_t *lock) __releases(lock); -void __lockfunc _read_unlock_bh(rwlock_t *lock) __releases(lock); -void __lockfunc _write_unlock_bh(rwlock_t *lock) __releases(lock); -void __lockfunc _spin_unlock_irq(spinlock_t *lock) __releases(lock); -void __lockfunc _read_unlock_irq(rwlock_t *lock) __releases(lock); -void __lockfunc _write_unlock_irq(rwlock_t *lock) __releases(lock); -void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) - __releases(lock); -void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) - __releases(lock); -void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) - __releases(lock); +#define ACQUIRE_SPIN __acquires(lock) +#define ACQUIRE_RW __acquires(lock) +#define RELEASE_SPIN 
__releases(lock) +#define RELEASE_RW __releases(lock) + +void __lockfunc __spin_lock(raw_spinlock_t *lock) ACQUIRE_SPIN; +void __lockfunc __spin_lock_nested(raw_spinlock_t *lock, int subclass) + ACQUIRE_SPIN; +void __lockfunc __read_lock(raw_rwlock_t *lock) ACQUIRE_RW; +void __lockfunc __write_lock(raw_rwlock_t *lock) ACQUIRE_RW; +void __lockfunc __spin_lock_bh(raw_spinlock_t *lock) ACQUIRE_SPIN; +void __lockfunc __read_lock_bh(raw_rwlock_t *lock) ACQUIRE_RW; +void __lockfunc __write_lock_bh(raw_rwlock_t *lock) ACQUIRE_RW; +void __lockfunc __spin_lock_irq(raw_spinlock_t *lock) ACQUIRE_SPIN; +void __lockfunc __read_lock_irq(raw_rwlock_t *lock) ACQUIRE_RW; +void __lockfunc __write_lock_irq(raw_rwlock_t *lock) ACQUIRE_RW; +unsigned long __lockfunc __spin_lock_irqsave(raw_spinlock_t *lock) + ACQUIRE_SPIN; +unsigned long __lockfunc +__spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) ACQUIRE_SPIN; +unsigned long __lockfunc __read_lock_irqsave(raw_rwlock_t *lock) + ACQUIRE_RW; +unsigned long __lockfunc __write_lock_irqsave(raw_rwlock_t *lock) + ACQUIRE_RW; +int __lockfunc __spin_trylock(raw_spinlock_t *lock); +int __lockfunc +__spin_trylock_irqsave(raw_spinlock_t *lock, unsigned long *flags); +int __lockfunc __read_trylock(raw_rwlock_t *lock); +int __lockfunc __write_trylock(raw_rwlock_t *lock); +int __lockfunc +__write_trylock_irqsave(raw_rwlock_t *lock, unsigned long *flags); +int __lockfunc __spin_trylock_bh(raw_spinlock_t *lock); +int __lockfunc __spin_trylock_irq(raw_spinlock_t *lock); +void __lockfunc __spin_unlock(raw_spinlock_t *lock) RELEASE_SPIN; +void __lockfunc __spin_unlock_no_resched(raw_spinlock_t *lock) + RELEASE_SPIN; +void __lockfunc __read_unlock(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc __write_unlock(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc __spin_unlock_bh(raw_spinlock_t *lock) RELEASE_SPIN; +void __lockfunc __read_unlock_bh(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc __write_unlock_bh(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc __spin_unlock_irq(raw_spinlock_t *lock) RELEASE_SPIN; +void __lockfunc __read_unlock_irq(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc __write_unlock_irq(raw_rwlock_t *lock) RELEASE_RW; +void __lockfunc +__spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) + RELEASE_SPIN; +void __lockfunc +__read_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) + RELEASE_RW; +void +__lockfunc __write_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) + RELEASE_RW; #endif /* __LINUX_SPINLOCK_API_SMP_H */ Index: linux-2.6.25.8-rt7/include/linux/spinlock_api_up.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/spinlock_api_up.h 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/spinlock_api_up.h 2008-06-23 19:13:27.000000000 -0400 @@ -33,12 +33,22 @@ #define __LOCK_IRQ(lock) \ do { local_irq_disable(); __LOCK(lock); } while (0) -#define __LOCK_IRQSAVE(lock, flags) \ - do { local_irq_save(flags); __LOCK(lock); } while (0) +#define __LOCK_IRQSAVE(lock) \ + ({ unsigned long __flags; local_irq_save(__flags); __LOCK(lock); __flags; }) + +#define __TRYLOCK_IRQSAVE(lock, flags) \ + ({ local_irq_save(*(flags)); __LOCK(lock); 1; }) + +#define __spin_trylock_irqsave(lock, flags) __TRYLOCK_IRQSAVE(lock, flags) + +#define __write_trylock_irqsave(lock, flags) __TRYLOCK_IRQSAVE(lock, flags) #define __UNLOCK(lock) \ do { preempt_enable(); __release(lock); (void)(lock); } while (0) +#define __UNLOCK_NO_RESCHED(lock) \ + do { 
__preempt_enable_no_resched(); __release(lock); (void)(lock); } while (0) + #define __UNLOCK_BH(lock) \ do { preempt_enable_no_resched(); local_bh_enable(); __release(lock); (void)(lock); } while (0) @@ -48,34 +58,36 @@ #define __UNLOCK_IRQRESTORE(lock, flags) \ do { local_irq_restore(flags); __UNLOCK(lock); } while (0) -#define _spin_lock(lock) __LOCK(lock) -#define _spin_lock_nested(lock, subclass) __LOCK(lock) -#define _read_lock(lock) __LOCK(lock) -#define _write_lock(lock) __LOCK(lock) -#define _spin_lock_bh(lock) __LOCK_BH(lock) -#define _read_lock_bh(lock) __LOCK_BH(lock) -#define _write_lock_bh(lock) __LOCK_BH(lock) -#define _spin_lock_irq(lock) __LOCK_IRQ(lock) -#define _read_lock_irq(lock) __LOCK_IRQ(lock) -#define _write_lock_irq(lock) __LOCK_IRQ(lock) -#define _spin_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) -#define _read_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) -#define _write_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) -#define _spin_trylock(lock) ({ __LOCK(lock); 1; }) -#define _read_trylock(lock) ({ __LOCK(lock); 1; }) -#define _write_trylock(lock) ({ __LOCK(lock); 1; }) -#define _spin_trylock_bh(lock) ({ __LOCK_BH(lock); 1; }) -#define _spin_unlock(lock) __UNLOCK(lock) -#define _read_unlock(lock) __UNLOCK(lock) -#define _write_unlock(lock) __UNLOCK(lock) -#define _spin_unlock_bh(lock) __UNLOCK_BH(lock) -#define _write_unlock_bh(lock) __UNLOCK_BH(lock) -#define _read_unlock_bh(lock) __UNLOCK_BH(lock) -#define _spin_unlock_irq(lock) __UNLOCK_IRQ(lock) -#define _read_unlock_irq(lock) __UNLOCK_IRQ(lock) -#define _write_unlock_irq(lock) __UNLOCK_IRQ(lock) -#define _spin_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) -#define _read_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) -#define _write_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) +#define __spin_lock(lock) __LOCK(lock) +#define __spin_lock_nested(lock, subclass) __LOCK(lock) +#define __read_lock(lock) __LOCK(lock) +#define __write_lock(lock) __LOCK(lock) +#define __spin_lock_bh(lock) __LOCK_BH(lock) +#define __read_lock_bh(lock) __LOCK_BH(lock) +#define __write_lock_bh(lock) __LOCK_BH(lock) +#define __spin_lock_irq(lock) __LOCK_IRQ(lock) +#define __read_lock_irq(lock) __LOCK_IRQ(lock) +#define __write_lock_irq(lock) __LOCK_IRQ(lock) +#define __spin_lock_irqsave(lock) __LOCK_IRQSAVE(lock) +#define __read_lock_irqsave(lock) __LOCK_IRQSAVE(lock) +#define __write_lock_irqsave(lock) __LOCK_IRQSAVE(lock) +#define __spin_trylock(lock) ({ __LOCK(lock); 1; }) +#define __read_trylock(lock) ({ __LOCK(lock); 1; }) +#define __write_trylock(lock) ({ __LOCK(lock); 1; }) +#define __spin_trylock_bh(lock) ({ __LOCK_BH(lock); 1; }) +#define __spin_trylock_irq(lock) ({ __LOCK_IRQ(lock); 1; }) +#define __spin_unlock(lock) __UNLOCK(lock) +#define __spin_unlock_no_resched(lock) __UNLOCK_NO_RESCHED(lock) +#define __read_unlock(lock) __UNLOCK(lock) +#define __write_unlock(lock) __UNLOCK(lock) +#define __spin_unlock_bh(lock) __UNLOCK_BH(lock) +#define __write_unlock_bh(lock) __UNLOCK_BH(lock) +#define __read_unlock_bh(lock) __UNLOCK_BH(lock) +#define __spin_unlock_irq(lock) __UNLOCK_IRQ(lock) +#define __read_unlock_irq(lock) __UNLOCK_IRQ(lock) +#define __write_unlock_irq(lock) __UNLOCK_IRQ(lock) +#define __spin_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) +#define __read_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) +#define __write_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) #endif /* 
__LINUX_SPINLOCK_API_UP_H */ Index: linux-2.6.25.8-rt7/include/linux/spinlock_types.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/spinlock_types.h 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/spinlock_types.h 2008-06-23 19:13:27.000000000 -0400 @@ -15,10 +15,27 @@ # include #endif +/* + * Must define these before including other files, inline functions need them + */ +#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME + +#define LOCK_SECTION_START(extra) \ + ".subsection 1\n\t" \ + extra \ + ".ifndef " LOCK_SECTION_NAME "\n\t" \ + LOCK_SECTION_NAME ":\n\t" \ + ".endif\n" + +#define LOCK_SECTION_END \ + ".previous\n\t" + +#define __lockfunc __attribute__((section(".spinlock.text"))) + #include typedef struct { - raw_spinlock_t raw_lock; + __raw_spinlock_t raw_lock; #ifdef CONFIG_GENERIC_LOCKBREAK unsigned int break_lock; #endif @@ -29,12 +46,12 @@ typedef struct { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif -} spinlock_t; +} raw_spinlock_t; #define SPINLOCK_MAGIC 0xdead4ead typedef struct { - raw_rwlock_t raw_lock; + __raw_rwlock_t raw_lock; #ifdef CONFIG_GENERIC_LOCKBREAK unsigned int break_lock; #endif @@ -45,7 +62,7 @@ typedef struct { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif -} rwlock_t; +} raw_rwlock_t; #define RWLOCK_MAGIC 0xdeaf1eed @@ -64,24 +81,24 @@ typedef struct { #endif #ifdef CONFIG_DEBUG_SPINLOCK -# define __SPIN_LOCK_UNLOCKED(lockname) \ - (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ +# define _RAW_SPIN_LOCK_UNLOCKED(lockname) \ + { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ .magic = SPINLOCK_MAGIC, \ .owner = SPINLOCK_OWNER_INIT, \ .owner_cpu = -1, \ SPIN_DEP_MAP_INIT(lockname) } -#define __RW_LOCK_UNLOCKED(lockname) \ - (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ +#define _RAW_RW_LOCK_UNLOCKED(lockname) \ + { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ .magic = RWLOCK_MAGIC, \ .owner = SPINLOCK_OWNER_INIT, \ .owner_cpu = -1, \ RW_DEP_MAP_INIT(lockname) } #else -# define __SPIN_LOCK_UNLOCKED(lockname) \ - (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ +# define _RAW_SPIN_LOCK_UNLOCKED(lockname) \ + { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ SPIN_DEP_MAP_INIT(lockname) } -#define __RW_LOCK_UNLOCKED(lockname) \ - (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ +# define _RAW_RW_LOCK_UNLOCKED(lockname) \ + { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ RW_DEP_MAP_INIT(lockname) } #endif @@ -91,10 +108,22 @@ typedef struct { * Please use DEFINE_SPINLOCK()/DEFINE_RWLOCK() or * __SPIN_LOCK_UNLOCKED()/__RW_LOCK_UNLOCKED() as appropriate. 
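For illustration, not part of the patch: together with the rt_lock.h hunk earlier, the type split works out as follows for callers (invented names). The spin_lock* API keeps its spelling; the declared lock type decides whether a critical section spins with interrupts off or sleeps on an rt_mutex.

/* Illustrative usage only; assumes <linux/spinlock.h>. */
static DEFINE_RAW_SPINLOCK(hw_lock);	/* always a true, IRQ-disabling spinlock   */
static DEFINE_SPINLOCK(data_lock);	/* under PREEMPT_RT: an rt_mutex, may sleep */

static void poke_device(void)
{
	unsigned long flags;

	/* Dispatched to __spin_lock_irqsave(): interrupts really are off here. */
	spin_lock_irqsave(&hw_lock, flags);
	/* ... program a device register ... */
	spin_unlock_irqrestore(&hw_lock, flags);
}

static void update_stats(void)
{
	/* Dispatched to the rt_mutex based lock: this section is preemptible. */
	spin_lock(&data_lock);
	/* ... touch ordinary shared data ... */
	spin_unlock(&data_lock);
}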
*/ -#define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(old_style_spin_init) -#define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(old_style_rw_init) -#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) -#define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x) +# define RAW_SPIN_LOCK_UNLOCKED(lockname) \ + (raw_spinlock_t) _RAW_SPIN_LOCK_UNLOCKED(lockname) + +# define RAW_RW_LOCK_UNLOCKED(lockname) \ + (raw_rwlock_t) _RAW_RW_LOCK_UNLOCKED(lockname) + +#define DEFINE_RAW_SPINLOCK(name) \ + raw_spinlock_t name __cacheline_aligned_in_smp = \ + RAW_SPIN_LOCK_UNLOCKED(name) + +#define __DEFINE_RAW_SPINLOCK(name) \ + raw_spinlock_t name = RAW_SPIN_LOCK_UNLOCKED(name) + +#define DEFINE_RAW_RWLOCK(name) \ + raw_rwlock_t name __cacheline_aligned_in_smp = \ + RAW_RW_LOCK_UNLOCKED(name) #endif /* __LINUX_SPINLOCK_TYPES_H */ Index: linux-2.6.25.8-rt7/include/linux/spinlock_types_up.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/spinlock_types_up.h 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/spinlock_types_up.h 2008-06-23 19:13:27.000000000 -0400 @@ -16,13 +16,13 @@ typedef struct { volatile unsigned int slock; -} raw_spinlock_t; +} __raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { 1 } #else -typedef struct { } raw_spinlock_t; +typedef struct { } __raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { } @@ -30,7 +30,7 @@ typedef struct { } raw_spinlock_t; typedef struct { /* no debug version on UP */ -} raw_rwlock_t; +} __raw_rwlock_t; #define __RAW_RW_LOCK_UNLOCKED { } Index: linux-2.6.25.8-rt7/include/linux/spinlock_up.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/linux/spinlock_up.h 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/include/linux/spinlock_up.h 2008-06-23 19:13:27.000000000 -0400 @@ -20,19 +20,19 @@ #ifdef CONFIG_DEBUG_SPINLOCK #define __raw_spin_is_locked(x) ((x)->slock == 0) -static inline void __raw_spin_lock(raw_spinlock_t *lock) +static inline void __raw_spin_lock(__raw_spinlock_t *lock) { lock->slock = 0; } static inline void -__raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) +__raw_spin_lock_flags(__raw_spinlock_t *lock, unsigned long flags) { local_irq_save(flags); lock->slock = 0; } -static inline int __raw_spin_trylock(raw_spinlock_t *lock) +static inline int __raw_spin_trylock(__raw_spinlock_t *lock) { char oldval = lock->slock; @@ -41,7 +41,7 @@ static inline int __raw_spin_trylock(raw return oldval > 0; } -static inline void __raw_spin_unlock(raw_spinlock_t *lock) +static inline void __raw_spin_unlock(__raw_spinlock_t *lock) { lock->slock = 1; } Index: linux-2.6.25.8-rt7/kernel/futex.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/futex.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/futex.c 2008-06-23 19:14:39.000000000 -0400 @@ -129,12 +129,14 @@ static struct futex_hash_bucket futex_qu /* Futex-fs vfsmount entry: */ static struct vfsmount *futex_mnt; +int futex_performance_hack; + /* * Take mm->mmap_sem, when futex is shared */ static inline void futex_lock_mm(struct rw_semaphore *fshared) { - if (fshared) + if (fshared && !futex_performance_hack) down_read(fshared); } @@ -143,7 +145,7 @@ static inline void futex_lock_mm(struct */ static inline void futex_unlock_mm(struct rw_semaphore *fshared) { - if (fshared) + if (fshared && !futex_performance_hack) up_read(fshared); } @@ -961,8 +963,12 @@ static int 
futex_requeue(u32 __user *uad plist_add(&this->list, &hb2->chain); this->lock_ptr = &hb2->lock; #ifdef CONFIG_DEBUG_PI_LIST +#ifdef CONFIG_PREEMPT_RT + this->list.plist.lock = NULL; +#else this->list.plist.lock = &hb2->lock; #endif +#endif } this->key = key2; get_futex_key_refs(&key2); @@ -1022,8 +1028,12 @@ static inline void __queue_me(struct fut plist_node_init(&q->list, prio); #ifdef CONFIG_DEBUG_PI_LIST +#ifdef CONFIG_PREEMPT_RT + q->list.plist.lock = NULL; +#else q->list.plist.lock = &hb->lock; #endif +#endif plist_add(&q->list, &hb->chain); q->task = current; spin_unlock(&hb->lock); @@ -1118,21 +1128,64 @@ static void unqueue_me_pi(struct futex_q * private futexes. */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner) + struct task_struct *newowner, + struct rw_semaphore *fshared) { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; + struct task_struct *oldowner = pi_state->owner; u32 uval, curval, newval; - int ret; + int ret, attempt = 0; /* Owner died? */ + if (!pi_state->owner) + newtid |= FUTEX_OWNER_DIED; + + /* + * We are here either because we stole the rtmutex from the + * pending owner or we are the pending owner which failed to + * get the rtmutex. We have to replace the pending owner TID + * in the user space variable. This must be atomic as we have + * preserve the owner died bit here. + * + * Note: We write the user space value _before_ changing the + * pi_state because we can fault here. Imagine swapped out + * pages or a fork, which was running right before we acquired + * mmap_sem, that marked all the anonymous memory readonly for + * cow. + * + * Modifying pi_state _before_ the user space value would + * leave the pi_state in an inconsistent state when we fault + * here, because we need to drop the hash bucket lock to + * handle the fault. This might be observed in the PID check + * in lookup_pi_state. + */ +retry: + if (get_futex_value_locked(&uval, uaddr)) + goto handle_fault; + + while (1) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + + if (curval == -EFAULT) + goto handle_fault; + if (curval == uval) + break; + uval = curval; + } + + /* + * We fixed up user space. Now we need to fix the pi_state + * itself. + */ if (pi_state->owner != NULL) { spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); spin_unlock_irq(&pi_state->owner->pi_lock); - } else - newtid |= FUTEX_OWNER_DIED; + } pi_state->owner = newowner; @@ -1140,26 +1193,35 @@ static int fixup_pi_state_owner(u32 __us WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &newowner->pi_state_list); spin_unlock_irq(&newowner->pi_lock); + return 0; /* - * We own it, so we have to replace the pending owner - * TID. This must be atomic as we have preserve the - * owner died bit here. + * To handle the page fault we need to drop the hash bucket + * lock here. That gives the other task (either the pending + * owner itself or the task which stole the rtmutex) the + * chance to try the fixup of the pi_state. So once we are + * back from handling the fault we need to check the pi_state + * after reacquiring the hash bucket lock and before trying to + * do another fixup. When the fixup has been done already we + * simply return. 
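For illustration, not part of the patch: the user-space update loop added above is the delicate part of fixup_pi_state_owner(). The miniature below restates just that loop with C11 atomics standing in for cmpxchg_futex_value_locked() (which, unlike this, can also fault): keep FUTEX_OWNER_DIED, install the new TID (already carrying FUTEX_WAITERS), and retry if the futex word changed underneath us.

#include <stdatomic.h>
#include <stdint.h>

#define FUTEX_OWNER_DIED	0x40000000
#define FUTEX_WAITERS		0x80000000

/* newtid is expected to already include FUTEX_WAITERS, as in the kernel code. */
static void fixup_owner_tid(_Atomic uint32_t *uaddr, uint32_t newtid)
{
	uint32_t uval = atomic_load(uaddr);
	uint32_t newval;

	do {
		/* Preserve the owner-died bit, replace everything else. */
		newval = (uval & FUTEX_OWNER_DIED) | newtid;
		/* On failure, uval is refreshed with the current value. */
	} while (!atomic_compare_exchange_weak(uaddr, &uval, newval));
}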
*/ - ret = get_futex_value_locked(&uval, uaddr); +handle_fault: + spin_unlock(q->lock_ptr); - while (!ret) { - newval = (uval & FUTEX_OWNER_DIED) | newtid; + ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + spin_lock(q->lock_ptr); - if (curval == -EFAULT) - ret = -EFAULT; - if (curval == uval) - break; - uval = curval; - } - return ret; + /* + * Check if someone else fixed it for us: + */ + if (pi_state->owner != oldowner) + return 0; + + if (ret) + return ret; + + goto retry; } /* @@ -1263,6 +1325,10 @@ static int futex_wait(u32 __user *uaddr, * q.lock_ptr != 0 is not safe, because of ordering against wakeup. */ if (likely(!plist_node_empty(&q.list))) { + unsigned long nosched_flag = current->flags & PF_NOSCHED; + + current->flags &= ~PF_NOSCHED; + if (!abs_time) schedule(); else { @@ -1287,6 +1353,8 @@ static int futex_wait(u32 __user *uaddr, /* Flag if a timeout occured */ rem = (t.task == NULL); } + + current->flags |= nosched_flag; } __set_current_state(TASK_RUNNING); @@ -1524,7 +1592,7 @@ static int futex_lock_pi(u32 __user *uad * that case: */ if (q.pi_state->owner != curr) - ret = fixup_pi_state_owner(uaddr, &q, curr); + ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); } else { /* * Catch the rare case, where the lock was released @@ -1556,7 +1624,8 @@ static int futex_lock_pi(u32 __user *uad int res; owner = rt_mutex_owner(&q.pi_state->pi_mutex); - res = fixup_pi_state_owner(uaddr, &q, owner); + res = fixup_pi_state_owner(uaddr, &q, owner, + fshared); /* propagate -EFAULT, if the fixup failed */ if (res) @@ -2178,7 +2247,11 @@ static int __init futex_init(void) futex_cmpxchg_enabled = 1; for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { +#ifdef CONFIG_PREEMPT_RT + plist_head_init(&futex_queues[i].chain, NULL); +#else plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); +#endif spin_lock_init(&futex_queues[i].lock); } Index: linux-2.6.25.8-rt7/kernel/rt.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/kernel/rt.c 2008-06-23 19:14:27.000000000 -0400 @@ -0,0 +1,479 @@ +/* + * kernel/rt.c + * + * Real-Time Preemption Support + * + * started by Ingo Molnar: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * historic credit for proving that Linux spinlocks can be implemented via + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow + * and others) who prototyped it on 2.4 and did lots of comparative + * research and analysis; TimeSys, for proving that you can implement a + * fully preemptible kernel via the use of IRQ threading and mutexes; + * Bill Huey for persuasively arguing on lkml that the mutex model is the + * right one; and to MontaVista, who ported pmutexes to 2.6. + * + * This code is a from-scratch implementation and is not based on pmutexes, + * but the idea of converting spinlocks to mutexes is used here too. + * + * lock debugging, locking tree, deadlock detection: + * + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Released under the General Public License (GPL). + * + * Includes portions of the generic R/W semaphore implementation from: + * + * Copyright (c) 2001 David Howells (dhowells@redhat.com). 
+ * - Derived partially from idea by Andrea Arcangeli + * - Derived also from comments by Linus + * + * Pending ownership of locks and ownership stealing: + * + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt + * + * (also by Steven Rostedt) + * - Converted single pi_lock to individual task locks. + * + * By Esben Nielsen: + * Doing priority inheritance with help of the scheduler. + * + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * - major rework based on Esben Nielsens initial patch + * - replaced thread_info references by task_struct refs + * - removed task->pending_owner dependency + * - BKL drop/reacquire for semaphore style locks to avoid deadlocks + * in the scheduler return path as discussed with Steven Rostedt + * + * Copyright (C) 2006, Kihon Technologies Inc. + * Steven Rostedt + * - debugged and patched Thomas Gleixner's rework. + * - added back the cmpxchg to the rework. + * - turned atomic require back on for SMP. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rtmutex_common.h" + +#ifdef CONFIG_PREEMPT_RT +/* + * Unlock these on crash: + */ +void zap_rt_locks(void) +{ + //trace_lock_init(); +} +#endif + +/* + * struct mutex functions + */ +void _mutex_init(struct mutex *lock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + __rt_mutex_init(&lock->lock, name); +} +EXPORT_SYMBOL(_mutex_init); + +void __lockfunc _mutex_lock(struct mutex *lock) +{ + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED_RT(lock, rt_mutex_trylock, rt_mutex_lock); +} +EXPORT_SYMBOL(_mutex_lock); + +static int __lockfunc __rt_mutex_lock_interruptible(struct rt_mutex *lock) +{ + return rt_mutex_lock_interruptible(lock, 0); +} + +int __lockfunc _mutex_lock_interruptible(struct mutex *lock) +{ + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = LOCK_CONTENDED_RT_RET(lock, rt_mutex_trylock, + __rt_mutex_lock_interruptible); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible); + +int __lockfunc _mutex_lock_killable(struct mutex *lock) +{ + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_lock_killable(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_killable); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass) +{ + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED_RT(lock, rt_mutex_trylock, rt_mutex_lock); +} +EXPORT_SYMBOL(_mutex_lock_nested); + +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass) +{ + int ret; + + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + ret = LOCK_CONTENDED_RT_RET(lock, rt_mutex_trylock, + __rt_mutex_lock_interruptible); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible_nested); + +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass) +{ + int ret; + + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + ret = rt_mutex_lock_killable(&lock->lock, 0); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_killable_nested); +#endif + 
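For illustration, not part of the patch: _mutex_lock() and friends above go through LOCK_CONTENDED_RT()/LOCK_CONTENDED_RT_RET(), whose definitions are not in this hunk. Ignoring the lockdep/lockstat hooks they are assumed to hide, the pattern is simply "try the fast path, otherwise block on the embedded rt_mutex"; a minimal reading under that assumption:

/* Assumed shape only; not the real -rt lockdep macro. */
#define LOCK_CONTENDED_RT_SKETCH(m, try_fn, lock_fn)			\
do {									\
	if (!try_fn(&(m)->lock))	/* uncontended fast path */	\
		lock_fn(&(m)->lock);	/* contended: block on the rt_mutex */ \
} while (0)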
+int __lockfunc _mutex_trylock(struct mutex *lock) +{ + int ret = rt_mutex_trylock(&lock->lock); + + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(_mutex_trylock); + +void __lockfunc _mutex_unlock(struct mutex *lock) +{ + mutex_release(&lock->dep_map, 1, _RET_IP_); + rt_mutex_unlock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_unlock); + +/* + * rwlock_t functions + */ +int __lockfunc rt_write_trylock(rwlock_t *rwlock) +{ + int ret = rt_mutex_down_write_trylock(&rwlock->owners); + + if (ret) + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_write_trylock); + +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags) +{ + *flags = 0; + return rt_write_trylock(rwlock); +} +EXPORT_SYMBOL(rt_write_trylock_irqsave); + +int __lockfunc rt_read_trylock(rwlock_t *rwlock) +{ + int ret; + + ret = rt_mutex_down_read_trylock(&rwlock->owners); + if (ret) + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(rt_read_trylock); + +void __lockfunc rt_write_lock(rwlock_t *rwlock) +{ + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED_RT_RW(rwlock, rt_mutex_down_write_trylock, rt_rwlock_write_lock); +} +EXPORT_SYMBOL(rt_write_lock); + +void __lockfunc rt_read_lock(rwlock_t *rwlock) +{ + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED_RT_RW(rwlock, rt_mutex_down_read_trylock, rt_rwlock_read_lock); +} + +EXPORT_SYMBOL(rt_read_lock); + +void __lockfunc rt_write_unlock(rwlock_t *rwlock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); + rt_rwlock_write_unlock(&rwlock->owners); +} +EXPORT_SYMBOL(rt_write_unlock); + +void __lockfunc rt_read_unlock(rwlock_t *rwlock) +{ + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); + rt_rwlock_read_unlock(&rwlock->owners); +} +EXPORT_SYMBOL(rt_read_unlock); + +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock) +{ + rt_write_lock(rwlock); + + return 0; +} +EXPORT_SYMBOL(rt_write_lock_irqsave); + +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock) +{ + rt_read_lock(rwlock); + + return 0; +} +EXPORT_SYMBOL(rt_read_lock_irqsave); + +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock)); + lockdep_init_map(&rwlock->dep_map, name, key, 0); +#endif + rt_mutex_rwsem_init(&rwlock->owners, name); +} +EXPORT_SYMBOL(__rt_rwlock_init); + +/* + * rw_semaphores + */ + +void rt_up_write(struct rw_semaphore *rwsem) +{ + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + rt_mutex_up_write(&rwsem->owners); +} +EXPORT_SYMBOL(rt_up_write); + +void rt_up_read(struct rw_semaphore *rwsem) +{ + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + rt_mutex_up_read(&rwsem->owners); +} +EXPORT_SYMBOL(rt_up_read); + +/* + * downgrade a write lock into a read lock + */ +void rt_downgrade_write(struct rw_semaphore *rwsem) +{ + rt_mutex_downgrade_write(&rwsem->owners); +} +EXPORT_SYMBOL(rt_downgrade_write); + +int rt_down_write_trylock(struct rw_semaphore *rwsem) +{ + int ret = rt_mutex_down_write_trylock(&rwsem->owners); + + if (ret) + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(rt_down_write_trylock); + +static void __rt_down_write(struct rw_semaphore *rwsem, int subclass) +{ + rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_); + 
LOCK_CONTENDED_RT_RW(rwsem, rt_mutex_down_write_trylock, rt_mutex_down_write); +} + +void rt_down_write(struct rw_semaphore *rwsem) +{ + __rt_down_write(rwsem, 0); +} +EXPORT_SYMBOL(rt_down_write); + +void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass) +{ + __rt_down_write(rwsem, subclass); +} +EXPORT_SYMBOL(rt_down_write_nested); + +int rt_down_read_trylock(struct rw_semaphore *rwsem) +{ + int ret; + + ret = rt_mutex_down_read_trylock(&rwsem->owners); + if (ret) + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(rt_down_read_trylock); + +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass) +{ + rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED_RT_RW(rwsem, rt_mutex_down_read_trylock, rt_mutex_down_read); +} + +void rt_down_read(struct rw_semaphore *rwsem) +{ + __rt_down_read(rwsem, 0); +} +EXPORT_SYMBOL(rt_down_read); + +void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass) +{ + __rt_down_read(rwsem, subclass); +} +EXPORT_SYMBOL(rt_down_read_nested); + +void __rt_rwsem_init(struct rw_semaphore *rwsem, char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem)); + lockdep_init_map(&rwsem->dep_map, name, key, 0); +#endif + rt_mutex_rwsem_init(&rwsem->owners, name); +} +EXPORT_SYMBOL(__rt_rwsem_init); + +/* + * Semaphores + */ +/* + * Linux Semaphores implemented via RT-mutexes. + * + * In the down() variants we use the mutex as the semaphore blocking + * object: we always acquire it, decrease the counter and keep the lock + * locked if we did the 1->0 transition. The next down() will then block. + * + * In the up() path we atomically increase the counter and do the + * unlock if we were the one doing the 0->1 transition. + */ + +static inline void __down_complete(struct semaphore *sem) +{ + int count = atomic_dec_return(&sem->count); + + if (unlikely(count > 0)) + rt_mutex_unlock(&sem->lock); +} + +void rt_down(struct semaphore *sem) +{ + rt_mutex_lock(&sem->lock); + __down_complete(sem); +} +EXPORT_SYMBOL(rt_down); + +int rt_down_interruptible(struct semaphore *sem) +{ + int ret; + + ret = rt_mutex_lock_interruptible(&sem->lock, 0); + if (ret) + return ret; + __down_complete(sem); + return 0; +} +EXPORT_SYMBOL(rt_down_interruptible); + +/* + * try to down the semaphore, 0 on success and 1 on failure. (inverted) + */ +int rt_down_trylock(struct semaphore *sem) +{ + /* + * Here we are a tiny bit different from ordinary Linux semaphores, + * because we can get 'transient' locking-failures when say a + * process decreases the count from 9 to 8 and locks/releases the + * embedded mutex internally. 
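For illustration, not part of the patch: the down()/up() counting scheme described in the comment above is easiest to follow with concrete transitions. The userspace mock-up below (a pthread mutex stands in for the rt_mutex, names are invented, and the preempt_disable() detail of rt_up() is left out) reproduces just the 1->0 "keep the mutex locked" and 0->1 "release it" logic of __down_complete() and rt_up():

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct mock_sem {
	atomic_int count;
	pthread_mutex_t lock;
};

static void mock_down(struct mock_sem *s)
{
	pthread_mutex_lock(&s->lock);		/* always acquire the mutex   */
	if (atomic_fetch_sub(&s->count, 1) - 1 > 0)
		pthread_mutex_unlock(&s->lock);	/* not the 1->0 transition    */
	/* else keep the mutex locked: the next down() will block on it     */
}

static void mock_up(struct mock_sem *s)
{
	if (atomic_fetch_add(&s->count, 1) + 1 == 1)
		pthread_mutex_unlock(&s->lock);	/* we did the 0->1 transition */
}

int main(void)
{
	struct mock_sem s = { .count = 2, .lock = PTHREAD_MUTEX_INITIALIZER };

	mock_down(&s);	/* count 2 -> 1: mutex released again */
	mock_down(&s);	/* count 1 -> 0: mutex stays held     */
	mock_up(&s);	/* count 0 -> 1: mutex released       */
	mock_down(&s);	/* count 1 -> 0: mutex stays held     */
	printf("final count = %d\n", atomic_load(&s.count));
	return 0;
}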
It would be quite complex to remove + * these transient failures so lets try it the simple way first: + */ + if (rt_mutex_trylock(&sem->lock)) { + __down_complete(sem); + return 0; + } + return 1; +} +EXPORT_SYMBOL(rt_down_trylock); + +void rt_up(struct semaphore *sem) +{ + int count; + + /* + * Disable preemption to make sure a highprio trylock-er cannot + * preempt us here and get into an infinite loop: + */ + preempt_disable(); + count = atomic_inc_return(&sem->count); + /* + * If we did the 0 -> 1 transition then we are the ones to unlock it: + */ + if (likely(count == 1)) + rt_mutex_unlock(&sem->lock); + preempt_enable(); +} +EXPORT_SYMBOL(rt_up); + +void __sema_init(struct semaphore *sem, int val, + char *name, char *file, int line) +{ + atomic_set(&sem->count, val); + switch (val) { + case 0: + __rt_mutex_init(&sem->lock, name); + rt_mutex_lock(&sem->lock); + break; + default: + __rt_mutex_init(&sem->lock, name); + break; + } +} +EXPORT_SYMBOL(__sema_init); + +void __init_MUTEX(struct semaphore *sem, char *name, char *file, + int line) +{ + __sema_init(sem, 1, name, file, line); +} +EXPORT_SYMBOL(__init_MUTEX); + Index: linux-2.6.25.8-rt7/kernel/rtmutex-debug.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/rtmutex-debug.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/rtmutex-debug.c 2008-06-23 19:14:38.000000000 -0400 @@ -16,6 +16,7 @@ * * See rt.c in preempt-rt for proper credits and further information */ +#include #include #include #include @@ -29,61 +30,6 @@ #include "rtmutex_common.h" -# define TRACE_WARN_ON(x) WARN_ON(x) -# define TRACE_BUG_ON(x) BUG_ON(x) - -# define TRACE_OFF() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - if (spin_is_locked(¤t->pi_lock)) \ - spin_unlock(¤t->pi_lock); \ - } \ -} while (0) - -# define TRACE_OFF_NOLOCK() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - } \ -} while (0) - -# define TRACE_BUG_LOCKED() \ -do { \ - TRACE_OFF(); \ - BUG(); \ -} while (0) - -# define TRACE_WARN_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) { \ - TRACE_OFF(); \ - WARN_ON(1); \ - } \ -} while (0) - -# define TRACE_BUG_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) \ - TRACE_BUG_LOCKED(); \ -} while (0) - -#ifdef CONFIG_SMP -# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) -#else -# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) -#endif - -/* - * deadlock detection flag. 
We turn it off when we detect - * the first problem because we dont want to recurse back - * into the tracing code when doing error printk or - * executing a BUG(): - */ -static int rt_trace_on = 1; - static void printk_task(struct task_struct *p) { if (p) @@ -111,8 +57,11 @@ static void printk_lock(struct rt_mutex void rt_mutex_debug_task_free(struct task_struct *task) { - WARN_ON(!plist_head_empty(&task->pi_waiters)); - WARN_ON(task->pi_blocked_on); + DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); +#ifdef CONFIG_PREEMPT_RT + WARN_ON(task->reader_lock_count); +#endif } /* @@ -125,7 +74,7 @@ void debug_rt_mutex_deadlock(int detect, { struct task_struct *task; - if (!rt_trace_on || detect || !act_waiter) + if (!debug_locks || detect || !act_waiter) return; task = rt_mutex_owner(act_waiter->lock); @@ -139,7 +88,7 @@ void debug_rt_mutex_print_deadlock(struc { struct task_struct *task; - if (!waiter->deadlock_lock || !rt_trace_on) + if (!waiter->deadlock_lock || !debug_locks) return; rcu_read_lock(); @@ -149,7 +98,8 @@ void debug_rt_mutex_print_deadlock(struc return; } - TRACE_OFF_NOLOCK(); + if (!debug_locks_off()) + return; printk("\n============================================\n"); printk( "[ BUG: circular locking deadlock detected! ]\n"); @@ -180,7 +130,6 @@ void debug_rt_mutex_print_deadlock(struc printk("[ turning off deadlock detection." "Please report this trace. ]\n\n"); - local_irq_disable(); } void debug_rt_mutex_lock(struct rt_mutex *lock) @@ -189,7 +138,8 @@ void debug_rt_mutex_lock(struct rt_mutex void debug_rt_mutex_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); + if (debug_locks) + DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); } void @@ -199,7 +149,7 @@ debug_rt_mutex_proxy_lock(struct rt_mute void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); + DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); } void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) @@ -213,9 +163,9 @@ void debug_rt_mutex_init_waiter(struct r void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { put_pid(waiter->deadlock_task_pid); - TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); - TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - TRACE_WARN_ON(waiter->task); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + DEBUG_LOCKS_WARN_ON(waiter->task); memset(waiter, 0x22, sizeof(*waiter)); } @@ -231,9 +181,36 @@ void debug_rt_mutex_init(struct rt_mutex void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) { +#ifdef CONFIG_DEBUG_PREEMPT + if (task->lock_count >= MAX_LOCK_STACK) { + if (!debug_locks_off()) + return; + printk("BUG: %s/%d: lock count overflow!\n", + task->comm, task->pid); + dump_stack(); + return; + } +#ifdef CONFIG_PREEMPT_RT + task->owned_lock[task->lock_count] = lock; +#endif + task->lock_count++; +#endif } void rt_mutex_deadlock_account_unlock(struct task_struct *task) { +#ifdef CONFIG_DEBUG_PREEMPT + if (!task->lock_count) { + if (!debug_locks_off()) + return; + printk("BUG: %s/%d: lock count underflow!\n", + task->comm, task->pid); + dump_stack(); + return; + } + task->lock_count--; +#ifdef CONFIG_PREEMPT_RT + task->owned_lock[task->lock_count] = NULL; +#endif +#endif } - Index: linux-2.6.25.8-rt7/kernel/rwsem.c =================================================================== --- 
linux-2.6.25.8-rt7.orig/kernel/rwsem.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/rwsem.c 2008-06-23 19:13:27.000000000 -0400 @@ -16,7 +16,7 @@ /* * lock for reading */ -void __sched down_read(struct rw_semaphore *sem) +void __sched compat_down_read(struct compat_rw_semaphore *sem) { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); @@ -24,12 +24,12 @@ void __sched down_read(struct rw_semapho LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } -EXPORT_SYMBOL(down_read); +EXPORT_SYMBOL(compat_down_read); /* * trylock for reading -- returns 1 if successful, 0 if contention */ -int down_read_trylock(struct rw_semaphore *sem) +int compat_down_read_trylock(struct compat_rw_semaphore *sem) { int ret = __down_read_trylock(sem); @@ -38,12 +38,12 @@ int down_read_trylock(struct rw_semaphor return ret; } -EXPORT_SYMBOL(down_read_trylock); +EXPORT_SYMBOL(compat_down_read_trylock); /* * lock for writing */ -void __sched down_write(struct rw_semaphore *sem) +void __sched compat_down_write(struct compat_rw_semaphore *sem) { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); @@ -51,12 +51,12 @@ void __sched down_write(struct rw_semaph LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } -EXPORT_SYMBOL(down_write); +EXPORT_SYMBOL(compat_down_write); /* * trylock for writing -- returns 1 if successful, 0 if contention */ -int down_write_trylock(struct rw_semaphore *sem) +int compat_down_write_trylock(struct compat_rw_semaphore *sem) { int ret = __down_write_trylock(sem); @@ -65,36 +65,36 @@ int down_write_trylock(struct rw_semapho return ret; } -EXPORT_SYMBOL(down_write_trylock); +EXPORT_SYMBOL(compat_down_write_trylock); /* * release a read lock */ -void up_read(struct rw_semaphore *sem) +void compat_up_read(struct compat_rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); __up_read(sem); } -EXPORT_SYMBOL(up_read); +EXPORT_SYMBOL(compat_up_read); /* * release a write lock */ -void up_write(struct rw_semaphore *sem) +void compat_up_write(struct compat_rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); __up_write(sem); } -EXPORT_SYMBOL(up_write); +EXPORT_SYMBOL(compat_up_write); /* * downgrade write lock to read lock */ -void downgrade_write(struct rw_semaphore *sem) +void compat_downgrade_write(struct compat_rw_semaphore *sem) { /* * lockdep: a downgraded write will live on as a write @@ -103,11 +103,11 @@ void downgrade_write(struct rw_semaphore __downgrade_write(sem); } -EXPORT_SYMBOL(downgrade_write); +EXPORT_SYMBOL(compat_downgrade_write); #ifdef CONFIG_DEBUG_LOCK_ALLOC -void down_read_nested(struct rw_semaphore *sem, int subclass) +void compat_down_read_nested(struct compat_rw_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); @@ -115,18 +115,18 @@ void down_read_nested(struct rw_semaphor LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } -EXPORT_SYMBOL(down_read_nested); +EXPORT_SYMBOL(compat_down_read_nested); -void down_read_non_owner(struct rw_semaphore *sem) +void compat_down_read_non_owner(struct compat_rw_semaphore *sem) { might_sleep(); __down_read(sem); } -EXPORT_SYMBOL(down_read_non_owner); +EXPORT_SYMBOL(compat_down_read_non_owner); -void down_write_nested(struct rw_semaphore *sem, int subclass) +void compat_down_write_nested(struct compat_rw_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); @@ -134,14 +134,14 @@ void down_write_nested(struct rw_semapho LOCK_CONTENDED(sem, __down_write_trylock, 
__down_write); } -EXPORT_SYMBOL(down_write_nested); +EXPORT_SYMBOL(compat_down_write_nested); -void up_read_non_owner(struct rw_semaphore *sem) +void compat_up_read_non_owner(struct compat_rw_semaphore *sem) { __up_read(sem); } -EXPORT_SYMBOL(up_read_non_owner); +EXPORT_SYMBOL(compat_up_read_non_owner); #endif Index: linux-2.6.25.8-rt7/kernel/spinlock.c =================================================================== --- linux-2.6.25.8-rt7.orig/kernel/spinlock.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/kernel/spinlock.c 2008-06-23 19:13:27.000000000 -0400 @@ -21,7 +21,7 @@ #include #include -int __lockfunc _spin_trylock(spinlock_t *lock) +int __lockfunc __spin_trylock(raw_spinlock_t *lock) { preempt_disable(); if (_raw_spin_trylock(lock)) { @@ -32,9 +32,46 @@ int __lockfunc _spin_trylock(spinlock_t preempt_enable(); return 0; } -EXPORT_SYMBOL(_spin_trylock); +EXPORT_SYMBOL(__spin_trylock); -int __lockfunc _read_trylock(rwlock_t *lock) +int __lockfunc __spin_trylock_irq(raw_spinlock_t *lock) +{ + local_irq_disable(); + preempt_disable(); + + if (_raw_spin_trylock(lock)) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + return 1; + } + + __preempt_enable_no_resched(); + local_irq_enable(); + preempt_check_resched(); + + return 0; +} +EXPORT_SYMBOL(__spin_trylock_irq); + +int __lockfunc __spin_trylock_irqsave(raw_spinlock_t *lock, + unsigned long *flags) +{ + local_irq_save(*flags); + preempt_disable(); + + if (_raw_spin_trylock(lock)) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + return 1; + } + + __preempt_enable_no_resched(); + local_irq_restore(*flags); + preempt_check_resched(); + + return 0; +} +EXPORT_SYMBOL(__spin_trylock_irqsave); + +int __lockfunc __read_trylock(raw_rwlock_t *lock) { preempt_disable(); if (_raw_read_trylock(lock)) { @@ -45,9 +82,9 @@ int __lockfunc _read_trylock(rwlock_t *l preempt_enable(); return 0; } -EXPORT_SYMBOL(_read_trylock); +EXPORT_SYMBOL(__read_trylock); -int __lockfunc _write_trylock(rwlock_t *lock) +int __lockfunc __write_trylock(raw_rwlock_t *lock) { preempt_disable(); if (_raw_write_trylock(lock)) { @@ -58,7 +95,21 @@ int __lockfunc _write_trylock(rwlock_t * preempt_enable(); return 0; } -EXPORT_SYMBOL(_write_trylock); +EXPORT_SYMBOL(__write_trylock); + +int __lockfunc __write_trylock_irqsave(raw_rwlock_t *lock, unsigned long *flags) +{ + int ret; + + local_irq_save(*flags); + ret = __write_trylock(lock); + if (ret) + return ret; + + local_irq_restore(*flags); + return 0; +} +EXPORT_SYMBOL(__write_trylock_irqsave); /* * If lockdep is enabled then we use the non-preemption spin-ops @@ -67,15 +118,15 @@ EXPORT_SYMBOL(_write_trylock); */ #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) -void __lockfunc _read_lock(rwlock_t *lock) +void __lockfunc __read_lock(raw_rwlock_t *lock) { preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } -EXPORT_SYMBOL(_read_lock); +EXPORT_SYMBOL(__read_lock); -unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +unsigned long __lockfunc __spin_lock_irqsave(raw_spinlock_t *lock) { unsigned long flags; @@ -94,27 +145,27 @@ unsigned long __lockfunc _spin_lock_irqs #endif return flags; } -EXPORT_SYMBOL(_spin_lock_irqsave); +EXPORT_SYMBOL(__spin_lock_irqsave); -void __lockfunc _spin_lock_irq(spinlock_t *lock) +void __lockfunc __spin_lock_irq(raw_spinlock_t *lock) { local_irq_disable(); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_spin_trylock, 
_raw_spin_lock); } -EXPORT_SYMBOL(_spin_lock_irq); +EXPORT_SYMBOL(__spin_lock_irq); -void __lockfunc _spin_lock_bh(spinlock_t *lock) +void __lockfunc __spin_lock_bh(raw_spinlock_t *lock) { local_bh_disable(); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } -EXPORT_SYMBOL(_spin_lock_bh); +EXPORT_SYMBOL(__spin_lock_bh); -unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) +unsigned long __lockfunc __read_lock_irqsave(raw_rwlock_t *lock) { unsigned long flags; @@ -124,27 +175,27 @@ unsigned long __lockfunc _read_lock_irqs LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); return flags; } -EXPORT_SYMBOL(_read_lock_irqsave); +EXPORT_SYMBOL(__read_lock_irqsave); -void __lockfunc _read_lock_irq(rwlock_t *lock) +void __lockfunc __read_lock_irq(raw_rwlock_t *lock) { local_irq_disable(); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } -EXPORT_SYMBOL(_read_lock_irq); +EXPORT_SYMBOL(__read_lock_irq); -void __lockfunc _read_lock_bh(rwlock_t *lock) +void __lockfunc __read_lock_bh(raw_rwlock_t *lock) { local_bh_disable(); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } -EXPORT_SYMBOL(_read_lock_bh); +EXPORT_SYMBOL(__read_lock_bh); -unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) +unsigned long __lockfunc __write_lock_irqsave(raw_rwlock_t *lock) { unsigned long flags; @@ -154,43 +205,43 @@ unsigned long __lockfunc _write_lock_irq LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); return flags; } -EXPORT_SYMBOL(_write_lock_irqsave); +EXPORT_SYMBOL(__write_lock_irqsave); -void __lockfunc _write_lock_irq(rwlock_t *lock) +void __lockfunc __write_lock_irq(raw_rwlock_t *lock) { local_irq_disable(); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } -EXPORT_SYMBOL(_write_lock_irq); +EXPORT_SYMBOL(__write_lock_irq); -void __lockfunc _write_lock_bh(rwlock_t *lock) +void __lockfunc __write_lock_bh(raw_rwlock_t *lock) { local_bh_disable(); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } -EXPORT_SYMBOL(_write_lock_bh); +EXPORT_SYMBOL(__write_lock_bh); -void __lockfunc _spin_lock(spinlock_t *lock) +void __lockfunc __spin_lock(raw_spinlock_t *lock) { preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } -EXPORT_SYMBOL(_spin_lock); +EXPORT_SYMBOL(__spin_lock); -void __lockfunc _write_lock(rwlock_t *lock) +void __lockfunc __write_lock(raw_rwlock_t *lock) { preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } -EXPORT_SYMBOL(_write_lock); +EXPORT_SYMBOL(__write_lock); #else /* CONFIG_PREEMPT: */ @@ -203,7 +254,7 @@ EXPORT_SYMBOL(_write_lock); */ #define BUILD_LOCK_OPS(op, locktype) \ -void __lockfunc _##op##_lock(locktype##_t *lock) \ +void __lockfunc __##op##_lock(locktype##_t *lock) \ { \ for (;;) { \ preempt_disable(); \ @@ -213,15 +264,16 @@ void __lockfunc _##op##_lock(locktype##_ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!op##_can_lock(lock) && (lock)->break_lock) \ - _raw_##op##_relax(&lock->raw_lock); \ + while (!__raw_##op##_can_lock(&(lock)->raw_lock) && \ + (lock)->break_lock) \ + __raw_##op##_relax(&lock->raw_lock); \ } \ 
(lock)->break_lock = 0; \ } \ \ -EXPORT_SYMBOL(_##op##_lock); \ +EXPORT_SYMBOL(__##op##_lock); \ \ -unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ +unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock) \ { \ unsigned long flags; \ \ @@ -235,23 +287,24 @@ unsigned long __lockfunc _##op##_lock_ir \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!op##_can_lock(lock) && (lock)->break_lock) \ - _raw_##op##_relax(&lock->raw_lock); \ + while (!__raw_##op##_can_lock(&(lock)->raw_lock) && \ + (lock)->break_lock) \ + __raw_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ return flags; \ } \ \ -EXPORT_SYMBOL(_##op##_lock_irqsave); \ +EXPORT_SYMBOL(__##op##_lock_irqsave); \ \ -void __lockfunc _##op##_lock_irq(locktype##_t *lock) \ +void __lockfunc __##op##_lock_irq(locktype##_t *lock) \ { \ - _##op##_lock_irqsave(lock); \ + __##op##_lock_irqsave(lock); \ } \ \ -EXPORT_SYMBOL(_##op##_lock_irq); \ +EXPORT_SYMBOL(__##op##_lock_irq); \ \ -void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ +void __lockfunc __##op##_lock_bh(locktype##_t *lock) \ { \ unsigned long flags; \ \ @@ -260,39 +313,40 @@ void __lockfunc _##op##_lock_bh(locktype /* irq-disabling. We use the generic preemption-aware */ \ /* function: */ \ /**/ \ - flags = _##op##_lock_irqsave(lock); \ + flags = __##op##_lock_irqsave(lock); \ local_bh_disable(); \ local_irq_restore(flags); \ } \ \ -EXPORT_SYMBOL(_##op##_lock_bh) +EXPORT_SYMBOL(__##op##_lock_bh) /* * Build preemption-friendly versions of the following * lock-spinning functions: * - * _[spin|read|write]_lock() - * _[spin|read|write]_lock_irq() - * _[spin|read|write]_lock_irqsave() - * _[spin|read|write]_lock_bh() + * __[spin|read|write]_lock() + * __[spin|read|write]_lock_irq() + * __[spin|read|write]_lock_irqsave() + * __[spin|read|write]_lock_bh() */ -BUILD_LOCK_OPS(spin, spinlock); -BUILD_LOCK_OPS(read, rwlock); -BUILD_LOCK_OPS(write, rwlock); +BUILD_LOCK_OPS(spin, raw_spinlock); +BUILD_LOCK_OPS(read, raw_rwlock); +BUILD_LOCK_OPS(write, raw_rwlock); #endif /* CONFIG_PREEMPT */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) +void __lockfunc __spin_lock_nested(raw_spinlock_t *lock, int subclass) { preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } +EXPORT_SYMBOL(__spin_lock_nested); -EXPORT_SYMBOL(_spin_lock_nested); -unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +unsigned long __lockfunc +__spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) { unsigned long flags; @@ -311,117 +365,130 @@ unsigned long __lockfunc _spin_lock_irqs #endif return flags; } - -EXPORT_SYMBOL(_spin_lock_irqsave_nested); +EXPORT_SYMBOL(__spin_lock_irqsave_nested); #endif -void __lockfunc _spin_unlock(spinlock_t *lock) +void __lockfunc __spin_unlock(raw_spinlock_t *lock) { spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); preempt_enable(); } -EXPORT_SYMBOL(_spin_unlock); +EXPORT_SYMBOL(__spin_unlock); + +void __lockfunc __spin_unlock_no_resched(raw_spinlock_t *lock) +{ + spin_release(&lock->dep_map, 1, _RET_IP_); + _raw_spin_unlock(lock); + __preempt_enable_no_resched(); +} +/* not exported */ -void __lockfunc _write_unlock(rwlock_t *lock) +void __lockfunc __write_unlock(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); preempt_enable(); } -EXPORT_SYMBOL(_write_unlock); +EXPORT_SYMBOL(__write_unlock); -void 
__lockfunc _read_unlock(rwlock_t *lock) +void __lockfunc __read_unlock(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); preempt_enable(); } -EXPORT_SYMBOL(_read_unlock); +EXPORT_SYMBOL(__read_unlock); -void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) +void __lockfunc __spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) { spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); + __preempt_enable_no_resched(); local_irq_restore(flags); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_spin_unlock_irqrestore); +EXPORT_SYMBOL(__spin_unlock_irqrestore); -void __lockfunc _spin_unlock_irq(spinlock_t *lock) +void __lockfunc __spin_unlock_irq(raw_spinlock_t *lock) { spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); + __preempt_enable_no_resched(); local_irq_enable(); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_spin_unlock_irq); +EXPORT_SYMBOL(__spin_unlock_irq); -void __lockfunc _spin_unlock_bh(spinlock_t *lock) +void __lockfunc __spin_unlock_bh(raw_spinlock_t *lock) { spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } -EXPORT_SYMBOL(_spin_unlock_bh); +EXPORT_SYMBOL(__spin_unlock_bh); -void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +void __lockfunc __read_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); + __preempt_enable_no_resched(); local_irq_restore(flags); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_read_unlock_irqrestore); +EXPORT_SYMBOL(__read_unlock_irqrestore); -void __lockfunc _read_unlock_irq(rwlock_t *lock) +void __lockfunc __read_unlock_irq(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); + __preempt_enable_no_resched(); local_irq_enable(); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_read_unlock_irq); +EXPORT_SYMBOL(__read_unlock_irq); -void __lockfunc _read_unlock_bh(rwlock_t *lock) +void __lockfunc __read_unlock_bh(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_read_unlock(lock); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } -EXPORT_SYMBOL(_read_unlock_bh); +EXPORT_SYMBOL(__read_unlock_bh); -void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +void __lockfunc __write_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); + __preempt_enable_no_resched(); local_irq_restore(flags); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_write_unlock_irqrestore); +EXPORT_SYMBOL(__write_unlock_irqrestore); -void __lockfunc _write_unlock_irq(rwlock_t *lock) +void __lockfunc __write_unlock_irq(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); + __preempt_enable_no_resched(); local_irq_enable(); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_write_unlock_irq); +EXPORT_SYMBOL(__write_unlock_irq); -void __lockfunc _write_unlock_bh(rwlock_t *lock) +void __lockfunc __write_unlock_bh(raw_rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); _raw_write_unlock(lock); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); 
local_bh_enable_ip((unsigned long)__builtin_return_address(0)); } -EXPORT_SYMBOL(_write_unlock_bh); +EXPORT_SYMBOL(__write_unlock_bh); -int __lockfunc _spin_trylock_bh(spinlock_t *lock) +int __lockfunc __spin_trylock_bh(raw_spinlock_t *lock) { local_bh_disable(); preempt_disable(); @@ -430,11 +497,11 @@ int __lockfunc _spin_trylock_bh(spinlock return 1; } - preempt_enable_no_resched(); + __preempt_enable_no_resched(); local_bh_enable_ip((unsigned long)__builtin_return_address(0)); return 0; } -EXPORT_SYMBOL(_spin_trylock_bh); +EXPORT_SYMBOL(__spin_trylock_bh); int in_lock_functions(unsigned long addr) { @@ -442,6 +509,17 @@ int in_lock_functions(unsigned long addr extern char __lock_text_start[], __lock_text_end[]; return addr >= (unsigned long)__lock_text_start - && addr < (unsigned long)__lock_text_end; + && addr < (unsigned long)__lock_text_end; } EXPORT_SYMBOL(in_lock_functions); + +void notrace __debug_atomic_dec_and_test(atomic_t *v) +{ + static int warn_once = 1; + + if (!atomic_read(v) && warn_once) { + warn_once = 0; + printk("BUG: atomic counter underflow!\n"); + WARN_ON(1); + } +} Index: linux-2.6.25.8-rt7/lib/dec_and_lock.c =================================================================== --- linux-2.6.25.8-rt7.orig/lib/dec_and_lock.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/lib/dec_and_lock.c 2008-06-23 19:13:27.000000000 -0400 @@ -17,7 +17,7 @@ * because the spin-lock and the decrement must be * "atomic". */ -int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +int __atomic_dec_and_spin_lock(raw_spinlock_t *lock, atomic_t *atomic) { #ifdef CONFIG_SMP /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ @@ -32,4 +32,4 @@ int _atomic_dec_and_lock(atomic_t *atomi return 0; } -EXPORT_SYMBOL(_atomic_dec_and_lock); +EXPORT_SYMBOL(__atomic_dec_and_spin_lock); Index: linux-2.6.25.8-rt7/lib/kernel_lock.c =================================================================== --- linux-2.6.25.8-rt7.orig/lib/kernel_lock.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/lib/kernel_lock.c 2008-06-23 19:13:44.000000000 -0400 @@ -23,7 +23,7 @@ * * Don't use in new code. */ -static DECLARE_MUTEX(kernel_sem); +DECLARE_MUTEX(kernel_sem); /* * Re-acquire the kernel semaphore. @@ -34,22 +34,25 @@ static DECLARE_MUTEX(kernel_sem); * about recursion, both due to the down() and due to the enabling of * preemption. schedule() will re-check the preemption flag after * reacquiring the semaphore. + * + * Called with interrupts disabled. 
*/ int __lockfunc __reacquire_kernel_lock(void) { struct task_struct *task = current; int saved_lock_depth = task->lock_depth; + local_irq_enable(); BUG_ON(saved_lock_depth < 0); task->lock_depth = -1; - preempt_enable_no_resched(); down(&kernel_sem); - preempt_disable(); task->lock_depth = saved_lock_depth; + local_irq_disable(); + return 0; } @@ -66,11 +69,15 @@ void __lockfunc lock_kernel(void) struct task_struct *task = current; int depth = task->lock_depth + 1; - if (likely(!depth)) + if (likely(!depth)) { /* * No recursion worries - we set up lock_depth _after_ */ down(&kernel_sem); +#ifdef CONFIG_DEBUG_RT_MUTEXES + current->last_kernel_lock = __builtin_return_address(0); +#endif + } task->lock_depth = depth; } @@ -81,8 +88,12 @@ void __lockfunc unlock_kernel(void) BUG_ON(task->lock_depth < 0); - if (likely(--task->lock_depth < 0)) + if (likely(--task->lock_depth == -1)) { +#ifdef CONFIG_DEBUG_RT_MUTEXES + current->last_kernel_lock = NULL; +#endif up(&kernel_sem); + } } EXPORT_SYMBOL(lock_kernel); Index: linux-2.6.25.8-rt7/lib/locking-selftest.c =================================================================== --- linux-2.6.25.8-rt7.orig/lib/locking-selftest.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/lib/locking-selftest.c 2008-06-23 19:13:44.000000000 -0400 @@ -158,7 +158,7 @@ static void init_shared_classes(void) local_bh_disable(); \ local_irq_disable(); \ trace_softirq_enter(); \ - WARN_ON(!in_softirq()); + /* FIXME: preemptible softirqs. WARN_ON(!in_softirq()); */ #define SOFTIRQ_EXIT() \ trace_softirq_exit(); \ @@ -550,6 +550,11 @@ GENERATE_TESTCASE(init_held_rsem) #undef E /* + * FIXME: turns these into raw-spinlock tests on -rt + */ +#ifndef CONFIG_PREEMPT_RT + +/* * locking an irq-safe lock with irqs enabled: */ #define E1() \ @@ -890,6 +895,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_ #include "locking-selftest-softirq.h" // GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft) +#endif /* !CONFIG_PREEMPT_RT */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC # define I_SPINLOCK(x) lockdep_reset_lock(&lock_##x.dep_map) # define I_RWLOCK(x) lockdep_reset_lock(&rwlock_##x.dep_map) @@ -940,6 +947,9 @@ static void dotest(void (*testcase_fn)(v { unsigned long saved_preempt_count = preempt_count(); int expected_failure = 0; +#if defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_DEBUG_RT_MUTEXES) + int saved_lock_count = current->lock_count; +#endif WARN_ON(irqs_disabled()); @@ -989,6 +999,9 @@ static void dotest(void (*testcase_fn)(v #endif reset_locks(); +#if defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_DEBUG_RT_MUTEXES) + current->lock_count = saved_lock_count; +#endif } static inline void print_testname(const char *testname) @@ -998,7 +1011,7 @@ static inline void print_testname(const #define DO_TESTCASE_1(desc, name, nr) \ print_testname(desc"/"#nr); \ - dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ + dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ printk("\n"); #define DO_TESTCASE_1B(desc, name, nr) \ @@ -1006,17 +1019,17 @@ static inline void print_testname(const dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK); \ printk("\n"); -#define DO_TESTCASE_3(desc, name, nr) \ - print_testname(desc"/"#nr); \ - dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \ - dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ +#define DO_TESTCASE_3(desc, name, nr) \ + print_testname(desc"/"#nr); \ + dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \ + dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ printk("\n"); -#define 
DO_TESTCASE_3RW(desc, name, nr) \ - print_testname(desc"/"#nr); \ +#define DO_TESTCASE_3RW(desc, name, nr) \ + print_testname(desc"/"#nr); \ dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN|LOCKTYPE_RWLOCK);\ - dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ + dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ printk("\n"); @@ -1047,7 +1060,7 @@ static inline void print_testname(const print_testname(desc); \ dotest(name##_spin, FAILURE, LOCKTYPE_SPIN); \ dotest(name##_wlock, FAILURE, LOCKTYPE_RWLOCK); \ - dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \ + dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \ dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \ dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \ dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \ @@ -1179,6 +1192,7 @@ void locking_selftest(void) /* * irq-context testcases: */ +#ifndef CONFIG_PREEMPT_RT DO_TESTCASE_2x6("irqs-on + irq-safe-A", irqsafe1); DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A); DO_TESTCASE_2x6("safe-A + irqs-on", irqsafe2B); @@ -1188,6 +1202,7 @@ void locking_selftest(void) DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion); // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2); +#endif if (unexpected_testcase_failures) { printk("-----------------------------------------------------------------\n"); Index: linux-2.6.25.8-rt7/lib/plist.c =================================================================== --- linux-2.6.25.8-rt7.orig/lib/plist.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/lib/plist.c 2008-06-23 19:14:11.000000000 -0400 @@ -53,7 +53,9 @@ static void plist_check_list(struct list static void plist_check_head(struct plist_head *head) { +#ifndef CONFIG_PREEMPT_RT WARN_ON(!head->lock); +#endif if (head->lock) WARN_ON_SMP(!spin_is_locked(head->lock)); plist_check_list(&head->prio_list); @@ -64,6 +66,30 @@ static void plist_check_head(struct plis # define plist_check_head(h) do { } while (0) #endif +static inline struct plist_node *prev_node(struct plist_node *iter) +{ + return list_entry(iter->plist.node_list.prev, struct plist_node, + plist.node_list); +} + +static inline struct plist_node *next_node(struct plist_node *iter) +{ + return list_entry(iter->plist.node_list.next, struct plist_node, + plist.node_list); +} + +static inline struct plist_node *prev_prio(struct plist_node *iter) +{ + return list_entry(iter->plist.prio_list.prev, struct plist_node, + plist.prio_list); +} + +static inline struct plist_node *next_prio(struct plist_node *iter) +{ + return list_entry(iter->plist.prio_list.next, struct plist_node, + plist.prio_list); +} + /** * plist_add - add @node to @head * @@ -81,8 +107,7 @@ void plist_add(struct plist_node *node, if (node->prio < iter->prio) goto lt_prio; else if (node->prio == iter->prio) { - iter = list_entry(iter->plist.prio_list.next, - struct plist_node, plist.prio_list); + iter = next_prio(iter); goto eq_prio; } } @@ -116,3 +141,44 @@ void plist_del(struct plist_node *node, plist_check_head(head); } + +void plist_head_splice(struct plist_head *src, struct plist_head *dst) +{ + struct plist_node *src_iter_first, *src_iter_last, *dst_iter; + struct plist_node *tail = container_of(dst, struct plist_node, plist); + + dst_iter = next_prio(tail); + + while (!plist_head_empty(src) && dst_iter != tail) { + src_iter_first = plist_first(src); + + src_iter_last = next_prio(src_iter_first); + src_iter_last = prev_node(src_iter_last); + + WARN_ON(src_iter_first->prio != 
src_iter_last->prio); + WARN_ON(list_empty(&src_iter_first->plist.prio_list)); + + while (src_iter_first->prio > dst_iter->prio) { + dst_iter = next_prio(dst_iter); + if (dst_iter == tail) + goto tail; + } + + list_del_init(&src_iter_first->plist.prio_list); + + if (src_iter_first->prio < dst_iter->prio) { + list_add_tail(&src_iter_first->plist.prio_list, + &dst_iter->plist.prio_list); + } else if (src_iter_first->prio == dst_iter->prio) { + dst_iter = next_prio(dst_iter); + } else BUG(); + + list_splice2_tail(&src_iter_first->plist.node_list, + &src_iter_last->plist.node_list, + &dst_iter->plist.node_list); + } + +tail: + list_splice_tail_init(&src->prio_list, &dst->prio_list); + list_splice_tail_init(&src->node_list, &dst->node_list); +} Index: linux-2.6.25.8-rt7/lib/rwsem-spinlock.c =================================================================== --- linux-2.6.25.8-rt7.orig/lib/rwsem-spinlock.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/lib/rwsem-spinlock.c 2008-06-23 19:13:27.000000000 -0400 @@ -20,7 +20,7 @@ struct rwsem_waiter { /* * initialise the semaphore */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, +void __compat_init_rwsem(struct compat_rw_semaphore *sem, const char *name, struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -44,8 +44,8 @@ void __init_rwsem(struct rw_semaphore *s * - woken process blocks are discarded from the list after having task zeroed * - writers are only woken if wakewrite is non-zero */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) +static inline struct compat_rw_semaphore * +__rwsem_do_wake(struct compat_rw_semaphore *sem, int wakewrite) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -103,8 +103,8 @@ __rwsem_do_wake(struct rw_semaphore *sem /* * wake a single writer */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) +static inline struct compat_rw_semaphore * +__rwsem_wake_one_writer(struct compat_rw_semaphore *sem) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -125,7 +125,7 @@ __rwsem_wake_one_writer(struct rw_semaph /* * get a read lock on the semaphore */ -void __sched __down_read(struct rw_semaphore *sem) +void __sched __down_read(struct compat_rw_semaphore *sem) { struct rwsem_waiter waiter; struct task_struct *tsk; @@ -168,7 +168,7 @@ void __sched __down_read(struct rw_semap /* * trylock for reading -- returns 1 if successful, 0 if contention */ -int __down_read_trylock(struct rw_semaphore *sem) +int __down_read_trylock(struct compat_rw_semaphore *sem) { unsigned long flags; int ret = 0; @@ -191,7 +191,8 @@ int __down_read_trylock(struct rw_semaph * get a write lock on the semaphore * - we increment the waiting count anyway to indicate an exclusive lock */ -void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +void __sched +__down_write_nested(struct compat_rw_semaphore *sem, int subclass) { struct rwsem_waiter waiter; struct task_struct *tsk; @@ -231,7 +232,7 @@ void __sched __down_write_nested(struct ; } -void __sched __down_write(struct rw_semaphore *sem) +void __sched __down_write(struct compat_rw_semaphore *sem) { __down_write_nested(sem, 0); } @@ -239,7 +240,7 @@ void __sched __down_write(struct rw_sema /* * trylock for writing -- returns 1 if successful, 0 if contention */ -int __down_write_trylock(struct rw_semaphore *sem) +int __down_write_trylock(struct compat_rw_semaphore *sem) { unsigned long flags; int ret = 0; @@ -260,7 +261,7 @@ int 
__down_write_trylock(struct rw_semap /* * release a read lock on the semaphore */ -void __up_read(struct rw_semaphore *sem) +void __up_read(struct compat_rw_semaphore *sem) { unsigned long flags; @@ -275,7 +276,7 @@ void __up_read(struct rw_semaphore *sem) /* * release a write lock on the semaphore */ -void __up_write(struct rw_semaphore *sem) +void __up_write(struct compat_rw_semaphore *sem) { unsigned long flags; @@ -292,7 +293,7 @@ void __up_write(struct rw_semaphore *sem * downgrade a write lock into a read lock * - just wake up any readers at the front of the queue */ -void __downgrade_write(struct rw_semaphore *sem) +void __downgrade_write(struct compat_rw_semaphore *sem) { unsigned long flags; @@ -305,7 +306,7 @@ void __downgrade_write(struct rw_semapho spin_unlock_irqrestore(&sem->wait_lock, flags); } -EXPORT_SYMBOL(__init_rwsem); +EXPORT_SYMBOL(__compat_init_rwsem); EXPORT_SYMBOL(__down_read); EXPORT_SYMBOL(__down_read_trylock); EXPORT_SYMBOL(__down_write_nested); Index: linux-2.6.25.8-rt7/lib/rwsem.c =================================================================== --- linux-2.6.25.8-rt7.orig/lib/rwsem.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/lib/rwsem.c 2008-06-23 19:13:27.000000000 -0400 @@ -11,8 +11,8 @@ /* * Initialize an rwsem: */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) +void __compat_init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -26,7 +26,7 @@ void __init_rwsem(struct rw_semaphore *s INIT_LIST_HEAD(&sem->wait_list); } -EXPORT_SYMBOL(__init_rwsem); +EXPORT_SYMBOL(__compat_init_rwsem); struct rwsem_waiter { struct list_head list; Index: linux-2.6.25.8-rt7/lib/semaphore-sleepers.c =================================================================== --- linux-2.6.25.8-rt7.orig/lib/semaphore-sleepers.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/lib/semaphore-sleepers.c 2008-06-23 19:13:27.000000000 -0400 @@ -15,6 +15,7 @@ #include #include #include +#include #include /* @@ -48,12 +49,12 @@ * we cannot lose wakeup events. */ -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { wake_up(&sem->wait); } -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -90,7 +91,7 @@ void __sched __down(struct semaphore *se tsk->state = TASK_RUNNING; } -int __sched __down_interruptible(struct semaphore *sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -153,7 +154,7 @@ int __sched __down_interruptible(struct * single "cmpxchg" without failure cases, * but then it wouldn't work on a 386. 
*/ -int __down_trylock(struct semaphore *sem) +int __compat_down_trylock(struct compat_semaphore * sem) { int sleepers; unsigned long flags; @@ -174,3 +175,10 @@ int __down_trylock(struct semaphore *sem spin_unlock_irqrestore(&sem->wait.lock, flags); return 1; } + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} + +EXPORT_SYMBOL(compat_sem_is_locked); Index: linux-2.6.25.8-rt7/lib/spinlock_debug.c =================================================================== --- linux-2.6.25.8-rt7.orig/lib/spinlock_debug.c 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/lib/spinlock_debug.c 2008-06-23 19:13:27.000000000 -0400 @@ -13,8 +13,8 @@ #include #include -void __spin_lock_init(spinlock_t *lock, const char *name, - struct lock_class_key *key) +void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -23,16 +23,16 @@ void __spin_lock_init(spinlock_t *lock, debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lockdep_init_map(&lock->dep_map, name, key, 0); #endif - lock->raw_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + lock->raw_lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; lock->magic = SPINLOCK_MAGIC; lock->owner = SPINLOCK_OWNER_INIT; lock->owner_cpu = -1; } -EXPORT_SYMBOL(__spin_lock_init); +EXPORT_SYMBOL(__raw_spin_lock_init); -void __rwlock_init(rwlock_t *lock, const char *name, - struct lock_class_key *key) +void __raw_rwlock_init(raw_rwlock_t *lock, const char *name, + struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -41,15 +41,15 @@ void __rwlock_init(rwlock_t *lock, const debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lockdep_init_map(&lock->dep_map, name, key, 0); #endif - lock->raw_lock = (raw_rwlock_t) __RAW_RW_LOCK_UNLOCKED; + lock->raw_lock = (__raw_rwlock_t) __RAW_RW_LOCK_UNLOCKED; lock->magic = RWLOCK_MAGIC; lock->owner = SPINLOCK_OWNER_INIT; lock->owner_cpu = -1; } -EXPORT_SYMBOL(__rwlock_init); +EXPORT_SYMBOL(__raw_rwlock_init); -static void spin_bug(spinlock_t *lock, const char *msg) +static void spin_bug(raw_spinlock_t *lock, const char *msg) { struct task_struct *owner = NULL; @@ -73,7 +73,7 @@ static void spin_bug(spinlock_t *lock, c #define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg) static inline void -debug_spin_lock_before(spinlock_t *lock) +debug_spin_lock_before(raw_spinlock_t *lock) { SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); SPIN_BUG_ON(lock->owner == current, lock, "recursion"); @@ -81,13 +81,13 @@ debug_spin_lock_before(spinlock_t *lock) lock, "cpu recursion"); } -static inline void debug_spin_lock_after(spinlock_t *lock) +static inline void debug_spin_lock_after(raw_spinlock_t *lock) { lock->owner_cpu = raw_smp_processor_id(); lock->owner = current; } -static inline void debug_spin_unlock(spinlock_t *lock) +static inline void debug_spin_unlock(raw_spinlock_t *lock) { SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); SPIN_BUG_ON(!spin_is_locked(lock), lock, "already unlocked"); @@ -98,7 +98,7 @@ static inline void debug_spin_unlock(spi lock->owner_cpu = -1; } -static void __spin_lock_debug(spinlock_t *lock) +static void __spin_lock_debug(raw_spinlock_t *lock) { u64 i; u64 loops = loops_per_jiffy * HZ; @@ -125,7 +125,7 @@ static void __spin_lock_debug(spinlock_t } } -void _raw_spin_lock(spinlock_t *lock) +void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) { debug_spin_lock_before(lock); if 
(unlikely(!__raw_spin_trylock(&lock->raw_lock))) @@ -133,7 +133,7 @@ void _raw_spin_lock(spinlock_t *lock) debug_spin_lock_after(lock); } -int _raw_spin_trylock(spinlock_t *lock) +int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) { int ret = __raw_spin_trylock(&lock->raw_lock); @@ -148,13 +148,13 @@ int _raw_spin_trylock(spinlock_t *lock) return ret; } -void _raw_spin_unlock(spinlock_t *lock) +void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) { debug_spin_unlock(lock); __raw_spin_unlock(&lock->raw_lock); } -static void rwlock_bug(rwlock_t *lock, const char *msg) +static void rwlock_bug(raw_rwlock_t *lock, const char *msg) { if (!debug_locks_off()) return; @@ -167,8 +167,8 @@ static void rwlock_bug(rwlock_t *lock, c #define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) -#if 0 /* __write_lock_debug() can lock up - maybe this can too? */ -static void __read_lock_debug(rwlock_t *lock) +#if 1 /* __write_lock_debug() can lock up - maybe this can too? */ +static void __raw_read_lock_debug(raw_rwlock_t *lock) { u64 i; u64 loops = loops_per_jiffy * HZ; @@ -193,13 +193,13 @@ static void __read_lock_debug(rwlock_t * } #endif -void _raw_read_lock(rwlock_t *lock) +void __lockfunc _raw_read_lock(raw_rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - __raw_read_lock(&lock->raw_lock); + __raw_read_lock_debug(lock); } -int _raw_read_trylock(rwlock_t *lock) +int __lockfunc _raw_read_trylock(raw_rwlock_t *lock) { int ret = __raw_read_trylock(&lock->raw_lock); @@ -212,13 +212,13 @@ int _raw_read_trylock(rwlock_t *lock) return ret; } -void _raw_read_unlock(rwlock_t *lock) +void __lockfunc _raw_read_unlock(raw_rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); __raw_read_unlock(&lock->raw_lock); } -static inline void debug_write_lock_before(rwlock_t *lock) +static inline void debug_write_lock_before(raw_rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); @@ -226,13 +226,13 @@ static inline void debug_write_lock_befo lock, "cpu recursion"); } -static inline void debug_write_lock_after(rwlock_t *lock) +static inline void debug_write_lock_after(raw_rwlock_t *lock) { lock->owner_cpu = raw_smp_processor_id(); lock->owner = current; } -static inline void debug_write_unlock(rwlock_t *lock) +static inline void debug_write_unlock(raw_rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); @@ -242,8 +242,8 @@ static inline void debug_write_unlock(rw lock->owner_cpu = -1; } -#if 0 /* This can cause lockups */ -static void __write_lock_debug(rwlock_t *lock) +#if 1 /* This can cause lockups */ +static void __raw_write_lock_debug(raw_rwlock_t *lock) { u64 i; u64 loops = loops_per_jiffy * HZ; @@ -268,14 +268,14 @@ static void __write_lock_debug(rwlock_t } #endif -void _raw_write_lock(rwlock_t *lock) +void __lockfunc _raw_write_lock(raw_rwlock_t *lock) { debug_write_lock_before(lock); - __raw_write_lock(&lock->raw_lock); + __raw_write_lock_debug(lock); debug_write_lock_after(lock); } -int _raw_write_trylock(rwlock_t *lock) +int __lockfunc _raw_write_trylock(raw_rwlock_t *lock) { int ret = __raw_write_trylock(&lock->raw_lock); @@ -290,7 +290,7 @@ int _raw_write_trylock(rwlock_t *lock) return ret; } -void _raw_write_unlock(rwlock_t *lock) +void __lockfunc _raw_write_unlock(raw_rwlock_t *lock) { debug_write_unlock(lock); __raw_write_unlock(&lock->raw_lock); Index: 
linux-2.6.25.8-rt7/include/linux/pickop.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.25.8-rt7/include/linux/pickop.h 2008-06-23 19:13:27.000000000 -0400 @@ -0,0 +1,32 @@ +#ifndef _LINUX_PICKOP_H +#define _LINUX_PICKOP_H + +#undef PICK_TYPE_EQUAL +#define PICK_TYPE_EQUAL(var, type) \ + __builtin_types_compatible_p(typeof(var), type) + +extern int __bad_func_type(void); + +#define PICK_FUNCTION(type1, type2, func1, func2, arg0, ...) \ +do { \ + if (PICK_TYPE_EQUAL((arg0), type1)) \ + func1((type1)(arg0), ##__VA_ARGS__); \ + else if (PICK_TYPE_EQUAL((arg0), type2)) \ + func2((type2)(arg0), ##__VA_ARGS__); \ + else __bad_func_type(); \ +} while (0) + +#define PICK_FUNCTION_RET(type1, type2, func1, func2, arg0, ...) \ +({ \ + unsigned long __ret; \ + \ + if (PICK_TYPE_EQUAL((arg0), type1)) \ + __ret = func1((type1)(arg0), ##__VA_ARGS__); \ + else if (PICK_TYPE_EQUAL((arg0), type2)) \ + __ret = func2((type2)(arg0), ##__VA_ARGS__); \ + else __ret = __bad_func_type(); \ + \ + __ret; \ +}) + +#endif /* _LINUX_PICKOP_H */ Index: linux-2.6.25.8-rt7/arch/x86/kernel/apm_32.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/kernel/apm_32.c 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/kernel/apm_32.c 2008-06-23 19:13:27.000000000 -0400 @@ -783,7 +783,7 @@ static int apm_do_idle(void) */ smp_mb(); } - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { idled = 1; ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax); } Index: linux-2.6.25.8-rt7/arch/x86/lib/semaphore_32.S =================================================================== --- linux-2.6.25.8-rt7.orig/arch/x86/lib/semaphore_32.S 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/x86/lib/semaphore_32.S 2008-06-23 19:13:27.000000000 -0400 @@ -30,7 +30,7 @@ * value or just clobbered.. 
*/ .section .sched.text, "ax" -ENTRY(__down_failed) +ENTRY(__compat_down_failed) CFI_STARTPROC FRAME pushl %edx @@ -39,7 +39,7 @@ ENTRY(__down_failed) pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx,0 - call __down + call __compat_down popl %ecx CFI_ADJUST_CFA_OFFSET -4 CFI_RESTORE ecx @@ -50,8 +50,9 @@ ENTRY(__down_failed) ret CFI_ENDPROC ENDPROC(__down_failed) + ENDPROC(__compat_down_failed) -ENTRY(__down_failed_interruptible) +ENTRY(__compat_down_failed_interruptible) CFI_STARTPROC FRAME pushl %edx @@ -60,7 +61,7 @@ ENTRY(__down_failed_interruptible) pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx,0 - call __down_interruptible + call __compat_down_interruptible popl %ecx CFI_ADJUST_CFA_OFFSET -4 CFI_RESTORE ecx @@ -71,8 +72,9 @@ ENTRY(__down_failed_interruptible) ret CFI_ENDPROC ENDPROC(__down_failed_interruptible) + ENDPROC(__compat_down_failed_interruptible) -ENTRY(__down_failed_trylock) +ENTRY(__compat_down_failed_trylock) CFI_STARTPROC FRAME pushl %edx @@ -81,7 +83,7 @@ ENTRY(__down_failed_trylock) pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx,0 - call __down_trylock + call __compat_down_trylock popl %ecx CFI_ADJUST_CFA_OFFSET -4 CFI_RESTORE ecx @@ -91,9 +93,9 @@ ENTRY(__down_failed_trylock) ENDFRAME ret CFI_ENDPROC - ENDPROC(__down_failed_trylock) + ENDPROC(__compat_down_failed_trylock) -ENTRY(__up_wakeup) +ENTRY(__compat_up_wakeup) CFI_STARTPROC FRAME pushl %edx @@ -102,7 +104,7 @@ ENTRY(__up_wakeup) pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx,0 - call __up + call __compat_up popl %ecx CFI_ADJUST_CFA_OFFSET -4 CFI_RESTORE ecx @@ -112,7 +114,7 @@ ENTRY(__up_wakeup) ENDFRAME ret CFI_ENDPROC - ENDPROC(__up_wakeup) + ENDPROC(__compat_up_wakeup) /* * rw spinlock fallbacks Index: linux-2.6.25.8-rt7/include/asm-x86/rwsem.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-x86/rwsem.h 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-x86/rwsem.h 2008-06-23 19:13:27.000000000 -0400 @@ -44,19 +44,19 @@ struct rwsem_waiter; -extern asmregparm struct rw_semaphore * - rwsem_down_read_failed(struct rw_semaphore *sem); -extern asmregparm struct rw_semaphore * - rwsem_down_write_failed(struct rw_semaphore *sem); -extern asmregparm struct rw_semaphore * - rwsem_wake(struct rw_semaphore *); -extern asmregparm struct rw_semaphore * - rwsem_downgrade_wake(struct rw_semaphore *sem); +extern asmregparm struct compat_rw_semaphore * +rwsem_down_read_failed(struct compat_rw_semaphore *sem); +extern asmregparm struct compat_rw_semaphore * +rwsem_down_write_failed(struct compat_rw_semaphore *sem); +extern asmregparm struct compat_rw_semaphore * +rwsem_wake(struct compat_rw_semaphore *); +extern asmregparm struct compat_rw_semaphore * +rwsem_downgrade_wake(struct compat_rw_semaphore *sem); /* * the semaphore definition */ -struct rw_semaphore { +struct compat_rw_semaphore { signed long count; #define RWSEM_UNLOCKED_VALUE 0x00000000 #define RWSEM_ACTIVE_BIAS 0x00000001 @@ -82,23 +82,23 @@ struct rw_semaphore { { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define COMPAT_DECLARE_RWSEM(name) \ + struct compat_rw_semaphore name = __RWSEM_INITIALIZER(name) -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, +extern void __compat_init_rwsem(struct rw_semaphore *sem, const char *name, struct lock_class_key *key); -#define 
init_rwsem(sem) \ +#define compat_init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - __init_rwsem((sem), #sem, &__key); \ + __compat_init_rwsem((sem), #sem, &__key); \ } while (0) /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct compat_rw_semaphore *sem) { __asm__ __volatile__( "# beginning down_read\n\t" @@ -115,7 +115,7 @@ LOCK_PREFIX " incl (%%eax)\n\t" /* /* * trylock for reading -- returns 1 if successful, 0 if contention */ -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct compat_rw_semaphore *sem) { __s32 result, tmp; __asm__ __volatile__( @@ -138,7 +138,8 @@ LOCK_PREFIX " cmpxchgl %2,%0\n\t" /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void +__down_write_nested(struct compat_rw_semaphore *sem, int subclass) { int tmp; @@ -164,7 +165,7 @@ static inline void __down_write(struct r /* * trylock for writing -- returns 1 if successful, 0 if contention */ -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct compat_rw_semaphore *sem) { signed long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, @@ -177,7 +178,7 @@ static inline int __down_write_trylock(s /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct compat_rw_semaphore *sem) { __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; __asm__ __volatile__( @@ -195,7 +196,7 @@ LOCK_PREFIX " xadd %%edx,(%%eax)\n /* * unlock after writing */ -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct compat_rw_semaphore *sem) { __asm__ __volatile__( "# beginning __up_write\n\t" @@ -213,7 +214,7 @@ LOCK_PREFIX " xaddl %%edx,(%%eax)\n /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct compat_rw_semaphore *sem) { __asm__ __volatile__( "# beginning __downgrade_write\n\t" @@ -230,7 +231,7 @@ LOCK_PREFIX " addl %2,(%%eax)\n\t" /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem) { __asm__ __volatile__( LOCK_PREFIX "addl %1,%0" @@ -241,7 +242,7 @@ LOCK_PREFIX "addl %1,%0" /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem) { int tmp = delta; @@ -253,7 +254,7 @@ LOCK_PREFIX "xadd %0,%1" return tmp+delta; } -static inline int rwsem_is_locked(struct rw_semaphore *sem) +static inline int compat_rwsem_is_locked(struct rw_semaphore *sem) { return (sem->count != 0); } Index: linux-2.6.25.8-rt7/include/asm-x86/semaphore_32.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-x86/semaphore_32.h 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-x86/semaphore_32.h 2008-06-23 19:13:27.000000000 -0400 @@ -3,8 +3,6 @@ #include -#ifdef __KERNEL__ - /* * SMP- and interrupt-safe semaphores.. 
* @@ -41,29 +39,39 @@ #include #include -struct semaphore { +/* + * On !PREEMPT_RT all semaphores are compat: + */ +#ifndef CONFIG_PREEMPT_RT +# define compat_semaphore semaphore +#endif + +struct compat_semaphore { atomic_t count; int sleepers; wait_queue_head_t wait; }; -#define __SEMAPHORE_INITIALIZER(name, n) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, n) \ { \ .count = ATOMIC_INIT(n), \ .sleepers = 0, \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ } -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) +#define __COMPAT_MUTEX_INITIALIZER(name) \ + __COMPAT_SEMAPHORE_INITIALIZER(name,1) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count) -static inline void sema_init (struct semaphore *sem, int val) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,1) + +static inline void compat_sema_init (struct compat_semaphore *sem, int val) { /* - * *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val); + * *sem = (struct compat_semaphore)__SEMAPHORE_INITIALIZER((*sem),val); * * i'd rather use the more flexible initialization above, but sadly * GCC 2.7.2.3 emits a bogus warning. EGCS doesn't. Oh well. @@ -73,27 +81,27 @@ static inline void sema_init (struct sem init_waitqueue_head(&sem->wait); } -static inline void init_MUTEX (struct semaphore *sem) +static inline void compat_init_MUTEX (struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 1); } -static inline void init_MUTEX_LOCKED (struct semaphore *sem) +static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem) { - sema_init(sem, 0); + compat_sema_init(sem, 0); } -extern asmregparm void __down_failed(atomic_t *count_ptr); -extern asmregparm int __down_failed_interruptible(atomic_t *count_ptr); -extern asmregparm int __down_failed_trylock(atomic_t *count_ptr); -extern asmregparm void __up_wakeup(atomic_t *count_ptr); +extern asmregparm void __compat_down_failed(void); +extern asmregparm int __compat_down_failed_interruptible(void); +extern asmregparm int __compat_down_failed_trylock(void); +extern asmregparm void __compat_up_wakeup(void); /* * This is ugly, but we want the default case to fall through. * "__down_failed" is a special asm handler that calls the C * routine that actually waits. See arch/i386/kernel/semaphore.c */ -static inline void down(struct semaphore * sem) +static inline void compat_down(struct compat_semaphore * sem) { might_sleep(); __asm__ __volatile__( @@ -101,7 +109,7 @@ static inline void down(struct semaphore LOCK_PREFIX "decl %0\n\t" /* --sem->count */ "jns 2f\n" "\tlea %0,%%eax\n\t" - "call __down_failed\n" + "call __compat_down_failed\n" "2:" :"+m" (sem->count) : @@ -112,7 +120,7 @@ static inline void down(struct semaphore * Interruptible try to acquire a semaphore. If we obtained * it, return zero. 
If we were interrupted, returns -EINTR */ -static inline int down_interruptible(struct semaphore * sem) +static inline int compat_down_interruptible(struct compat_semaphore * sem) { int result; @@ -123,7 +131,7 @@ static inline int down_interruptible(str LOCK_PREFIX "decl %1\n\t" /* --sem->count */ "jns 2f\n\t" "lea %1,%%eax\n\t" - "call __down_failed_interruptible\n" + "call __compat_down_failed_interruptible\n" "2:" :"=&a" (result), "+m" (sem->count) : @@ -135,7 +143,7 @@ static inline int down_interruptible(str * Non-blockingly attempt to down() a semaphore. * Returns zero if we acquired it */ -static inline int down_trylock(struct semaphore * sem) +static inline int compat_down_trylock(struct compat_semaphore * sem) { int result; @@ -145,7 +153,7 @@ static inline int down_trylock(struct se LOCK_PREFIX "decl %1\n\t" /* --sem->count */ "jns 2f\n\t" "lea %1,%%eax\n\t" - "call __down_failed_trylock\n\t" + "call __compat_down_failed_trylock\n\t" "2:\n" :"=&a" (result), "+m" (sem->count) : @@ -157,19 +165,24 @@ static inline int down_trylock(struct se * Note! This is subtle. We jump to wake people up only if * the semaphore was negative (== somebody was waiting on it). */ -static inline void up(struct semaphore * sem) +static inline void compat_up(struct compat_semaphore * sem) { __asm__ __volatile__( "# atomic up operation\n\t" LOCK_PREFIX "incl %0\n\t" /* ++sem->count */ "jg 1f\n\t" "lea %0,%%eax\n\t" - "call __up_wakeup\n" + "call __compat_up_wakeup\n" "1:" :"+m" (sem->count) : :"memory","ax"); } -#endif +extern int compat_sem_is_locked(struct compat_semaphore *sem); + +#define compat_sema_count(sem) atomic_read(&(sem)->count) + +#include + #endif Index: linux-2.6.25.8-rt7/include/asm-x86/spinlock_types.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-x86/spinlock_types.h 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-x86/spinlock_types.h 2008-06-23 19:13:27.000000000 -0400 @@ -7,13 +7,13 @@ typedef struct { unsigned int slock; -} raw_spinlock_t; +} __raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { 0 } typedef struct { unsigned int lock; -} raw_rwlock_t; +} __raw_rwlock_t; #define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } Index: linux-2.6.25.8-rt7/include/asm-x86/thread_info_32.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-x86/thread_info_32.h 2008-06-23 19:12:53.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-x86/thread_info_32.h 2008-06-23 19:13:27.000000000 -0400 @@ -133,6 +133,7 @@ static inline struct thread_info *curren #define TIF_SECCOMP 7 /* secure computing */ #define TIF_RESTORE_SIGMASK 8 /* restore signal mask in do_signal() */ #define TIF_HRTICK_RESCHED 9 /* reprogram hrtick timer */ +#define TIF_NEED_RESCHED_DELAYED 10 /* reschedule on return to userspace */ #define TIF_MEMDIE 16 #define TIF_DEBUG 17 /* uses debug registers */ #define TIF_IO_BITMAP 18 /* uses I/O bitmap */ @@ -153,6 +154,7 @@ static inline struct thread_info *curren #define _TIF_SECCOMP (1<slock); return (((tmp >> 8) & 0xff) != (tmp & 0xff)); } -static inline int __raw_spin_is_contended(raw_spinlock_t *lock) +static inline int __raw_spin_is_contended(__raw_spinlock_t *lock) { int tmp = *(volatile signed int *)(&(lock)->slock); return (((tmp >> 8) & 0xff) - (tmp & 0xff)) > 1; } -static inline void __raw_spin_lock(raw_spinlock_t *lock) +static inline void __raw_spin_lock(__raw_spinlock_t *lock) { short inc = 0x0100; @@ -99,7 +99,7 @@ static inline void 
__raw_spin_lock(raw_s #define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock) -static inline int __raw_spin_trylock(raw_spinlock_t *lock) +static inline int __raw_spin_trylock(__raw_spinlock_t *lock) { int tmp; short new; @@ -121,7 +121,7 @@ static inline int __raw_spin_trylock(raw return tmp; } -static inline void __raw_spin_unlock(raw_spinlock_t *lock) +static inline void __raw_spin_unlock(__raw_spinlock_t *lock) { __asm__ __volatile__( UNLOCK_LOCK_PREFIX "incb %0" @@ -130,21 +130,21 @@ static inline void __raw_spin_unlock(raw :"memory", "cc"); } #else -static inline int __raw_spin_is_locked(raw_spinlock_t *lock) +static inline int __raw_spin_is_locked(__raw_spinlock_t *lock) { int tmp = *(volatile signed int *)(&(lock)->slock); return (((tmp >> 16) & 0xffff) != (tmp & 0xffff)); } -static inline int __raw_spin_is_contended(raw_spinlock_t *lock) +static inline int __raw_spin_is_contended(__raw_spinlock_t *lock) { int tmp = *(volatile signed int *)(&(lock)->slock); return (((tmp >> 16) & 0xffff) - (tmp & 0xffff)) > 1; } -static inline void __raw_spin_lock(raw_spinlock_t *lock) +static inline void __raw_spin_lock(__raw_spinlock_t *lock) { int inc = 0x00010000; int tmp; @@ -168,7 +168,7 @@ static inline void __raw_spin_lock(raw_s #define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock) -static inline int __raw_spin_trylock(raw_spinlock_t *lock) +static inline int __raw_spin_trylock(__raw_spinlock_t *lock) { int tmp; int new; @@ -191,7 +191,7 @@ static inline int __raw_spin_trylock(raw return tmp; } -static inline void __raw_spin_unlock(raw_spinlock_t *lock) +static inline void __raw_spin_unlock(__raw_spinlock_t *lock) { __asm__ __volatile__( UNLOCK_LOCK_PREFIX "incw %0" @@ -201,7 +201,7 @@ static inline void __raw_spin_unlock(raw } #endif -static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) +static inline void __raw_spin_unlock_wait(__raw_spinlock_t *lock) { while (__raw_spin_is_locked(lock)) cpu_relax(); @@ -225,7 +225,7 @@ static inline void __raw_spin_unlock_wai * read_can_lock - would read_trylock() succeed? * @lock: the rwlock in question. */ -static inline int __raw_read_can_lock(raw_rwlock_t *lock) +static inline int __raw_read_can_lock(__raw_rwlock_t *lock) { return (int)(lock)->lock > 0; } @@ -234,12 +234,12 @@ static inline int __raw_read_can_lock(ra * write_can_lock - would write_trylock() succeed? * @lock: the rwlock in question. 
*/ -static inline int __raw_write_can_lock(raw_rwlock_t *lock) +static inline int __raw_write_can_lock(__raw_rwlock_t *lock) { return (lock)->lock == RW_LOCK_BIAS; } -static inline void __raw_read_lock(raw_rwlock_t *rw) +static inline void __raw_read_lock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" "jns 1f\n" @@ -248,7 +248,7 @@ static inline void __raw_read_lock(raw_r ::LOCK_PTR_REG (rw) : "memory"); } -static inline void __raw_write_lock(raw_rwlock_t *rw) +static inline void __raw_write_lock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t" "jz 1f\n" @@ -257,7 +257,7 @@ static inline void __raw_write_lock(raw_ ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory"); } -static inline int __raw_read_trylock(raw_rwlock_t *lock) +static inline int __raw_read_trylock(__raw_rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; @@ -268,7 +268,7 @@ static inline int __raw_read_trylock(raw return 0; } -static inline int __raw_write_trylock(raw_rwlock_t *lock) +static inline int __raw_write_trylock(__raw_rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; @@ -278,19 +278,19 @@ static inline int __raw_write_trylock(ra return 0; } -static inline void __raw_read_unlock(raw_rwlock_t *rw) +static inline void __raw_read_unlock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); } -static inline void __raw_write_unlock(raw_rwlock_t *rw) +static inline void __raw_write_unlock(__raw_rwlock_t *rw) { asm volatile(LOCK_PREFIX "addl %1, %0" : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); } -#define _raw_spin_relax(lock) cpu_relax() -#define _raw_read_relax(lock) cpu_relax() -#define _raw_write_relax(lock) cpu_relax() +#define __raw_spin_relax(lock) cpu_relax() +#define __raw_read_relax(lock) cpu_relax() +#define __raw_write_relax(lock) cpu_relax() #endif Index: linux-2.6.25.8-rt7/include/asm-x86/semaphore_64.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-x86/semaphore_64.h 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-x86/semaphore_64.h 2008-06-23 19:13:28.000000000 -0400 @@ -5,6 +5,10 @@ #ifdef __KERNEL__ +#ifndef CONFIG_PREEMPT_RT +# define compat_semaphore semaphore +#endif + /* * SMP- and interrupt-safe semaphores.. 
* @@ -43,28 +47,33 @@ #include #include -struct semaphore { +struct compat_semaphore { atomic_t count; int sleepers; wait_queue_head_t wait; }; -#define __SEMAPHORE_INITIALIZER(name, n) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, n) \ { \ .count = ATOMIC_INIT(n), \ .sleepers = 0, \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ } -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) +#define __COMPAT_MUTEX_INITIALIZER(name) \ + __COMPAT_SEMAPHORE_INITIALIZER(name,1) + +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,1) -static inline void sema_init (struct semaphore *sem, int val) +#define compat_sema_count(sem) atomic_read(&(sem)->count) + +static inline void compat_sema_init (struct compat_semaphore *sem, int val) { /* - * *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val); + * *sem = (struct compat_semaphore)__SEMAPHORE_INITIALIZER((*sem),val); * * i'd rather use the more flexible initialization above, but sadly * GCC 2.7.2.3 emits a bogus warning. EGCS doesn't. Oh well. @@ -74,32 +83,33 @@ static inline void sema_init (struct sem init_waitqueue_head(&sem->wait); } -static inline void init_MUTEX (struct semaphore *sem) +static inline void compat_init_MUTEX (struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 1); } -static inline void init_MUTEX_LOCKED (struct semaphore *sem) +static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem) { - sema_init(sem, 0); + compat_sema_init(sem, 0); } -asmlinkage void __down_failed(void /* special register calling convention */); -asmlinkage int __down_failed_interruptible(void /* params in registers */); -asmlinkage int __down_failed_trylock(void /* params in registers */); -asmlinkage void __up_wakeup(void /* special register calling convention */); +asmlinkage void __compat_down_failed(void /* special register calling convention */); +asmlinkage int __compat_down_failed_interruptible(void /* params in registers */); +asmlinkage int __compat_down_failed_trylock(void /* params in registers */); +asmlinkage void __compat_up_wakeup(void /* special register calling convention */); -asmlinkage void __down(struct semaphore * sem); -asmlinkage int __down_interruptible(struct semaphore * sem); -asmlinkage int __down_trylock(struct semaphore * sem); -asmlinkage void __up(struct semaphore * sem); +asmlinkage void __compat_down(struct compat_semaphore * sem); +asmlinkage int __compat_down_interruptible(struct compat_semaphore * sem); +asmlinkage int __compat_down_trylock(struct compat_semaphore * sem); +asmlinkage void __compat_up(struct compat_semaphore * sem); +asmlinkage int compat_sem_is_locked(struct compat_semaphore *sem); /* * This is ugly, but we want the default case to fall through. * "__down_failed" is a special asm handler that calls the C * routine that actually waits. 
See arch/x86_64/kernel/semaphore.c */ -static inline void down(struct semaphore * sem) +static inline void compat_down(struct compat_semaphore * sem) { might_sleep(); @@ -107,7 +117,7 @@ static inline void down(struct semaphore "# atomic down operation\n\t" LOCK_PREFIX "decl %0\n\t" /* --sem->count */ "jns 1f\n\t" - "call __down_failed\n" + "call __compat_down_failed\n" "1:" :"=m" (sem->count) :"D" (sem) @@ -118,7 +128,7 @@ static inline void down(struct semaphore * Interruptible try to acquire a semaphore. If we obtained * it, return zero. If we were interrupted, returns -EINTR */ -static inline int down_interruptible(struct semaphore * sem) +static inline int compat_down_interruptible(struct compat_semaphore * sem) { int result; @@ -129,7 +139,7 @@ static inline int down_interruptible(str "xorl %0,%0\n\t" LOCK_PREFIX "decl %1\n\t" /* --sem->count */ "jns 2f\n\t" - "call __down_failed_interruptible\n" + "call __compat_down_failed_interruptible\n" "2:\n" :"=&a" (result), "=m" (sem->count) :"D" (sem) @@ -141,7 +151,7 @@ static inline int down_interruptible(str * Non-blockingly attempt to down() a semaphore. * Returns zero if we acquired it */ -static inline int down_trylock(struct semaphore * sem) +static inline int compat_down_trylock(struct compat_semaphore * sem) { int result; @@ -150,7 +160,7 @@ static inline int down_trylock(struct se "xorl %0,%0\n\t" LOCK_PREFIX "decl %1\n\t" /* --sem->count */ "jns 2f\n\t" - "call __down_failed_trylock\n\t" + "call __compat_down_failed_trylock\n\t" "2:\n" :"=&a" (result), "=m" (sem->count) :"D" (sem) @@ -164,17 +174,20 @@ static inline int down_trylock(struct se * The default case (no contention) will result in NO * jumps for both down() and up(). */ -static inline void up(struct semaphore * sem) +static inline void compat_up(struct compat_semaphore * sem) { __asm__ __volatile__( "# atomic up operation\n\t" LOCK_PREFIX "incl %0\n\t" /* ++sem->count */ "jg 1f\n\t" - "call __up_wakeup\n" + "call __compat_up_wakeup\n" "1:" :"=m" (sem->count) :"D" (sem) :"memory"); } + +#include + #endif /* __KERNEL__ */ #endif Index: linux-2.6.25.8-rt7/include/asm-x86/thread_info_64.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-x86/thread_info_64.h 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-x86/thread_info_64.h 2008-06-23 19:13:28.000000000 -0400 @@ -108,6 +108,7 @@ static inline struct thread_info *stack_ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ #define TIF_IRET 5 /* force IRET */ +#define TIF_NEED_RESCHED_DELAYED 6 /* reschedule on return to userspace */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */ @@ -131,6 +132,7 @@ static inline struct thread_info *stack_ #define _TIF_SINGLESTEP (1<counter) : "Ir" (i), "m" (v->counter) : "memory"); - } else { + } +#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT) + else { unsigned long flags; raw_local_irq_save(flags); @@ -180,6 +182,7 @@ static __inline__ int atomic_add_return( v->counter = result; raw_local_irq_restore(flags); } +#endif smp_llsc_mb(); @@ -223,7 +226,9 @@ static __inline__ int atomic_sub_return( : "=&r" (result), "=&r" (temp), "=m" (v->counter) : "Ir" (i), "m" (v->counter) : "memory"); - } else { + } +#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT) + else { unsigned long flags;
raw_local_irq_save(flags); @@ -232,6 +237,7 @@ static __inline__ int atomic_sub_return( v->counter = result; raw_local_irq_restore(flags); } +#endif smp_llsc_mb(); @@ -291,7 +297,9 @@ static __inline__ int atomic_sub_if_posi : "=&r" (result), "=&r" (temp), "=m" (v->counter) : "Ir" (i), "m" (v->counter) : "memory"); - } else { + } +#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT) + else { unsigned long flags; raw_local_irq_save(flags); @@ -301,6 +309,7 @@ static __inline__ int atomic_sub_if_posi v->counter = result; raw_local_irq_restore(flags); } +#endif smp_llsc_mb(); @@ -552,7 +561,9 @@ static __inline__ long atomic64_add_retu : "=&r" (result), "=&r" (temp), "=m" (v->counter) : "Ir" (i), "m" (v->counter) : "memory"); - } else { + } +#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT) + else { unsigned long flags; raw_local_irq_save(flags); @@ -561,6 +572,7 @@ static __inline__ long atomic64_add_retu v->counter = result; raw_local_irq_restore(flags); } +#endif smp_llsc_mb(); @@ -604,7 +616,9 @@ static __inline__ long atomic64_sub_retu : "=&r" (result), "=&r" (temp), "=m" (v->counter) : "Ir" (i), "m" (v->counter) : "memory"); - } else { + } +#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT) + else { unsigned long flags; raw_local_irq_save(flags); @@ -682,6 +696,7 @@ static __inline__ long atomic64_sub_if_p v->counter = result; raw_local_irq_restore(flags); } +#endif smp_llsc_mb(); Index: linux-2.6.25.8-rt7/include/asm-mips/semaphore.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-mips/semaphore.h 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-mips/semaphore.h 2008-06-23 19:13:37.000000000 -0400 @@ -24,12 +24,20 @@ #ifdef __KERNEL__ -#include -#include #include #include -struct semaphore { +/* + * On !PREEMPT_RT all semaphores are compat: + */ +#ifndef CONFIG_PREEMPT_RT +# define compat_semaphore semaphore +#endif + +#include +#include + +struct compat_semaphore { /* * Note that any negative value of count is equivalent to 0, * but additionally indicates that some process(es) might be @@ -39,38 +47,41 @@ struct semaphore { wait_queue_head_t wait; }; -#define __SEMAPHORE_INITIALIZER(name, n) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, n) \ { \ .count = ATOMIC_INIT(n), \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ } -#define __DECLARE_SEMAPHORE_GENERIC(name, count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name, count) +#define __COMPAT_MUTEX_INITIALIZER(name) \ + __COMPAT_SEMAPHORE_INITIALIZER(name, 1) + +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name, 1) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 1) -static inline void sema_init(struct semaphore *sem, int val) +static inline void compat_sema_init (struct compat_semaphore *sem, int val) { atomic_set(&sem->count, val); init_waitqueue_head(&sem->wait); } -static inline void init_MUTEX(struct semaphore *sem) +static inline void compat_init_MUTEX (struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 1); } -static inline void init_MUTEX_LOCKED(struct semaphore *sem) +static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem) { - sema_init(sem, 0); + compat_sema_init(sem, 0); } -extern void __down(struct semaphore * sem); -extern int __down_interruptible(struct semaphore * sem); 
-extern void __up(struct semaphore * sem); +extern void __compat_down(struct compat_semaphore * sem); +extern int __compat_down_interruptible(struct compat_semaphore * sem); +extern void __compat_up(struct compat_semaphore * sem); -static inline void down(struct semaphore * sem) +static inline void compat_down(struct compat_semaphore * sem) { might_sleep(); @@ -78,31 +89,37 @@ static inline void down(struct semaphore * Try to get the semaphore, take the slow path if we fail. */ if (unlikely(atomic_dec_return(&sem->count) < 0)) - __down(sem); + __compat_down(sem); } -static inline int down_interruptible(struct semaphore * sem) +static inline int compat_down_interruptible(struct compat_semaphore * sem) { int ret = 0; might_sleep(); if (unlikely(atomic_dec_return(&sem->count) < 0)) - ret = __down_interruptible(sem); + ret = __compat_down_interruptible(sem); return ret; } -static inline int down_trylock(struct semaphore * sem) +static inline int compat_down_trylock(struct compat_semaphore * sem) { return atomic_dec_if_positive(&sem->count) < 0; } -static inline void up(struct semaphore * sem) +static inline void compat_up(struct compat_semaphore * sem) { if (unlikely(atomic_inc_return(&sem->count) <= 0)) - __up(sem); + __compat_up(sem); } +extern int compat_sem_is_locked(struct compat_semaphore *sem); + +#define compat_sema_count(sem) atomic_read(&(sem)->count) + +#include + #endif /* __KERNEL__ */ #endif /* __ASM_SEMAPHORE_H */ Index: linux-2.6.25.8-rt7/arch/powerpc/kernel/Makefile =================================================================== --- linux-2.6.25.8-rt7.orig/arch/powerpc/kernel/Makefile 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/powerpc/kernel/Makefile 2008-06-23 19:13:28.000000000 -0400 @@ -12,11 +12,12 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif -obj-y := semaphore.o cputable.o ptrace.o syscalls.o \ +obj-y := cputable.o ptrace.o syscalls.o \ irq.o align.o signal_32.o pmc.o vdso.o \ init_task.o process.o systbl.o idle.o \ signal.o obj-y += vdso32/ +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o cpu_setup_ppc970.o \ Index: linux-2.6.25.8-rt7/arch/powerpc/kernel/semaphore.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/powerpc/kernel/semaphore.c 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/powerpc/kernel/semaphore.c 2008-06-23 19:13:28.000000000 -0400 @@ -31,7 +31,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -50,7 +50,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -63,7 +63,7 @@ void __up(struct semaphore *sem) __sem_update_count(sem, 1); wake_up(&sem->wait); } -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(__compat_up); /* * Note that when we come in to __down or __down_interruptible, @@ -73,7 +73,7 @@ EXPORT_SYMBOL(__up); * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. 
*/ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -101,9 +101,9 @@ void __sched __down(struct semaphore *se */ wake_up(&sem->wait); } -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__compat_down); -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore *sem) { int retval = 0; struct task_struct *tsk = current; @@ -132,4 +132,10 @@ int __sched __down_interruptible(struct wake_up(&sem->wait); return retval; } -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_down_interruptible); + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} +EXPORT_SYMBOL(compat_sem_is_locked); Index: linux-2.6.25.8-rt7/arch/powerpc/lib/locks.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/powerpc/lib/locks.c 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/powerpc/lib/locks.c 2008-06-23 19:14:08.000000000 -0400 @@ -25,7 +25,7 @@ #include #include -void __spin_yield(raw_spinlock_t *lock) +void __spin_yield(__raw_spinlock_t *lock) { unsigned int lock_value, holder_cpu, yield_count; @@ -55,7 +55,7 @@ void __spin_yield(raw_spinlock_t *lock) * This turns out to be the same for read and write locks, since * we only know the holder if it is write-locked. */ -void __rw_yield(raw_rwlock_t *rw) +void __rw_yield(__raw_rwlock_t *rw) { int lock_value; unsigned int holder_cpu, yield_count; @@ -82,7 +82,7 @@ void __rw_yield(raw_rwlock_t *rw) } #endif -void __raw_spin_unlock_wait(raw_spinlock_t *lock) +void __raw_spin_unlock_wait(__raw_spinlock_t *lock) { while (lock->slock) { HMT_low(); Index: linux-2.6.25.8-rt7/arch/ppc/Kconfig =================================================================== --- linux-2.6.25.8-rt7.orig/arch/ppc/Kconfig 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/ppc/Kconfig 2008-06-23 19:13:28.000000000 -0400 @@ -16,13 +16,6 @@ config GENERIC_HARDIRQS bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config ARCH_HAS_ILOG2_U32 bool default y @@ -926,6 +919,18 @@ config ARCH_POPULATES_NODE_MAP source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "mm/Kconfig" source "fs/Kconfig.binfmt" Index: linux-2.6.25.8-rt7/arch/ppc/kernel/entry.S =================================================================== --- linux-2.6.25.8-rt7.orig/arch/ppc/kernel/entry.S 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/ppc/kernel/entry.S 2008-06-23 19:13:28.000000000 -0400 @@ -882,7 +882,7 @@ global_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -896,7 +896,7 @@ recheck: MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,18 lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. 
r0,r9,_TIF_SIGPENDING beq restore_user Index: linux-2.6.25.8-rt7/arch/ppc/kernel/semaphore.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/ppc/kernel/semaphore.c 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/ppc/kernel/semaphore.c 2008-06-23 19:13:28.000000000 -0400 @@ -29,7 +29,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -48,7 +48,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -70,7 +70,7 @@ void __up(struct semaphore *sem) * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. */ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -100,7 +100,7 @@ void __sched __down(struct semaphore *se wake_up(&sem->wait); } -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -129,3 +129,8 @@ int __sched __down_interruptible(struct wake_up(&sem->wait); return retval; } + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} Index: linux-2.6.25.8-rt7/arch/ppc/lib/locks.c =================================================================== --- linux-2.6.25.8-rt7.orig/arch/ppc/lib/locks.c 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/ppc/lib/locks.c 2008-06-23 19:13:28.000000000 -0400 @@ -42,7 +42,7 @@ static inline unsigned long __spin_trylo return ret; } -void _raw_spin_lock(spinlock_t *lock) +void __raw_spin_lock(raw_spinlock_t *lock) { int cpu = smp_processor_id(); unsigned int stuck = INIT_STUCK; @@ -62,9 +62,9 @@ void _raw_spin_lock(spinlock_t *lock) lock->owner_pc = (unsigned long)__builtin_return_address(0); lock->owner_cpu = cpu; } -EXPORT_SYMBOL(_raw_spin_lock); +EXPORT_SYMBOL(__raw_spin_lock); -int _raw_spin_trylock(spinlock_t *lock) +int __raw_spin_trylock(raw_spinlock_t *lock) { if (__spin_trylock(&lock->lock)) return 0; @@ -72,9 +72,9 @@ int _raw_spin_trylock(spinlock_t *lock) lock->owner_pc = (unsigned long)__builtin_return_address(0); return 1; } -EXPORT_SYMBOL(_raw_spin_trylock); +EXPORT_SYMBOL(__raw_spin_trylock); -void _raw_spin_unlock(spinlock_t *lp) +void __raw_spin_unlock(raw_spinlock_t *lp) { if ( !lp->lock ) printk("_spin_unlock(%p): no lock cpu %d curr PC %p %s/%d\n", @@ -88,13 +88,13 @@ void _raw_spin_unlock(spinlock_t *lp) wmb(); lp->lock = 0; } -EXPORT_SYMBOL(_raw_spin_unlock); +EXPORT_SYMBOL(__raw_spin_unlock); /* * For rwlocks, zero is unlocked, -1 is write-locked, * positive is read-locked. 
*/ -static __inline__ int __read_trylock(rwlock_t *rw) +static __inline__ int __read_trylock(raw_rwlock_t *rw) { signed int tmp; @@ -114,13 +114,13 @@ static __inline__ int __read_trylock(rwl return tmp; } -int _raw_read_trylock(rwlock_t *rw) +int __raw_read_trylock(raw_rwlock_t *rw) { return __read_trylock(rw) > 0; } -EXPORT_SYMBOL(_raw_read_trylock); +EXPORT_SYMBOL(__raw_read_trylock); -void _raw_read_lock(rwlock_t *rw) +void __raw_read_lock(rwlock_t *rw) { unsigned int stuck; @@ -135,9 +135,9 @@ void _raw_read_lock(rwlock_t *rw) } } } -EXPORT_SYMBOL(_raw_read_lock); +EXPORT_SYMBOL(__raw_read_lock); -void _raw_read_unlock(rwlock_t *rw) +void __raw_read_unlock(raw_rwlock_t *rw) { if ( rw->lock == 0 ) printk("_read_unlock(): %s/%d (nip %08lX) lock %d\n", @@ -146,9 +146,9 @@ void _raw_read_unlock(rwlock_t *rw) wmb(); atomic_dec((atomic_t *) &(rw)->lock); } -EXPORT_SYMBOL(_raw_read_unlock); +EXPORT_SYMBOL(__raw_read_unlock); -void _raw_write_lock(rwlock_t *rw) +void __raw_write_lock(raw_rwlock_t *rw) { unsigned int stuck; @@ -164,18 +164,18 @@ void _raw_write_lock(rwlock_t *rw) } wmb(); } -EXPORT_SYMBOL(_raw_write_lock); +EXPORT_SYMBOL(__raw_write_lock); -int _raw_write_trylock(rwlock_t *rw) +int __raw_write_trylock(raw_rwlock_t *rw) { if (cmpxchg(&rw->lock, 0, -1) != 0) return 0; wmb(); return 1; } -EXPORT_SYMBOL(_raw_write_trylock); +EXPORT_SYMBOL(__raw_write_trylock); -void _raw_write_unlock(rwlock_t *rw) +void __raw_write_unlock(raw_rwlock_t *rw) { if (rw->lock >= 0) printk("_write_lock(): %s/%d (nip %08lX) lock %d\n", @@ -184,6 +184,6 @@ void _raw_write_unlock(rwlock_t *rw) wmb(); rw->lock = 0; } -EXPORT_SYMBOL(_raw_write_unlock); +EXPORT_SYMBOL(__raw_write_unlock); #endif Index: linux-2.6.25.8-rt7/drivers/macintosh/adb.c =================================================================== --- linux-2.6.25.8-rt7.orig/drivers/macintosh/adb.c 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/drivers/macintosh/adb.c 2008-06-23 19:13:28.000000000 -0400 @@ -84,7 +84,7 @@ struct adb_driver *adb_controller; BLOCKING_NOTIFIER_HEAD(adb_client_list); static int adb_got_sleep; static int adb_inited; -static DECLARE_MUTEX(adb_probe_mutex); +static COMPAT_DECLARE_MUTEX(adb_probe_mutex); static int sleepy_trackpad; static int autopoll_devs; int __adb_probe_sync; Index: linux-2.6.25.8-rt7/include/asm-powerpc/rwsem.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-powerpc/rwsem.h 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-powerpc/rwsem.h 2008-06-23 19:13:28.000000000 -0400 @@ -21,7 +21,7 @@ /* * the semaphore definition */ -struct rw_semaphore { +struct compat_rw_semaphore { /* XXX this should be able to be an atomic_t -- paulus */ signed int count; #define RWSEM_UNLOCKED_VALUE 0x00000000 @@ -30,7 +30,7 @@ struct rw_semaphore { #define RWSEM_WAITING_BIAS (-0x00010000) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - spinlock_t wait_lock; + raw_spinlock_t wait_lock; struct list_head wait_list; }; @@ -38,15 +38,15 @@ struct rw_semaphore { { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ LIST_HEAD_INIT((name).wait_list) } -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) +#define COMPAT_DECLARE_RWSEM(name) \ + struct compat_rw_semaphore name = __RWSEM_INITIALIZER(name) -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct 
rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_down_read_failed(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_down_write_failed(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_wake(struct compat_rw_semaphore *sem); +extern struct compat_rw_semaphore *rwsem_downgrade_wake(struct compat_rw_semaphore *sem); -static inline void init_rwsem(struct rw_semaphore *sem) +static inline void compat_init_rwsem(struct compat_rw_semaphore *sem) { sem->count = RWSEM_UNLOCKED_VALUE; spin_lock_init(&sem->wait_lock); @@ -56,13 +56,13 @@ static inline void init_rwsem(struct rw_ /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct compat_rw_semaphore *sem) { if (unlikely(atomic_inc_return((atomic_t *)(&sem->count)) <= 0)) rwsem_down_read_failed(sem); } -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline int __down_read_trylock(struct compat_rw_semaphore *sem) { int tmp; @@ -78,7 +78,7 @@ static inline int __down_read_trylock(st /* * lock for writing */ -static inline void __down_write(struct rw_semaphore *sem) +static inline void __down_write(struct compat_rw_semaphore *sem) { int tmp; @@ -88,7 +88,7 @@ static inline void __down_write(struct r rwsem_down_write_failed(sem); } -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline int __down_write_trylock(struct compat_rw_semaphore *sem) { int tmp; @@ -100,7 +100,7 @@ static inline int __down_write_trylock(s /* * unlock after reading */ -static inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct compat_rw_semaphore *sem) { int tmp; @@ -112,7 +112,7 @@ static inline void __up_read(struct rw_s /* * unlock after writing */ -static inline void __up_write(struct rw_semaphore *sem) +static inline void __up_write(struct compat_rw_semaphore *sem) { if (unlikely(atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, (atomic_t *)(&sem->count)) < 0)) @@ -122,7 +122,7 @@ static inline void __up_write(struct rw_ /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem) { atomic_add(delta, (atomic_t *)(&sem->count)); } @@ -130,7 +130,7 @@ static inline void rwsem_atomic_add(int /* * downgrade write lock to read lock */ -static inline void __downgrade_write(struct rw_semaphore *sem) +static inline void __downgrade_write(struct compat_rw_semaphore *sem) { int tmp; @@ -142,12 +142,12 @@ static inline void __downgrade_write(str /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem) { return atomic_add_return(delta, (atomic_t *)(&sem->count)); } -static inline int rwsem_is_locked(struct rw_semaphore *sem) +static inline int compat_rwsem_is_locked(struct compat_rw_semaphore *sem) { return (sem->count != 0); } Index: linux-2.6.25.8-rt7/include/asm-powerpc/semaphore.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-powerpc/semaphore.h 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-powerpc/semaphore.h 2008-06-23 19:13:28.000000000 -0400 @@ -15,48 +15,58 @@ 
#include #include -struct semaphore { +/* + * On !PREEMPT_RT all semaphores are compat + */ +#ifndef CONFIG_PREEMPT_RT +# define compat_semaphore semaphore +#endif + +struct compat_semaphore { /* * Note that any negative value of count is equivalent to 0, * but additionally indicates that some process(es) might be * sleeping on `wait'. */ atomic_t count; + int sleepers; wait_queue_head_t wait; }; -#define __SEMAPHORE_INITIALIZER(name, n) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, n) \ { \ .count = ATOMIC_INIT(n), \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ } -#define __DECLARE_SEMAPHORE_GENERIC(name, count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name, 1) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 1) -static inline void sema_init (struct semaphore *sem, int val) +static inline void compat_sema_init (struct compat_semaphore *sem, int val) { atomic_set(&sem->count, val); init_waitqueue_head(&sem->wait); } -static inline void init_MUTEX (struct semaphore *sem) +static inline void compat_init_MUTEX (struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 1); } -static inline void init_MUTEX_LOCKED (struct semaphore *sem) +static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem) { - sema_init(sem, 0); + compat_sema_init(sem, 0); } -extern void __down(struct semaphore * sem); -extern int __down_interruptible(struct semaphore * sem); -extern void __up(struct semaphore * sem); +extern void __compat_down(struct compat_semaphore * sem); +extern int __compat_down_interruptible(struct compat_semaphore * sem); +extern void __compat_up(struct compat_semaphore * sem); + +extern int compat_sem_is_locked(struct compat_semaphore *sem); -static inline void down(struct semaphore * sem) +static inline void compat_down(struct compat_semaphore * sem) { might_sleep(); @@ -64,31 +74,35 @@ static inline void down(struct semaphore * Try to get the semaphore, take the slow path if we fail. */ if (unlikely(atomic_dec_return(&sem->count) < 0)) - __down(sem); + __compat_down(sem); } -static inline int down_interruptible(struct semaphore * sem) +static inline int compat_down_interruptible(struct compat_semaphore * sem) { int ret = 0; might_sleep(); if (unlikely(atomic_dec_return(&sem->count) < 0)) - ret = __down_interruptible(sem); + ret = __compat_down_interruptible(sem); return ret; } -static inline int down_trylock(struct semaphore * sem) +static inline int compat_down_trylock(struct compat_semaphore * sem) { return atomic_dec_if_positive(&sem->count) < 0; } -static inline void up(struct semaphore * sem) +static inline void compat_up(struct compat_semaphore * sem) { if (unlikely(atomic_inc_return(&sem->count) <= 0)) - __up(sem); + __compat_up(sem); } +#define compat_sema_count(sem) atomic_read(&(sem)->count) + +#include + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_SEMAPHORE_H */ Index: linux-2.6.25.8-rt7/include/asm-powerpc/spinlock.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-powerpc/spinlock.h 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-powerpc/spinlock.h 2008-06-23 19:13:41.000000000 -0400 @@ -53,7 +53,7 @@ * This returns the old value in the lock, so we succeeded * in getting the lock if the return value is 0.
*/ -static __inline__ unsigned long __spin_trylock(raw_spinlock_t *lock) +static __inline__ unsigned long ___raw_spin_trylock(__raw_spinlock_t *lock) { unsigned long tmp, token; @@ -72,10 +72,10 @@ static __inline__ unsigned long __spin_t return tmp; } -static int __inline__ __raw_spin_trylock(raw_spinlock_t *lock) +static int __inline__ __raw_spin_trylock(__raw_spinlock_t *lock) { CLEAR_IO_SYNC; - return __spin_trylock(lock) == 0; + return ___raw_spin_trylock(lock) == 0; } /* @@ -95,19 +95,19 @@ static int __inline__ __raw_spin_trylock #if defined(CONFIG_PPC_SPLPAR) || defined(CONFIG_PPC_ISERIES) /* We only yield to the hypervisor if we are in shared processor mode */ #define SHARED_PROCESSOR (get_lppaca()->shared_proc) -extern void __spin_yield(raw_spinlock_t *lock); -extern void __rw_yield(raw_rwlock_t *lock); +extern void __spin_yield(__raw_spinlock_t *lock); +extern void __rw_yield(__raw_rwlock_t *lock); #else /* SPLPAR || ISERIES */ #define __spin_yield(x) barrier() #define __rw_yield(x) barrier() #define SHARED_PROCESSOR 0 #endif -static void __inline__ __raw_spin_lock(raw_spinlock_t *lock) +static void __inline__ __raw_spin_lock(__raw_spinlock_t *lock) { CLEAR_IO_SYNC; while (1) { - if (likely(__spin_trylock(lock) == 0)) + if (likely(___raw_spin_trylock(lock) == 0)) break; do { HMT_low(); @@ -118,13 +118,13 @@ static void __inline__ __raw_spin_lock(r } } -static void __inline__ __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) +static void __inline__ __raw_spin_lock_flags(__raw_spinlock_t *lock, unsigned long flags) { unsigned long flags_dis; CLEAR_IO_SYNC; while (1) { - if (likely(__spin_trylock(lock) == 0)) + if (likely(___raw_spin_trylock(lock) == 0)) break; local_save_flags(flags_dis); local_irq_restore(flags); @@ -138,7 +138,7 @@ static void __inline__ __raw_spin_lock_f } } -static __inline__ void __raw_spin_unlock(raw_spinlock_t *lock) +static __inline__ void __raw_spin_unlock(__raw_spinlock_t *lock) { SYNC_IO; __asm__ __volatile__("# __raw_spin_unlock\n\t" @@ -147,7 +147,7 @@ static __inline__ void __raw_spin_unlock } #ifdef CONFIG_PPC64 -extern void __raw_spin_unlock_wait(raw_spinlock_t *lock); +extern void __raw_spin_unlock_wait(__raw_spinlock_t *lock); #else #define __raw_spin_unlock_wait(lock) \ do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0) @@ -179,7 +179,7 @@ extern void __raw_spin_unlock_wait(raw_s * This returns the old value in the lock + 1, * so we got a read lock if the return value is > 0. */ -static long __inline__ __read_trylock(raw_rwlock_t *rw) +static long __inline__ ___raw_read_trylock(__raw_rwlock_t *rw) { long tmp; @@ -203,7 +203,7 @@ static long __inline__ __read_trylock(ra * This returns the old value in the lock, * so we got the write lock if the return value is 0. 
*/ -static __inline__ long __write_trylock(raw_rwlock_t *rw) +static __inline__ long ___raw_write_trylock(__raw_rwlock_t *rw) { long tmp, token; @@ -223,10 +223,10 @@ static __inline__ long __write_trylock(r return tmp; } -static void __inline__ __raw_read_lock(raw_rwlock_t *rw) +static void __inline__ __raw_read_lock(__raw_rwlock_t *rw) { while (1) { - if (likely(__read_trylock(rw) > 0)) + if (likely(___raw_read_trylock(rw) > 0)) break; do { HMT_low(); @@ -237,10 +237,10 @@ static void __inline__ __raw_read_lock(r } } -static void __inline__ __raw_write_lock(raw_rwlock_t *rw) +static void __inline__ __raw_write_lock(__raw_rwlock_t *rw) { while (1) { - if (likely(__write_trylock(rw) == 0)) + if (likely(___raw_write_trylock(rw) == 0)) break; do { HMT_low(); @@ -251,17 +251,17 @@ static void __inline__ __raw_write_lock( } } -static int __inline__ __raw_read_trylock(raw_rwlock_t *rw) +static int __inline__ __raw_read_trylock(__raw_rwlock_t *rw) { - return __read_trylock(rw) > 0; + return ___raw_read_trylock(rw) > 0; } -static int __inline__ __raw_write_trylock(raw_rwlock_t *rw) +static int __inline__ __raw_write_trylock(__raw_rwlock_t *rw) { - return __write_trylock(rw) == 0; + return ___raw_write_trylock(rw) == 0; } -static void __inline__ __raw_read_unlock(raw_rwlock_t *rw) +static void __inline__ __raw_read_unlock(__raw_rwlock_t *rw) { long tmp; @@ -278,7 +278,7 @@ static void __inline__ __raw_read_unlock : "cr0", "memory"); } -static __inline__ void __raw_write_unlock(raw_rwlock_t *rw) +static __inline__ void __raw_write_unlock(__raw_rwlock_t *rw) { __asm__ __volatile__("# write_unlock\n\t" LWSYNC_ON_SMP: : :"memory"); @@ -289,5 +289,9 @@ static __inline__ void __raw_write_unloc #define _raw_read_relax(lock) __rw_yield(lock) #define _raw_write_relax(lock) __rw_yield(lock) +#define __raw_spin_relax(lock) cpu_relax() +#define __raw_read_relax(lock) cpu_relax() +#define __raw_write_relax(lock) cpu_relax() + #endif /* __KERNEL__ */ #endif /* __ASM_SPINLOCK_H */ Index: linux-2.6.25.8-rt7/include/asm-powerpc/spinlock_types.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-powerpc/spinlock_types.h 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-powerpc/spinlock_types.h 2008-06-23 19:13:28.000000000 -0400 @@ -7,13 +7,13 @@ typedef struct { volatile unsigned int slock; -} raw_spinlock_t; +} __raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { 0 } typedef struct { volatile signed int lock; -} raw_rwlock_t; +} __raw_rwlock_t; #define __RAW_RW_LOCK_UNLOCKED { 0 } Index: linux-2.6.25.8-rt7/arch/arm/kernel/entry-armv.S =================================================================== --- linux-2.6.25.8-rt7.orig/arch/arm/kernel/entry-armv.S 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/arm/kernel/entry-armv.S 2008-06-23 19:13:29.000000000 -0400 @@ -210,7 +210,7 @@ __irq_svc: irq_handler #ifdef CONFIG_PREEMPT ldr r0, [tsk, #TI_FLAGS] @ get flags - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED blne svc_preempt preempt_return: ldr r0, [tsk, #TI_PREEMPT] @ read preempt value @@ -241,7 +241,7 @@ svc_preempt: str r7, [tsk, #TI_PREEMPT] @ expects preempt_count == 0 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED beq preempt_return @ go again b 1b #endif Index: linux-2.6.25.8-rt7/arch/arm/kernel/semaphore.c 
=================================================================== --- linux-2.6.25.8-rt7.orig/arch/arm/kernel/semaphore.c 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/arch/arm/kernel/semaphore.c 2008-06-23 19:14:34.000000000 -0400 @@ -49,14 +49,16 @@ * we cannot lose wakeup events. */ -void __up(struct semaphore *sem) + void __used __compat_up(struct compat_semaphore *sem) { wake_up(&sem->wait); } +EXPORT_SYMBOL(__compat_up); + static DEFINE_SPINLOCK(semaphore_lock); -void __sched __down(struct semaphore * sem) + void __used __sched __compat_down(struct compat_semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -89,7 +91,9 @@ void __sched __down(struct semaphore * s wake_up(&sem->wait); } -int __sched __down_interruptible(struct semaphore * sem) +EXPORT_SYMBOL(__compat_down); + + int __used __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -140,6 +144,8 @@ int __sched __down_interruptible(struct return retval; } +EXPORT_SYMBOL(__compat_down_interruptible); + /* * Trylock failed - make sure we correct for * having decremented the count. @@ -148,7 +154,7 @@ int __sched __down_interruptible(struct * single "cmpxchg" without failure cases, * but then it wouldn't work on a 386. */ -int __down_trylock(struct semaphore * sem) + int __used __sched __compat_down_trylock(struct compat_semaphore * sem) { int sleepers; unsigned long flags; @@ -168,6 +174,15 @@ int __down_trylock(struct semaphore * se return 1; } +EXPORT_SYMBOL(__compat_down_trylock); + + int __sched compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} + +EXPORT_SYMBOL(compat_sem_is_locked); + /* * The semaphore operations have a special calling sequence that * allow us to do a simpler in-line version of them. 
These routines @@ -185,7 +200,7 @@ asm(" .section .sched.text,\"ax\",%progb __down_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down \n\ + bl __compat_down \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ .align 5 \n\ @@ -193,7 +208,7 @@ __down_failed: \n\ __down_interruptible_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down_interruptible \n\ + bl __compat_down_interruptible \n\ mov ip, r0 \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ @@ -202,7 +217,7 @@ __down_interruptible_failed: \n\ __down_trylock_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down_trylock \n\ + bl __compat_down_trylock \n\ mov ip, r0 \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ @@ -211,7 +226,7 @@ __down_trylock_failed: \n\ __up_wakeup: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __up \n\ + bl __compat_up \n\ ldmfd sp!, {r0 - r4, pc} \n\ "); Index: linux-2.6.25.8-rt7/include/asm-arm/semaphore.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-arm/semaphore.h 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-arm/semaphore.h 2008-06-23 19:13:29.000000000 -0400 @@ -5,45 +5,65 @@ #define __ASM_ARM_SEMAPHORE_H #include + +#ifdef CONFIG_PREEMPT_RT +# include +#endif + #include #include #include +/* + * On !PREEMPT_RT all semaphores are compat: + */ +#ifndef CONFIG_PREEMPT_RT +# define semaphore compat_semaphore +#endif + #include #include -struct semaphore { +struct compat_semaphore { atomic_t count; int sleepers; wait_queue_head_t wait; }; -#define __SEMAPHORE_INIT(name, cnt) \ +#define __COMPAT_SEMAPHORE_INITIALIZER(name, cnt) \ { \ .count = ATOMIC_INIT(cnt), \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait), \ } -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INIT(name,count) +#define __COMPAT_MUTEX_INITIALIZER(name) \ + __COMPAT_SEMAPHORE_INITIALIZER(name,1) + +#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \ + struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count) -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) +#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,1) -static inline void sema_init(struct semaphore *sem, int val) +static inline void compat_sema_init(struct compat_semaphore *sem, int val) { atomic_set(&sem->count, val); sem->sleepers = 0; init_waitqueue_head(&sem->wait); } -static inline void init_MUTEX(struct semaphore *sem) +static inline void compat_init_MUTEX(struct compat_semaphore *sem) +{ + compat_sema_init(sem, 1); +} + +static inline void compat_init_MUTEX_LOCKED(struct compat_semaphore *sem) { - sema_init(sem, 1); + compat_sema_init(sem, 0); } -static inline void init_MUTEX_LOCKED(struct semaphore *sem) +static inline int compat_sema_count(struct compat_semaphore *sem) { - sema_init(sem, 0); + return atomic_read(&sem->count); } /* @@ -54,16 +74,18 @@ asmlinkage int __down_interruptible_fai asmlinkage int __down_trylock_failed(void); asmlinkage void __up_wakeup(void); -extern void __down(struct semaphore * sem); -extern int __down_interruptible(struct semaphore * sem); -extern int __down_trylock(struct semaphore * sem); -extern void __up(struct semaphore * sem); +extern void __compat_up(struct compat_semaphore *sem); +extern int __compat_down_interruptible(struct compat_semaphore * sem); +extern int __compat_down_trylock(struct compat_semaphore * sem); +extern void __compat_down(struct compat_semaphore * sem); + +extern int compat_sem_is_locked(struct compat_semaphore *sem); /* 
* This is ugly, but we want the default case to fall through. * "__down" is the actual routine that waits... */ -static inline void down(struct semaphore * sem) +static inline void compat_down(struct compat_semaphore * sem) { might_sleep(); __down_op(sem, __down_failed); @@ -73,13 +95,13 @@ static inline void down(struct semaphore * This is ugly, but we want the default case to fall through. * "__down_interruptible" is the actual routine that waits... */ -static inline int down_interruptible (struct semaphore * sem) +static inline int compat_down_interruptible (struct compat_semaphore * sem) { might_sleep(); return __down_op_ret(sem, __down_interruptible_failed); } -static inline int down_trylock(struct semaphore *sem) +static inline int compat_down_trylock(struct compat_semaphore *sem) { return __down_op_ret(sem, __down_trylock_failed); } @@ -90,9 +112,10 @@ static inline int down_trylock(struct se * The default case (no contention) will result in NO * jumps for both down() and up(). */ -static inline void up(struct semaphore * sem) +static inline void compat_up(struct compat_semaphore * sem) { __up_op(sem, __up_wakeup); } +#include #endif Index: linux-2.6.25.8-rt7/include/asm-arm/thread_info.h =================================================================== --- linux-2.6.25.8-rt7.orig/include/asm-arm/thread_info.h 2008-06-23 19:12:52.000000000 -0400 +++ linux-2.6.25.8-rt7/include/asm-arm/thread_info.h 2008-06-23 19:13:29.000000000 -0400 @@ -141,6 +141,7 @@ extern void iwmmxt_task_switch(struct th */ #define TIF_SIGPENDING 0 #define TIF_NEED_RESCHED 1 +#define TIF_NEED_RESCHED_DELAYED 3 #define TIF_SYSCALL_TRACE 8 #define TIF_POLLING_NRFLAG 16 #define TIF_USING_IWMMXT 17 @@ -149,6 +150,7 @@ extern void iwmmxt_task_switch(struct th #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_NEED_RESCHED_DELAYED (1 << TIF_NEED_RESCHED_DELAYED)
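For reference, a minimal usage sketch of the compat semaphore API that the per-architecture conversions above introduce. This sketch is not part of the patch; the function and variable names are illustrative only, modelled on the drivers/macintosh/adb.c hunk earlier in the series. On !PREEMPT_RT the #ifndef CONFIG_PREEMPT_RT blocks alias compat_semaphore to the ordinary struct semaphore, so the compat_* calls behave exactly like the classic down()/up(); on PREEMPT_RT they preserve true counting-semaphore semantics for the callers that need them, while regular semaphores and spinlocks become rtmutex-based sleeping locks.

/*
 * Illustrative sketch only -- assumes the compat_* API exactly as
 * declared in the headers patched above; names are hypothetical.
 */
static COMPAT_DECLARE_MUTEX(example_probe_mutex);	/* binary semaphore, count = 1 */

static int example_probe(void)
{
	/* may sleep: classic counting-semaphore acquire */
	compat_down(&example_probe_mutex);

	/* ... work that must stay serialized across the probe ... */

	compat_up(&example_probe_mutex);
	return 0;
}

Callers that cannot sleep would use compat_down_trylock() instead, which, per the "Returns zero if we acquired it" convention shown in the hunks above, returns non-zero when the semaphore could not be taken.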