Index: linux-2.6.24.7-rt27/kernel/futex.c
===================================================================
--- linux-2.6.24.7-rt27.orig/kernel/futex.c	2009-02-08 00:00:39.000000000 -0500
+++ linux-2.6.24.7-rt27/kernel/futex.c	2009-02-08 00:05:09.000000000 -0500
@@ -61,6 +61,7 @@
 #include "rtmutex_common.h"
 
 int __read_mostly futex_cmpxchg_enabled;
+int __read_mostly futex_rt_pi_warning;
 
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
@@ -126,12 +127,14 @@ static struct futex_hash_bucket futex_qu
 /* Futex-fs vfsmount entry: */
 static struct vfsmount *futex_mnt;
 
+int futex_performance_hack;
+
 /*
  * Take mm->mmap_sem, when futex is shared
  */
 static inline void futex_lock_mm(struct rw_semaphore *fshared)
 {
-	if (fshared)
+	if (fshared && !futex_performance_hack)
 		down_read(fshared);
 }
 
@@ -140,7 +143,7 @@ static inline void futex_lock_mm(struct
  */
 static inline void futex_unlock_mm(struct rw_semaphore *fshared)
 {
-	if (fshared)
+	if (fshared && !futex_performance_hack)
 		up_read(fshared);
 }
 
@@ -950,8 +953,12 @@ static int futex_requeue(u32 __user *uad
 			plist_add(&this->list, &hb2->chain);
 			this->lock_ptr = &hb2->lock;
 #ifdef CONFIG_DEBUG_PI_LIST
+#ifdef CONFIG_PREEMPT_RT
+			this->list.plist.lock = NULL;
+#else
 			this->list.plist.lock = &hb2->lock;
 #endif
+#endif
 		}
 		this->key = key2;
 		get_futex_key_refs(&key2);
@@ -1011,8 +1018,12 @@ static inline void __queue_me(struct fut
 	plist_node_init(&q->list, prio);
 #ifdef CONFIG_DEBUG_PI_LIST
+#ifdef CONFIG_PREEMPT_RT
+	q->list.plist.lock = NULL;
+#else
 	q->list.plist.lock = &hb->lock;
 #endif
+#endif
 	plist_add(&q->list, &hb->chain);
 	q->task = current;
 	spin_unlock(&hb->lock);
@@ -1107,21 +1118,64 @@ static void unqueue_me_pi(struct futex_q
  * private futexes.
  */
 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-				struct task_struct *newowner)
+				struct task_struct *newowner,
+				struct rw_semaphore *fshared)
 {
 	u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
 	struct futex_pi_state *pi_state = q->pi_state;
+	struct task_struct *oldowner = pi_state->owner;
 	u32 uval, curval, newval;
-	int ret;
+	int ret, attempt = 0;
 
 	/* Owner died? */
+	if (!pi_state->owner)
+		newtid |= FUTEX_OWNER_DIED;
+
+	/*
+	 * We are here either because we stole the rtmutex from the
+	 * pending owner or we are the pending owner which failed to
+	 * get the rtmutex. We have to replace the pending owner TID
+	 * in the user space variable. This must be atomic as we have
+	 * to preserve the owner died bit here.
+	 *
+	 * Note: We write the user space value _before_ changing the
+	 * pi_state because we can fault here. Imagine swapped out
+	 * pages or a fork, which was running right before we acquired
+	 * mmap_sem, that marked all the anonymous memory readonly for
+	 * cow.
+	 *
+	 * Modifying pi_state _before_ the user space value would
+	 * leave the pi_state in an inconsistent state when we fault
+	 * here, because we need to drop the hash bucket lock to
+	 * handle the fault. This might be observed in the PID check
+	 * in lookup_pi_state.
+	 */
+retry:
+	if (get_futex_value_locked(&uval, uaddr))
+		goto handle_fault;
+
+	while (1) {
+		newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+		curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+
+		if (curval == -EFAULT)
+			goto handle_fault;
+		if (curval == uval)
+			break;
+		uval = curval;
+	}
+
+	/*
+	 * We fixed up user space. Now we need to fix the pi_state
+	 * itself.
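+	 *
+	 * To recap what the loop above guarantees: newval keeps only
+	 * the FUTEX_OWNER_DIED bit of uval; the TID bits are replaced
+	 * by the new owner's TID and FUTEX_WAITERS stays set via
+	 * newtid. If the cmpxchg faults, we drop the hash bucket
+	 * lock, resolve the fault and retry from the top, because the
+	 * user space value may have changed in the meantime.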
+	 */
 	if (pi_state->owner != NULL) {
 		spin_lock_irq(&pi_state->owner->pi_lock);
 		WARN_ON(list_empty(&pi_state->list));
 		list_del_init(&pi_state->list);
 		spin_unlock_irq(&pi_state->owner->pi_lock);
-	} else
-		newtid |= FUTEX_OWNER_DIED;
+	}
 
 	pi_state->owner = newowner;
 
@@ -1129,26 +1183,35 @@ static int fixup_pi_state_owner(u32 __us
 	WARN_ON(!list_empty(&pi_state->list));
 	list_add(&pi_state->list, &newowner->pi_state_list);
 	spin_unlock_irq(&newowner->pi_lock);
+	return 0;
 
 	/*
-	 * We own it, so we have to replace the pending owner
-	 * TID. This must be atomic as we have preserve the
-	 * owner died bit here.
+	 * To handle the page fault we need to drop the hash bucket
+	 * lock here. That gives the other task (either the pending
+	 * owner itself or the task which stole the rtmutex) the
+	 * chance to try the fixup of the pi_state. So once we are
+	 * back from handling the fault we need to check the pi_state
+	 * after reacquiring the hash bucket lock and before trying to
+	 * do another fixup. When the fixup has been done already we
+	 * simply return.
	 */
-	ret = get_futex_value_locked(&uval, uaddr);
+handle_fault:
+	spin_unlock(q->lock_ptr);
 
-	while (!ret) {
-		newval = (uval & FUTEX_OWNER_DIED) | newtid;
+	ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
 
-		curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+	spin_lock(q->lock_ptr);
 
-		if (curval == -EFAULT)
-			ret = -EFAULT;
-		if (curval == uval)
-			break;
-		uval = curval;
-	}
-	return ret;
+	/*
+	 * Check if someone else fixed it for us:
+	 */
+	if (pi_state->owner != oldowner)
+		return 0;
+
+	if (ret)
+		return ret;
+
+	goto retry;
 }
 
 /*
@@ -1181,6 +1244,15 @@ static int futex_wait(u32 __user *uaddr,
 
 	hb = queue_lock(&q, -1, NULL);
 
+	if (futex_rt_pi_warning && unlikely(rt_task(curr))) {
+		if (printk_ratelimit()) {
+			printk(KERN_WARNING
+				"RT task %s:%d with priority %d"
+				" using non PI futex\n",
+				current->comm, current->pid,
+				MAX_RT_PRIO - current->prio);
+		}
+	}
 	/*
 	 * Access the page AFTER the futex is queued.
 	 * Order is important:
@@ -1248,6 +1320,10 @@ static int futex_wait(u32 __user *uaddr,
 	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
	 */
 	if (likely(!plist_node_empty(&q.list))) {
+		unsigned long nosched_flag = current->flags & PF_NOSCHED;
+
+		current->flags &= ~PF_NOSCHED;
+
 		if (!abs_time)
 			schedule();
 		else {
@@ -1270,6 +1346,8 @@ static int futex_wait(u32 __user *uaddr,
 			/* Flag if a timeout occured */
 			rem = (t.task == NULL);
 		}
+
+		current->flags |= nosched_flag;
 	}
 	__set_current_state(TASK_RUNNING);
 
@@ -1505,7 +1583,7 @@ static int futex_lock_pi(u32 __user *uad
 		 * that case:
 		 */
 		if (q.pi_state->owner != curr)
-			ret = fixup_pi_state_owner(uaddr, &q, curr);
+			ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
 	} else {
 		/*
 		 * Catch the rare case, where the lock was released
@@ -1537,10 +1615,8 @@ static int futex_lock_pi(u32 __user *uad
 			int res;
 
 			owner = rt_mutex_owner(&q.pi_state->pi_mutex);
-			res = fixup_pi_state_owner(uaddr, &q, owner);
-
-			WARN_ON(rt_mutex_owner(&q.pi_state->pi_mutex) !=
-				owner);
+			res = fixup_pi_state_owner(uaddr, &q, owner,
+						   fshared);
 
 			/* propagate -EFAULT, if the fixup failed */
 			if (res)
@@ -2156,7 +2232,11 @@ static int __init init(void)
 		futex_cmpxchg_enabled = 1;
 
 	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+#ifdef CONFIG_PREEMPT_RT
+		plist_head_init(&futex_queues[i].chain, NULL);
+#else
 		plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
+#endif
 		spin_lock_init(&futex_queues[i].lock);
 	}
 
Index: linux-2.6.24.7-rt27/arch/x86/lib/copy_user_64.S
===================================================================
--- linux-2.6.24.7-rt27.orig/arch/x86/lib/copy_user_64.S	2009-02-08 00:00:39.000000000 -0500
+++ linux-2.6.24.7-rt27/arch/x86/lib/copy_user_64.S	2009-02-08 00:00:40.000000000 -0500
@@ -217,19 +217,19 @@ ENTRY(copy_user_generic_unrolled)
 	/* table sorted by exception address */
 	.section __ex_table,"a"
 	.align 8
-	.quad .Ls1,.Ls1e
-	.quad .Ls2,.Ls2e
-	.quad .Ls3,.Ls3e
-	.quad .Ls4,.Ls4e
-	.quad .Ld1,.Ls1e
+	.quad .Ls1,.Ls1e	/* Ls1-Ls4 have copied zero bytes */
+	.quad .Ls2,.Ls1e
+	.quad .Ls3,.Ls1e
+	.quad .Ls4,.Ls1e
+	.quad .Ld1,.Ls1e	/* Ld1-Ld4 have copied 0-24 bytes */
 	.quad .Ld2,.Ls2e
 	.quad .Ld3,.Ls3e
 	.quad .Ld4,.Ls4e
-	.quad .Ls5,.Ls5e
-	.quad .Ls6,.Ls6e
-	.quad .Ls7,.Ls7e
-	.quad .Ls8,.Ls8e
-	.quad .Ld5,.Ls5e
+	.quad .Ls5,.Ls5e	/* Ls5-Ls8 have copied 32 bytes */
+	.quad .Ls6,.Ls5e
+	.quad .Ls7,.Ls5e
+	.quad .Ls8,.Ls5e
+	.quad .Ld5,.Ls5e	/* Ld5-Ld8 have copied 32-56 bytes */
 	.quad .Ld6,.Ls6e
 	.quad .Ld7,.Ls7e
 	.quad .Ld8,.Ls8e
@@ -244,11 +244,8 @@ ENTRY(copy_user_generic_unrolled)
 	.quad .Le5,.Le_zero
 	.previous
 
-	/* compute 64-offset for main loop. 8 bytes accuracy with error on the
-	   pessimistic side. this is gross. it would be better to fix the
-	   interface. */
 	/* eax: zero, ebx: 64 */
-.Ls1e:	addl $8,%eax
+.Ls1e:	addl $8,%eax	/* eax is bytes left uncopied within the loop (Ls1e: 64 ..
Ls8e: 8) */
 .Ls2e:	addl $8,%eax
 .Ls3e:	addl $8,%eax
 .Ls4e:	addl $8,%eax
Index: linux-2.6.24.7-rt27/arch/x86/lib/copy_user_nocache_64.S
===================================================================
--- linux-2.6.24.7-rt27.orig/arch/x86/lib/copy_user_nocache_64.S	2009-02-08 00:00:39.000000000 -0500
+++ linux-2.6.24.7-rt27/arch/x86/lib/copy_user_nocache_64.S	2009-02-08 00:00:40.000000000 -0500
@@ -145,19 +145,19 @@ ENTRY(__copy_user_nocache)
 	/* table sorted by exception address */
 	.section __ex_table,"a"
 	.align 8
-	.quad .Ls1,.Ls1e
-	.quad .Ls2,.Ls2e
-	.quad .Ls3,.Ls3e
-	.quad .Ls4,.Ls4e
-	.quad .Ld1,.Ls1e
+	.quad .Ls1,.Ls1e	/* .Ls[1-4] - 0 bytes copied */
+	.quad .Ls2,.Ls1e
+	.quad .Ls3,.Ls1e
+	.quad .Ls4,.Ls1e
+	.quad .Ld1,.Ls1e	/* .Ld[1-4] - 0..24 bytes copied */
 	.quad .Ld2,.Ls2e
 	.quad .Ld3,.Ls3e
 	.quad .Ld4,.Ls4e
-	.quad .Ls5,.Ls5e
-	.quad .Ls6,.Ls6e
-	.quad .Ls7,.Ls7e
-	.quad .Ls8,.Ls8e
-	.quad .Ld5,.Ls5e
+	.quad .Ls5,.Ls5e	/* .Ls[5-8] - 32 bytes copied */
+	.quad .Ls6,.Ls5e
+	.quad .Ls7,.Ls5e
+	.quad .Ls8,.Ls5e
+	.quad .Ld5,.Ls5e	/* .Ld[5-8] - 32..56 bytes copied */
 	.quad .Ld6,.Ls6e
 	.quad .Ld7,.Ls7e
 	.quad .Ld8,.Ls8e
@@ -172,11 +172,8 @@ ENTRY(__copy_user_nocache)
 	.quad .Le5,.Le_zero
 	.previous
 
-	/* compute 64-offset for main loop. 8 bytes accuracy with error on the
-	   pessimistic side. this is gross. it would be better to fix the
-	   interface. */
 	/* eax: zero, ebx: 64 */
-.Ls1e:	addl $8,%eax
+.Ls1e:	addl $8,%eax	/* eax: bytes left uncopied: Ls1e: 64 .. Ls8e: 8 */
 .Ls2e:	addl $8,%eax
 .Ls3e:	addl $8,%eax
 .Ls4e:	addl $8,%eax
Index: linux-2.6.24.7-rt27/mm/memory.c
===================================================================
--- linux-2.6.24.7-rt27.orig/mm/memory.c	2009-02-08 00:00:39.000000000 -0500
+++ linux-2.6.24.7-rt27/mm/memory.c	2009-02-08 00:03:13.000000000 -0500
@@ -261,18 +261,52 @@ void free_pgd_range(struct mmu_gather **
 	} while (pgd++, addr = next, addr != end);
 }
 
+#ifdef CONFIG_IA64
+#define tlb_start_addr(tlb) (tlb)->start_addr
+#define tlb_end_addr(tlb) (tlb)->end_addr
+#else
+#define tlb_start_addr(tlb) 0UL /* only ia64 really uses it */
+#define tlb_end_addr(tlb) 0UL /* only ia64 really uses it */
+#endif
+
 void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 		unsigned long floor, unsigned long ceiling)
 {
+#ifdef CONFIG_PREEMPT
+	struct vm_area_struct *unlink = vma;
+	int fullmm = (*tlb)->fullmm;
+
+	if (!vma)	/* Sometimes when exiting after an oops */
+		return;
+#ifndef CONFIG_PREEMPT_RT
+	if (vma->vm_next)
+#endif
+		tlb_finish_mmu(*tlb, tlb_start_addr(*tlb), tlb_end_addr(*tlb));
+	/*
+	 * Hide vma from rmap and vmtruncate before freeing pgtables,
+	 * with preemption enabled, except when unmapping just one area.
+	 */
+	while (unlink) {
+		anon_vma_unlink(unlink);
+		unlink_file_vma(unlink);
+		unlink = unlink->vm_next;
+	}
+#ifndef CONFIG_PREEMPT_RT
+	if (vma->vm_next)
+#endif
+		*tlb = tlb_gather_mmu(vma->vm_mm, fullmm);
+#endif
 	while (vma) {
 		struct vm_area_struct *next = vma->vm_next;
 		unsigned long addr = vma->vm_start;
 
+#ifndef CONFIG_PREEMPT
 		/*
 		 * Hide vma from rmap and vmtruncate before freeing pgtables
 		 */
 		anon_vma_unlink(vma);
 		unlink_file_vma(vma);
+#endif
 
 		if (is_vm_hugetlb_page(vma)) {
 			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
@@ -285,8 +319,10 @@ void free_pgtables(struct mmu_gather **t
 				&& !is_vm_hugetlb_page(next)) {
 				vma = next;
 				next = vma->vm_next;
+#ifndef CONFIG_PREEMPT
 				anon_vma_unlink(vma);
 				unlink_file_vma(vma);
+#endif
 			}
 			free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next?
next->vm_start: ceiling); @@ -772,10 +808,13 @@ static unsigned long unmap_page_range(st return addr; } -#ifdef CONFIG_PREEMPT +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_RT) # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) #else -/* No preempt: go for improved straight-line efficiency */ +/* + * No preempt: go for improved straight-line efficiency + * on PREEMPT_RT this is not a critical latency-path. + */ # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) #endif @@ -934,17 +973,15 @@ struct page *follow_page(struct vm_area_ } ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!ptep) - goto out; pte = *ptep; if (!pte_present(pte)) - goto unlock; + goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; page = vm_normal_page(vma, address, pte); if (unlikely(!page)) - goto unlock; + goto bad_page; if (flags & FOLL_GET) get_page(page); @@ -959,6 +996,15 @@ unlock: out: return page; +bad_page: + pte_unmap_unlock(ptep, ptl); + return ERR_PTR(-EFAULT); + +no_page: + pte_unmap_unlock(ptep, ptl); + if (!pte_none(pte)) + return page; + /* Fall through to ZERO_PAGE handling */ no_page_table: /* * When core dumping an enormous anonymous area that nobody @@ -973,6 +1019,26 @@ no_page_table: return page; } +/* Can we do the FOLL_ANON optimization? */ +static inline int use_zero_page(struct vm_area_struct *vma) +{ + /* + * We don't want to optimize FOLL_ANON for make_pages_present() + * when it tries to page in a VM_LOCKED region. As to VM_SHARED, + * we want to get the page from the page tables to make sure + * that we serialize and update with any other user of that + * mapping. + */ + if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) + return 0; + /* + * And if we have a fault or a nopfn routine, it's not an + * anonymous region. + */ + return !vma->vm_ops || + (!vma->vm_ops->fault && !vma->vm_ops->nopfn); +} + int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) @@ -1047,9 +1113,7 @@ int get_user_pages(struct task_struct *t foll_flags = FOLL_TOUCH; if (pages) foll_flags |= FOLL_GET; - if (!write && !(vma->vm_flags & VM_LOCKED) && - (!vma->vm_ops || (!vma->vm_ops->nopage && - !vma->vm_ops->fault))) + if (!write && use_zero_page(vma)) foll_flags |= FOLL_ANON; do { @@ -1095,6 +1159,8 @@ int get_user_pages(struct task_struct *t cond_resched(); } + if (IS_ERR(page)) + return i ? i : PTR_ERR(page); if (pages) { pages[i] = page; @@ -1639,7 +1705,6 @@ gotten: page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { - page_remove_rmap(old_page, vma); if (!PageAnon(old_page)) { dec_mm_counter(mm, file_rss); inc_mm_counter(mm, anon_rss); @@ -1661,6 +1726,32 @@ gotten: lru_cache_add_active(new_page); page_add_new_anon_rmap(new_page, vma, address); + if (old_page) { + /* + * Only after switching the pte to the new page may + * we remove the mapcount here. Otherwise another + * process may come and find the rmap count decremented + * before the pte is switched to the new page, and + * "reuse" the old page writing into it while our pte + * here still points into it and can be read by other + * threads. + * + * The critical issue is to order this + * page_remove_rmap with the ptp_clear_flush above. + * Those stores are ordered by (if nothing else,) + * the barrier present in the atomic_add_negative + * in page_remove_rmap. 
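+		 * (atomic_add_negative() is a value-returning atomic
+		 * operation and therefore implies a full memory
+		 * barrier on all architectures - see
+		 * Documentation/atomic_ops.txt - which is what makes
+		 * the ordering above reliable.)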
+ * + * Then the TLB flush in ptep_clear_flush ensures that + * no process can access the old page before the + * decremented mapcount is visible. And the old page + * cannot be reused until after the decremented + * mapcount is visible. So transitively, TLBs to + * old page will be flushed before it can be reused. + */ + page_remove_rmap(old_page, vma); + } + /* Free the old page.. */ new_page = old_page; ret |= VM_FAULT_WRITE; @@ -2522,6 +2613,28 @@ unlock: return 0; } +void pagefault_disable(void) +{ + current->pagefault_disabled++; + /* + * make sure to have issued the store before a pagefault + * can hit. + */ + barrier(); +} +EXPORT_SYMBOL(pagefault_disable); + +void pagefault_enable(void) +{ + /* + * make sure to issue those last loads/stores before enabling + * the pagefault handler again. + */ + barrier(); + current->pagefault_disabled--; +} +EXPORT_SYMBOL(pagefault_enable); + /* * By the time we get here, we already hold the mm semaphore */ Index: linux-2.6.24.7-rt27/kernel/hrtimer.c =================================================================== --- linux-2.6.24.7-rt27.orig/kernel/hrtimer.c 2009-02-08 00:00:39.000000000 -0500 +++ linux-2.6.24.7-rt27/kernel/hrtimer.c 2009-02-08 00:05:25.000000000 -0500 @@ -44,6 +44,8 @@ #include #include +#include + #include /** @@ -378,9 +380,9 @@ static inline int hrtimer_is_hres_enable /* * Is the high resolution mode active ? */ -static inline int hrtimer_hres_active(void) +static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { - return __get_cpu_var(hrtimer_bases).hres_active; + return cpu_base->hres_active; } /* @@ -403,6 +405,13 @@ static void hrtimer_force_reprogram(stru continue; timer = rb_entry(base->first, struct hrtimer, node); expires = ktime_sub(timer->expires, base->offset); + /* + * clock_was_set() has changed base->offset so the + * result might be negative. 
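+		 * (base->offset for CLOCK_REALTIME is the realtime to
+		 * monotonic delta; after a large forward clock jump it
+		 * can exceed a pending timer's absolute expiry, making
+		 * the subtraction above go negative.)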
Fix it up to prevent a + * false positive in clockevents_program_event() + */ + if (expires.tv64 < 0) + expires.tv64 = 0; if (expires.tv64 < cpu_base->expires_next.tv64) cpu_base->expires_next = expires; } @@ -468,11 +477,12 @@ static int hrtimer_reprogram(struct hrti */ static void retrigger_next_event(void *arg) { - struct hrtimer_cpu_base *base; + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + struct timespec realtime_offset; unsigned long seq; - if (!hrtimer_hres_active()) + if (!hrtimer_hres_active(base)) return; do { @@ -482,8 +492,6 @@ static void retrigger_next_event(void *a -wall_to_monotonic.tv_nsec); } while (read_seqretry(&xtime_lock, seq)); - base = &__get_cpu_var(hrtimer_bases); - /* Adjust CLOCK_REALTIME offset */ spin_lock(&base->lock); base->clock_base[CLOCK_REALTIME].offset = @@ -593,7 +601,6 @@ static inline int hrtimer_enqueue_reprog list_add_tail(&timer->cb_entry, &base->cpu_base->cb_pending); timer->state = HRTIMER_STATE_PENDING; - raise_softirq(HRTIMER_SOFTIRQ); return 1; default: BUG(); @@ -605,10 +612,8 @@ static inline int hrtimer_enqueue_reprog /* * Switch to high resolution mode */ -static int hrtimer_switch_to_hres(void) +static int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base) { - int cpu = smp_processor_id(); - struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; if (base->hres_active) @@ -619,7 +624,7 @@ static int hrtimer_switch_to_hres(void) if (tick_init_highres()) { local_irq_restore(flags); printk(KERN_WARNING "Could not switch to high resolution " - "mode on CPU %d\n", cpu); + "mode on CPU %d\n", raw_smp_processor_id()); return 0; } base->hres_active = 1; @@ -631,16 +636,25 @@ static int hrtimer_switch_to_hres(void) /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); local_irq_restore(flags); - printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n", - smp_processor_id()); return 1; } +static inline void hrtimer_raise_softirq(void) +{ + raise_softirq(HRTIMER_SOFTIRQ); +} + #else -static inline int hrtimer_hres_active(void) { return 0; } +static inline int hrtimer_hres_active(struct hrtimer_cpu_base *base) +{ + return 0; +} static inline int hrtimer_is_hres_enabled(void) { return 0; } -static inline int hrtimer_switch_to_hres(void) { return 0; } +static inline int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base) +{ + return 0; +} static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) @@ -651,6 +665,7 @@ static inline int hrtimer_cb_pending(str static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } +static inline void hrtimer_raise_softirq(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -712,11 +727,39 @@ hrtimer_forward(struct hrtimer *timer, k orun++; } timer->expires = ktime_add_safe(timer->expires, interval); + /* + * Make sure, that the result did not wrap with a very large + * interval. 
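+	 * A negative tv64 would be interpreted as an expiry in the
+	 * past and make the timer fire immediately, over and over;
+	 * pinning a wrapped value to KTIME_SEC_MAX seconds avoids
+	 * that.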
+	 */
+	if (timer->expires.tv64 < 0)
+		timer->expires = ktime_set(KTIME_SEC_MAX, 0);
 
 	return orun;
 }
 EXPORT_SYMBOL_GPL(hrtimer_forward);
 
+unsigned long
+hrtimer_overrun(struct hrtimer *timer, ktime_t now, ktime_t interval)
+{
+	unsigned long orun = 1;
+	ktime_t delta;
+
+	delta = ktime_sub(now, timer->expires);
+
+	if (delta.tv64 < 0)
+		return 0;
+
+	if (interval.tv64 < timer->base->resolution.tv64)
+		interval.tv64 = timer->base->resolution.tv64;
+
+	if (unlikely(delta.tv64 >= interval.tv64))
+		orun = ktime_divns(delta, ktime_to_ns(interval)) + 1;
+
+	return orun;
+}
+EXPORT_SYMBOL_GPL(hrtimer_overrun);
+
+
 /*
  * enqueue_hrtimer - internal function to (re)start a timer
  *
@@ -731,6 +774,7 @@ static void enqueue_hrtimer(struct hrtim
 	struct hrtimer *entry;
 	int leftmost = 1;
 
+	ftrace_event_timer_set(&timer->expires, timer);
 	/*
 	 * Find the right place in the rbtree:
 	 */
@@ -802,7 +846,7 @@ static void __remove_hrtimer(struct hrti
 	if (base->first == &timer->node) {
 		base->first = rb_next(&timer->node);
 		/* Reprogram the clock event device. if enabled */
-		if (reprogram && hrtimer_hres_active())
+		if (reprogram && hrtimer_hres_active(base->cpu_base))
 			hrtimer_force_reprogram(base->cpu_base);
 	}
 	rb_erase(&timer->node, &base->active);
@@ -852,6 +896,7 @@ hrtimer_start(struct hrtimer *timer, kti
 	struct hrtimer_clock_base *base, *new_base;
 	unsigned long flags;
 	int ret;
+	int raise;
 
 	base = lock_hrtimer_base(timer, &flags);
 
@@ -885,8 +930,26 @@ hrtimer_start(struct hrtimer *timer, kti
 	enqueue_hrtimer(timer, new_base,
 			new_base->cpu_base == &__get_cpu_var(hrtimer_bases));
 
+	/*
+	 * The timer may be expired and moved to the cb_pending
+	 * list. We can not raise the softirq with base lock held due
+	 * to a possible deadlock with runqueue lock.
+	 */
+	raise = timer->state == HRTIMER_STATE_PENDING;
+
+	/*
+	 * We use preempt_disable to prevent this task from migrating after
+	 * setting up the softirq and raising it. Otherwise, if we migrate
+	 * we will raise the softirq on the wrong CPU.
+	 */
+	preempt_disable();
+
 	unlock_hrtimer_base(timer, &flags);
 
+	if (raise)
+		hrtimer_raise_softirq();
+	preempt_enable();
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(hrtimer_start);
@@ -934,7 +997,7 @@ int hrtimer_cancel(struct hrtimer *timer
 		if (ret >= 0)
 			return ret;
-		cpu_relax();
+		hrtimer_wait_for_timer(timer);
 	}
 }
 EXPORT_SYMBOL_GPL(hrtimer_cancel);
@@ -974,7 +1037,7 @@ ktime_t hrtimer_get_next_event(void)
 
 	spin_lock_irqsave(&cpu_base->lock, flags);
 
-	if (!hrtimer_hres_active()) {
+	if (!hrtimer_hres_active(cpu_base)) {
 		for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
 			struct hrtimer *timer;
 
@@ -1045,6 +1108,32 @@ int hrtimer_get_res(const clockid_t whic
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_res);
 
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+# define wake_up_timer_waiters(b)	wake_up(&(b)->wait)
+
+/**
+ * hrtimer_wait_for_timer - Wait for a running timer
+ *
+ * @timer:	timer to wait for
+ *
+ * The function waits in case the timer's callback function is
+ * currently executed on the waitqueue of the timer base. The
+ * waitqueue is woken up after the timer callback function has
+ * finished execution.
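+ *
+ * The caller side, as used by hrtimer_cancel() above, then looks
+ * like this (a sketch, not literal code from this patch):
+ *
+ *	for (;;) {
+ *		ret = hrtimer_try_to_cancel(timer);
+ *		if (ret >= 0)
+ *			return ret;
+ *		hrtimer_wait_for_timer(timer);
+ *	}
+ *
+ * i.e. instead of spinning in cpu_relax() until the running callback
+ * is done, the task sleeps and is woken by wake_up_timer_waiters()
+ * once the callback has finished.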
+ */ +void hrtimer_wait_for_timer(const struct hrtimer *timer) +{ + struct hrtimer_clock_base *base = timer->base; + + if (base && base->cpu_base) + wait_event(base->cpu_base->wait, + !(timer->state & HRTIMER_STATE_CALLBACK)); +} + +#else +# define wake_up_timer_waiters(b) do { } while (0) +#endif + #ifdef CONFIG_HIGH_RES_TIMERS /* @@ -1064,6 +1153,7 @@ void hrtimer_interrupt(struct clock_even retry: now = ktime_get(); + ftrace_event_timestamp(&now); expires_next.tv64 = KTIME_MAX; @@ -1092,6 +1182,8 @@ void hrtimer_interrupt(struct clock_even break; } + ftrace_event_timer_triggered(&timer->expires, timer); + /* Move softirq callbacks to the pending list */ if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { __remove_hrtimer(timer, base, @@ -1137,7 +1229,9 @@ void hrtimer_interrupt(struct clock_even static void run_hrtimer_softirq(struct softirq_action *h) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base; + + cpu_base = &per_cpu(hrtimer_bases, raw_smp_processor_id()); spin_lock_irq(&cpu_base->lock); @@ -1172,11 +1266,24 @@ static void run_hrtimer_softirq(struct s * If the timer was rearmed on another CPU, reprogram * the event device. */ - if (timer->base->first == &timer->node) - hrtimer_reprogram(timer, timer->base); + struct hrtimer_clock_base *base = timer->base; + + if (base->first == &timer->node && + hrtimer_reprogram(timer, base)) { + /* + * Timer is expired. Thus move it from tree to + * pending list again. + */ + __remove_hrtimer(timer, base, + HRTIMER_STATE_PENDING, 0); + list_add_tail(&timer->cb_entry, + &base->cpu_base->cb_pending); + } } } spin_unlock_irq(&cpu_base->lock); + + wake_up_timer_waiters(cpu_base); } #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -1227,6 +1334,8 @@ static inline void run_hrtimer_queue(str } } spin_unlock_irq(&cpu_base->lock); + + wake_up_timer_waiters(cpu_base); } /* @@ -1238,10 +1347,11 @@ static inline void run_hrtimer_queue(str */ void hrtimer_run_queues(void) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base; int i; - if (hrtimer_hres_active()) + cpu_base = &per_cpu(hrtimer_bases, raw_smp_processor_id()); + if (hrtimer_hres_active(cpu_base)) return; /* @@ -1253,7 +1363,7 @@ void hrtimer_run_queues(void) * deadlock vs. xtime_lock. 
*/ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) - if (hrtimer_switch_to_hres()) + if (hrtimer_switch_to_hres(cpu_base)) return; hrtimer_get_softirq_time(cpu_base); @@ -1408,6 +1518,9 @@ static void __cpuinit init_hrtimers_cpu( cpu_base->clock_base[i].cpu_base = cpu_base; hrtimer_init_hres(cpu_base); +#ifdef CONFIG_PREEMPT_SOFTIRQS + init_waitqueue_head(&cpu_base->wait); +#endif } #ifdef CONFIG_HOTPLUG_CPU @@ -1442,7 +1555,7 @@ static void migrate_hrtimers(int cpu) tick_cancel_sched_timer(cpu); local_irq_disable(); - double_spin_lock(&new_base->lock, &old_base->lock, + raw_double_spin_lock(&new_base->lock, &old_base->lock, smp_processor_id() < cpu); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { @@ -1450,7 +1563,7 @@ static void migrate_hrtimers(int cpu) &new_base->clock_base[i]); } - double_spin_unlock(&new_base->lock, &old_base->lock, + raw_double_spin_unlock(&new_base->lock, &old_base->lock, smp_processor_id() < cpu); local_irq_enable(); put_cpu_var(hrtimer_bases); Index: linux-2.6.24.7-rt27/fs/select.c =================================================================== --- linux-2.6.24.7-rt27.orig/fs/select.c 2009-02-08 00:00:39.000000000 -0500 +++ linux-2.6.24.7-rt27/fs/select.c 2009-02-08 00:03:14.000000000 -0500 @@ -407,20 +407,12 @@ asmlinkage long sys_select(int n, fd_set rtv.tv_sec = timeout; if (timeval_compare(&rtv, &tv) >= 0) rtv = tv; - if (copy_to_user(tvp, &rtv, sizeof(rtv))) { -sticky: - /* - * If an application puts its timeval in read-only - * memory, we don't want the Linux-specific update to - * the timeval to cause a fault after the select has - * completed successfully. However, because we're not - * updating the timeval, we can't restart the system - * call. - */ - if (ret == -ERESTARTNOHAND) - ret = -EINTR; - } + if (copy_to_user(tvp, &rtv, sizeof(rtv))) + return -EFAULT; } +sticky: + if (ret == -ERESTARTNOHAND) + ret = -EINTR; return ret; } @@ -739,7 +731,7 @@ asmlinkage long sys_poll(struct pollfd _ timeout_jiffies = -1; else #endif - timeout_jiffies = msecs_to_jiffies(timeout_msecs); + timeout_jiffies = msecs_to_jiffies(timeout_msecs) + 1; } else { /* Infinite (< 0) or no (0) timeout */ timeout_jiffies = timeout_msecs; Index: linux-2.6.24.7-rt27/arch/x86/kernel/entry_64.S =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/entry_64.S 2009-02-08 00:00:39.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/entry_64.S 2009-02-08 00:05:15.000000000 -0500 @@ -53,6 +53,85 @@ .code64 +#ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE +ENTRY(mcount) + retq +END(mcount) + +ENTRY(ftrace_caller) + + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + +.globl ftrace_stub +ftrace_stub: + retq +END(ftrace_caller) + +#else /* ! 
CONFIG_DYNAMIC_FTRACE */ +ENTRY(mcount) + cmpq $ftrace_stub, ftrace_trace_function + jnz trace +.globl ftrace_stub +ftrace_stub: + retq + +trace: + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + + call *ftrace_trace_function + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ + +#define HARDNMI_MASK 0x40000000 + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif @@ -234,7 +313,10 @@ ENTRY(system_call) cmpq $__NR_syscall_max,%rax ja badsys movq %r10,%rcx + TRACE_SYS_CALL call *sys_call_table(,%rax,8) # XXX: rip relative +system_call_ret: + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) /* * Syscall return path ending with SYSRET (fast path) @@ -268,8 +350,8 @@ sysret_check: /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz sysret_signal TRACE_IRQS_ON sti pushq %rdi @@ -292,7 +374,7 @@ sysret_signal: leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ cli @@ -304,7 +386,7 @@ badsys: jmp ret_from_sys_call /* Do syscall tracing */ -tracesys: +tracesys: SAVE_REST movq $-ENOSYS,RAX(%rsp) FIXUP_TOP_OF_STACK %rdi @@ -317,7 +399,10 @@ tracesys: cmova %rcx,%rax ja 1f movq %r10,%rcx /* fixup for C */ + TRACE_SYS_CALL call *sys_call_table(,%rax,8) +traceret: + TRACE_SYS_RET 1: movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ @@ -346,8 +431,8 @@ int_with_check: /* First do a reschedule test. 
*/ /* edx: work, edi: workmask */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz int_very_careful TRACE_IRQS_ON sti pushq %rdi @@ -382,7 +467,7 @@ int_signal: movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 call do_notify_resume -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi int_restore_rest: RESTORE_REST cli @@ -588,8 +673,8 @@ bad_iret: /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz retint_signal TRACE_IRQS_ON sti pushq %rdi @@ -615,7 +700,7 @@ retint_signal: RESTORE_REST cli TRACE_IRQS_OFF - movl $_TIF_NEED_RESCHED,%edi + movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi GET_THREAD_INFO(%rcx) jmp retint_check @@ -779,7 +864,7 @@ paranoid_swapgs\trace: swapgs paranoid_restore\trace: RESTORE_ALL 8 - iretq + jmp iret_label paranoid_userspace\trace: GET_THREAD_INFO(%rcx) movl threadinfo_flags(%rcx),%ebx Index: linux-2.6.24.7-rt27/arch/powerpc/platforms/chrp/setup.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/powerpc/platforms/chrp/setup.c 2009-02-08 00:00:38.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/powerpc/platforms/chrp/setup.c 2009-02-08 00:00:43.000000000 -0500 @@ -115,7 +115,7 @@ void chrp_show_cpuinfo(struct seq_file * seq_printf(m, "machine\t\t: CHRP %s\n", model); /* longtrail (goldengate) stuff */ - if (!strncmp(model, "IBM,LongTrail", 13)) { + if (model && !strncmp(model, "IBM,LongTrail", 13)) { /* VLSI VAS96011/12 `Golden Gate 2' */ /* Memory banks */ sdramen = (in_le32(gg2_pci_config_base + GG2_PCI_DRAM_CTRL) @@ -203,15 +203,20 @@ static void __init sio_fixup_irq(const c static void __init sio_init(void) { struct device_node *root; + const char *model; - if ((root = of_find_node_by_path("/")) && - !strncmp(of_get_property(root, "model", NULL), - "IBM,LongTrail", 13)) { + root = of_find_node_by_path("/"); + if (!root) + return; + + model = of_get_property(root, "model", NULL); + if (model && !strncmp(model,"IBM,LongTrail", 13)) { /* logical device 0 (KBC/Keyboard) */ sio_fixup_irq("keyboard", 0, 1, 2); /* select logical device 1 (KBC/Mouse) */ sio_fixup_irq("mouse", 1, 12, 2); } + of_node_put(root); } Index: linux-2.6.24.7-rt27/fs/cifs/asn1.c =================================================================== --- linux-2.6.24.7-rt27.orig/fs/cifs/asn1.c 2009-02-08 00:00:38.000000000 -0500 +++ linux-2.6.24.7-rt27/fs/cifs/asn1.c 2009-02-08 00:00:44.000000000 -0500 @@ -186,6 +186,11 @@ asn1_length_decode(struct asn1_ctx *ctx, } } } + + /* don't trust len bigger than ctx buffer */ + if (*len > ctx->end - ctx->pointer) + return 0; + return 1; } @@ -203,6 +208,10 @@ asn1_header_decode(struct asn1_ctx *ctx, if (!asn1_length_decode(ctx, &def, &len)) return 0; + /* primitive shall be definite, indefinite shall be constructed */ + if (*con == ASN1_PRI && !def) + return 0; + if (def) *eoc = ctx->pointer + len; else @@ -389,6 +398,11 @@ asn1_oid_decode(struct asn1_ctx *ctx, unsigned long *optr; size = eoc - ctx->pointer + 1; + + /* first subid actually encodes first two subids */ + if (size < 2 || size > ULONG_MAX/sizeof(unsigned long)) + return 0; + *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); if (*oid == NULL) return 0; Index: linux-2.6.24.7-rt27/net/ipv4/netfilter/nf_nat_snmp_basic.c 
=================================================================== --- linux-2.6.24.7-rt27.orig/net/ipv4/netfilter/nf_nat_snmp_basic.c 2009-02-08 00:00:38.000000000 -0500 +++ linux-2.6.24.7-rt27/net/ipv4/netfilter/nf_nat_snmp_basic.c 2009-02-08 00:00:44.000000000 -0500 @@ -231,6 +231,11 @@ static unsigned char asn1_length_decode( } } } + + /* don't trust len bigger than ctx buffer */ + if (*len > ctx->end - ctx->pointer) + return 0; + return 1; } @@ -249,6 +254,10 @@ static unsigned char asn1_header_decode( if (!asn1_length_decode(ctx, &def, &len)) return 0; + /* primitive shall be definite, indefinite shall be constructed */ + if (*con == ASN1_PRI && !def) + return 0; + if (def) *eoc = ctx->pointer + len; else @@ -433,6 +442,11 @@ static unsigned char asn1_oid_decode(str unsigned long *optr; size = eoc - ctx->pointer + 1; + + /* first subid actually encodes first two subids */ + if (size < 2 || size > ULONG_MAX/sizeof(unsigned long)) + return 0; + *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); if (*oid == NULL) { if (net_ratelimit()) Index: linux-2.6.24.7-rt27/net/ipv6/sit.c =================================================================== --- linux-2.6.24.7-rt27.orig/net/ipv6/sit.c 2009-02-08 00:00:38.000000000 -0500 +++ linux-2.6.24.7-rt27/net/ipv6/sit.c 2009-02-08 00:00:44.000000000 -0500 @@ -395,9 +395,9 @@ static int ipip6_rcv(struct sk_buff *skb } icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - kfree_skb(skb); read_unlock(&ipip6_lock); out: + kfree_skb(skb); return 0; } Index: linux-2.6.24.7-rt27/kernel/sched_fair.c =================================================================== --- linux-2.6.24.7-rt27.orig/kernel/sched_fair.c 2009-02-08 00:00:38.000000000 -0500 +++ linux-2.6.24.7-rt27/kernel/sched_fair.c 2009-02-08 00:05:00.000000000 -0500 @@ -185,6 +185,9 @@ static void __dequeue_entity(struct cfs_ if (cfs_rq->rb_leftmost == &se->run_node) cfs_rq->rb_leftmost = rb_next(&se->run_node); + if (cfs_rq->rb_load_balance_curr == &se->run_node) + cfs_rq->rb_load_balance_curr = rb_next(&se->run_node); + rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } @@ -263,12 +266,8 @@ static u64 __sched_period(unsigned long */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 slice = __sched_period(cfs_rq->nr_running); - - slice *= se->load.weight; - do_div(slice, cfs_rq->load.weight); - - return slice; + return calc_delta_mine(__sched_period(cfs_rq->nr_running), + se->load.weight, &cfs_rq->load); } /* @@ -757,10 +756,11 @@ static inline struct sched_entity *paren * increased. Here we update the fair scheduling stats and * then put the task into the rbtree: */ -static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) +static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int wakeup = flags & ENQUEUE_WAKEUP; for_each_sched_entity(se) { if (se->on_rq) @@ -776,10 +776,11 @@ static void enqueue_task_fair(struct rq * decreased. 
We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
-static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
+	int sleep = flags & DEQUEUE_SLEEP;
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
@@ -836,6 +837,154 @@ static void yield_task_fair(struct rq *r
 }
 
 /*
+ * wake_idle() will wake a task on an idle cpu if task->cpu is
+ * not idle and an idle cpu is available. The span of cpus to
+ * search starts with cpus closest then further out as needed,
+ * so we always favor a closer, idle cpu.
+ *
+ * Returns the CPU we should wake onto.
+ */
+#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+static int wake_idle(int cpu, struct task_struct *p)
+{
+	cpumask_t tmp;
+	struct sched_domain *sd;
+	int i;
+
+	/*
+	 * If it is idle, then it is the best cpu to run this task.
+	 *
+	 * This cpu is also the best, if it has more than one task already.
+	 * Siblings must be also busy (in most cases) as they didn't already
+	 * pick up the extra load from this cpu and hence we need not check
+	 * sibling runqueue info. This will avoid the checks and cache miss
+	 * penalties associated with that.
+	 */
+	if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
+		return cpu;
+
+	for_each_domain(cpu, sd) {
+		if (sd->flags & SD_WAKE_IDLE) {
+			cpus_and(tmp, sd->span, p->cpus_allowed);
+			for_each_cpu_mask(i, tmp) {
+				if (idle_cpu(i)) {
+					if (i != task_cpu(p)) {
+						schedstat_inc(p,
+							se.nr_wakeups_idle);
+					}
+					return i;
+				}
+			}
+		} else {
+			break;
+		}
+	}
+	return cpu;
+}
+#else
+static inline int wake_idle(int cpu, struct task_struct *p)
+{
+	return cpu;
+}
+#endif
+
+#ifdef CONFIG_SMP
+static int select_task_rq_fair(struct task_struct *p, int sync)
+{
+	int cpu, this_cpu;
+	struct rq *rq;
+	struct sched_domain *sd, *this_sd = NULL;
+	int new_cpu;
+
+	cpu      = task_cpu(p);
+	rq       = task_rq(p);
+	this_cpu = smp_processor_id();
+	new_cpu  = cpu;
+
+	if (cpu == this_cpu)
+		goto out_set_cpu;
+
+	for_each_domain(this_cpu, sd) {
+		if (cpu_isset(cpu, sd->span)) {
+			this_sd = sd;
+			break;
+		}
+	}
+
+	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+		goto out_set_cpu;
+
+	/*
+	 * Check for affine wakeup and passive balancing possibilities.
+	 */
+	if (this_sd) {
+		int idx = this_sd->wake_idx;
+		unsigned int imbalance;
+		unsigned long load, this_load;
+
+		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+
+		load = source_load(cpu, idx);
+		this_load = target_load(this_cpu, idx);
+
+		new_cpu = this_cpu; /* Wake to this CPU if we can */
+
+		if (this_sd->flags & SD_WAKE_AFFINE) {
+			unsigned long tl = this_load;
+			unsigned long tl_per_task;
+
+			/*
+			 * Attract cache-cold tasks on sync wakeups:
+			 */
+			if (sync && !task_hot(p, rq->clock, this_sd))
+				goto out_set_cpu;
+
+			schedstat_inc(p, se.nr_wakeups_affine_attempts);
+			tl_per_task = cpu_avg_load_per_task(this_cpu);
+
+			/*
+			 * If sync wakeup then subtract the (maximum possible)
+			 * effect of the currently running task from the load
+			 * of the current CPU:
+			 */
+			if (sync)
+				tl -= current->se.load.weight;
+
+			if ((tl <= load &&
+				tl + target_load(cpu, idx) <= tl_per_task) ||
+			       100*(tl + p->se.load.weight) <= imbalance*load) {
+				/*
+				 * This domain has SD_WAKE_AFFINE and
+				 * p is cache cold in this domain, and
+				 * there is no bad imbalance.
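+				 *
+				 * Worked example with hypothetical
+				 * numbers: for imbalance_pct == 125,
+				 * imbalance == 100 + 25/2 == 112, so
+				 * the second condition above accepts
+				 * the affine wakeup as long as this
+				 * CPU's load, including the woken
+				 * task, stays within 112% of the
+				 * source CPU's load.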
+ */ + schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(p, se.nr_wakeups_affine); + goto out_set_cpu; + } + } + + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + if (this_sd->flags & SD_WAKE_BALANCE) { + if (imbalance*this_load <= 100*load) { + schedstat_inc(this_sd, ttwu_move_balance); + schedstat_inc(p, se.nr_wakeups_passive); + goto out_set_cpu; + } + } + } + + new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ +out_set_cpu: + return wake_idle(new_cpu, p); +} +#endif /* CONFIG_SMP */ + + +/* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) @@ -971,7 +1120,7 @@ static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) + int *lb_flags, int *this_best_prio) { struct cfs_rq *busy_cfs_rq; long rem_load_move = max_load_move; @@ -1007,7 +1156,7 @@ load_balance_fair(struct rq *this_rq, in */ cfs_rq_iterator.arg = busy_cfs_rq; rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, - maxload, sd, idle, all_pinned, + maxload, sd, idle, lb_flags, this_best_prio, &cfs_rq_iterator); @@ -1041,6 +1190,12 @@ move_one_task_fair(struct rq *this_rq, i return 0; } + +static int +is_runnable_fair(struct rq *this_rq) +{ + return !!this_rq->cfs.nr_running; +} #endif /* @@ -1091,6 +1246,42 @@ static void task_new_fair(struct rq *rq, resched_task(rq->curr); } +/* + * Priority of the task has changed. Check to see if we preempt + * the current task. + */ +static void prio_changed_fair(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (running) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else + check_preempt_curr(rq, p); +} + +/* + * We switched to the sched_fair class. + */ +static void switched_to_fair(struct rq *rq, struct task_struct *p, + int running) +{ + /* + * We were most likely switched from sched_rt, so + * kick off the schedule if running, otherwise just see + * if we can still preempt the current task. + */ + if (running) + resched_task(rq->curr); + else + check_preempt_curr(rq, p); +} + /* Account for a task changing its policy or group. 
* * This routine is mostly called to set cfs_rq->curr field when a task @@ -1112,6 +1303,9 @@ static const struct sched_class fair_sch .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_fair, +#endif /* CONFIG_SMP */ .check_preempt_curr = check_preempt_wakeup, @@ -1121,11 +1315,15 @@ static const struct sched_class fair_sch #ifdef CONFIG_SMP .load_balance = load_balance_fair, .move_one_task = move_one_task_fair, + .is_runnable = is_runnable_fair, #endif .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_new = task_new_fair, + + .prio_changed = prio_changed_fair, + .switched_to = switched_to_fair, }; #ifdef CONFIG_SCHED_DEBUG Index: linux-2.6.24.7-rt27/net/ipv4/esp4.c =================================================================== --- linux-2.6.24.7-rt27.orig/net/ipv4/esp4.c 2009-02-08 00:00:38.000000000 -0500 +++ linux-2.6.24.7-rt27/net/ipv4/esp4.c 2009-02-08 00:00:45.000000000 -0500 @@ -165,7 +165,7 @@ static int esp_input(struct xfrm_state * int padlen; int err; - if (!pskb_may_pull(skb, sizeof(*esph))) + if (!pskb_may_pull(skb, sizeof(*esph) + esp->conf.ivlen)) goto out; if (elen <= 0 || (elen & (blksize-1))) Index: linux-2.6.24.7-rt27/net/ipv6/esp6.c =================================================================== --- linux-2.6.24.7-rt27.orig/net/ipv6/esp6.c 2009-02-08 00:00:38.000000000 -0500 +++ linux-2.6.24.7-rt27/net/ipv6/esp6.c 2009-02-08 00:00:45.000000000 -0500 @@ -155,7 +155,7 @@ static int esp6_input(struct xfrm_state int nfrags; int ret = 0; - if (!pskb_may_pull(skb, sizeof(*esph))) { + if (!pskb_may_pull(skb, sizeof(*esph) + esp->conf.ivlen)) { ret = -EINVAL; goto out; } Index: linux-2.6.24.7-rt27/fs/utimes.c =================================================================== --- linux-2.6.24.7-rt27.orig/fs/utimes.c 2009-02-08 00:00:38.000000000 -0500 +++ linux-2.6.24.7-rt27/fs/utimes.c 2009-02-08 00:00:45.000000000 -0500 @@ -38,9 +38,14 @@ asmlinkage long sys_utime(char __user *f #endif +static bool nsec_special(long nsec) +{ + return nsec == UTIME_OMIT || nsec == UTIME_NOW; +} + static bool nsec_valid(long nsec) { - if (nsec == UTIME_OMIT || nsec == UTIME_NOW) + if (nsec_special(nsec)) return true; return nsec >= 0 && nsec <= 999999999; @@ -114,7 +119,15 @@ long do_utimes(int dfd, char __user *fil newattrs.ia_mtime.tv_nsec = times[1].tv_nsec; newattrs.ia_valid |= ATTR_MTIME_SET; } - } else { + } + + /* + * If times is NULL or both times are either UTIME_OMIT or + * UTIME_NOW, then need to check permissions, because + * inode_change_ok() won't do it. 
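+	 *
+	 * E.g. plain utimes(path, NULL), or utimensat() with both
+	 * tv_nsec fields set to UTIME_NOW, takes this path: no
+	 * explicit timestamps are stored, so the permission checks
+	 * below must be done by hand.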
+ */ + if (!times || (nsec_special(times[0].tv_nsec) && + nsec_special(times[1].tv_nsec))) { error = -EACCES; if (IS_IMMUTABLE(inode)) goto dput_and_out; Index: linux-2.6.24.7-rt27/arch/powerpc/kernel/vdso.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/powerpc/kernel/vdso.c 2009-02-08 00:00:36.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/powerpc/kernel/vdso.c 2009-02-08 00:00:45.000000000 -0500 @@ -141,7 +141,7 @@ static void dump_one_vdso_page(struct pa printk("kpg: %p (c:%d,f:%08lx)", __va(page_to_pfn(pg) << PAGE_SHIFT), page_count(pg), pg->flags); - if (upg/* && pg != upg*/) { + if (upg && !IS_ERR(upg) /* && pg != upg*/) { printk(" upg: %p (c:%d,f:%08lx)", __va(page_to_pfn(upg) << PAGE_SHIFT), page_count(upg), Index: linux-2.6.24.7-rt27/mm/migrate.c =================================================================== --- linux-2.6.24.7-rt27.orig/mm/migrate.c 2009-02-08 00:00:36.000000000 -0500 +++ linux-2.6.24.7-rt27/mm/migrate.c 2009-02-08 00:03:09.000000000 -0500 @@ -295,6 +295,7 @@ static int migrate_page_move_mapping(str struct page *newpage, struct page *page) { void **pslot; + struct radix_tree_context ctx; if (!mapping) { /* Anonymous page without mapping */ @@ -303,14 +304,15 @@ static int migrate_page_move_mapping(str return 0; } - write_lock_irq(&mapping->tree_lock); - - pslot = radix_tree_lookup_slot(&mapping->page_tree, - page_index(page)); + init_radix_tree_context(&ctx, &mapping->page_tree); + lock_page_ref_irq(page); + radix_tree_lock(&ctx); + pslot = radix_tree_lookup_slot(ctx.tree, page_index(page)); if (page_count(page) != 2 + !!PagePrivate(page) || (struct page *)radix_tree_deref_slot(pslot) != page) { - write_unlock_irq(&mapping->tree_lock); + radix_tree_unlock(&ctx); + unlock_page_ref_irq(page); return -EAGAIN; } @@ -326,12 +328,8 @@ static int migrate_page_move_mapping(str #endif radix_tree_replace_slot(pslot, newpage); - - /* - * Drop cache reference from old page. - * We know this isn't the last reference. - */ - __put_page(page); + page->mapping = NULL; + radix_tree_unlock(&ctx); /* * If moved to a different zone then also account @@ -346,7 +344,13 @@ static int migrate_page_move_mapping(str __dec_zone_page_state(page, NR_FILE_PAGES); __inc_zone_page_state(newpage, NR_FILE_PAGES); - write_unlock_irq(&mapping->tree_lock); + unlock_page_ref_irq(page); + + /* + * Drop cache reference from old page. + * We know this isn't the last reference. + */ + __put_page(page); return 0; } @@ -823,6 +827,11 @@ static int do_move_pages(struct mm_struc goto set_status; page = follow_page(vma, pp->addr, FOLL_GET); + + err = PTR_ERR(page); + if (IS_ERR(page)) + goto set_status; + err = -ENOENT; if (!page) goto set_status; @@ -886,6 +895,11 @@ static int do_pages_stat(struct mm_struc goto set_status; page = follow_page(vma, pm->addr, 0); + + err = PTR_ERR(page); + if (IS_ERR(page)) + goto set_status; + err = -ENOENT; /* Use PageReserved to check for zero page */ if (!page || PageReserved(page)) Index: linux-2.6.24.7-rt27/fs/inotify_user.c =================================================================== --- linux-2.6.24.7-rt27.orig/fs/inotify_user.c 2009-02-08 00:00:35.000000000 -0500 +++ linux-2.6.24.7-rt27/fs/inotify_user.c 2009-02-08 00:00:46.000000000 -0500 @@ -248,6 +248,19 @@ inotify_dev_get_event(struct inotify_dev } /* + * inotify_dev_get_last_event - return the last event in the given dev's queue + * + * Caller must hold dev->ev_mutex. 
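+ *
+ * Unlike inotify_dev_get_event(), which returns the oldest queued
+ * event (the list head), this returns the newest one (the list
+ * tail) - the only event that a newly arriving event can be
+ * coalesced with.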
+ */ +static inline struct inotify_kernel_event * +inotify_dev_get_last_event(struct inotify_device *dev) +{ + if (list_empty(&dev->events)) + return NULL; + return list_entry(dev->events.prev, struct inotify_kernel_event, list); +} + +/* * inotify_dev_queue_event - event handler registered with core inotify, adds * a new event to the given device * @@ -273,7 +286,7 @@ static void inotify_dev_queue_event(stru put_inotify_watch(w); /* final put */ /* coalescing: drop this event if it is a dupe of the previous */ - last = inotify_dev_get_event(dev); + last = inotify_dev_get_last_event(dev); if (last && last->event.mask == mask && last->event.wd == wd && last->event.cookie == cookie) { const char *lastname = last->name; Index: linux-2.6.24.7-rt27/net/sctp/socket.c =================================================================== --- linux-2.6.24.7-rt27.orig/net/sctp/socket.c 2009-02-08 00:00:35.000000000 -0500 +++ linux-2.6.24.7-rt27/net/sctp/socket.c 2009-02-08 00:00:46.000000000 -0500 @@ -4391,7 +4391,9 @@ static int sctp_getsockopt_local_addrs_o if (copy_from_user(&getaddrs, optval, len)) return -EFAULT; - if (getaddrs.addr_num <= 0) return -EINVAL; + if (getaddrs.addr_num <= 0 || + getaddrs.addr_num >= (INT_MAX / sizeof(union sctp_addr))) + return -EINVAL; /* * For UDP-style sockets, id specifies the association to query. * If the id field is set to the value '0' then the locally bound Index: linux-2.6.24.7-rt27/arch/x86/ia32/ptrace32.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/ia32/ptrace32.c 2009-02-08 00:00:35.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/ia32/ptrace32.c 2009-02-08 00:00:47.000000000 -0500 @@ -75,10 +75,18 @@ static int putreg32(struct task_struct * R32(esi, rsi); R32(ebp, rbp); R32(eax, rax); - R32(orig_eax, orig_rax); R32(eip, rip); R32(esp, rsp); + case offsetof(struct user_regs_struct32, orig_eax): { + /* + * Sign-extend the value so that orig_eax = -1 + * causes (long)orig_rax < 0 tests to fire correctly. + */ + stack[offsetof(struct pt_regs, orig_rax)/8] = (long) (s32) val; + break; + } + case offsetof(struct user32, regs.eflags): { __u64 *flags = &stack[offsetof(struct pt_regs, eflags)/8]; val &= FLAG_MASK; Index: linux-2.6.24.7-rt27/arch/x86/kernel/signal_64.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/signal_64.c 2009-02-08 00:00:35.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/signal_64.c 2009-02-08 00:02:25.000000000 -0500 @@ -311,6 +311,35 @@ give_sigsegv: } /* + * Return -1L or the syscall number that @regs is executing. + */ +static long current_syscall(struct pt_regs *regs) +{ + /* + * We always sign-extend a -1 value being set here, + * so this is always either -1L or a syscall number. + */ + return regs->orig_rax; +} + +/* + * Return a value that is -EFOO if the system call in @regs->orig_rax + * returned an error. This only works for @regs from @current. + */ +static long current_syscall_ret(struct pt_regs *regs) +{ +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) + /* + * Sign-extend the value so (int)-EFOO becomes (long)-EFOO + * and will match correctly in comparisons. + */ + return (int) regs->rax; +#endif + return regs->rax; +} + +/* * OK, we're invoking a handler */ @@ -327,9 +356,9 @@ handle_signal(unsigned long sig, siginfo #endif /* Are we from a system call? */ - if ((long)regs->orig_rax >= 0) { + if (current_syscall(regs) >= 0) { /* If so, check system call restarting.. 
*/ - switch (regs->rax) { + switch (current_syscall_ret(regs)) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: regs->rax = -EINTR; @@ -394,6 +423,13 @@ static void do_signal(struct pt_regs *re int signr; sigset_t *oldset; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from @@ -430,10 +466,9 @@ static void do_signal(struct pt_regs *re } /* Did we come from a system call? */ - if ((long)regs->orig_rax >= 0) { + if (current_syscall(regs) >= 0) { /* Restart the system call - no handlers present */ - long res = regs->rax; - switch (res) { + switch (current_syscall_ret(regs)) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: Index: linux-2.6.24.7-rt27/arch/x86/kernel/ptrace_64.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/ptrace_64.c 2009-02-08 00:00:35.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/ptrace_64.c 2009-02-08 00:00:47.000000000 -0500 @@ -267,6 +267,16 @@ static int putreg(struct task_struct *ch return -EIO; child->thread.gs = value; return 0; + case offsetof(struct user_regs_struct, orig_rax): + /* + * Orig_rax is really just a flag with small positive + * and negative values, so make sure to always + * sign-extend it from 32 bits so that it works + * correctly regardless of whether we come from a + * 32-bit environment or not. + */ + value = (long) (s32) value; + break; case offsetof(struct user_regs_struct, eflags): value &= FLAG_MASK; tmp = get_stack_long(child, EFL_OFFSET); Index: linux-2.6.24.7-rt27/arch/x86/kernel/vsyscall_64.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/vsyscall_64.c 2009-02-08 00:00:35.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/vsyscall_64.c 2009-02-08 00:04:34.000000000 -0500 @@ -42,13 +42,8 @@ #include #include -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) notrace #define __syscall_clobber "r11","rcx","memory" -#define __pa_vsymbol(x) \ - ({unsigned long v; \ - extern char __vsyscall_0; \ - asm("" : "=r" (v) : "0" (x)); \ - ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); }) /* * vsyscall_gtod_data contains data that is : @@ -60,7 +55,7 @@ int __vgetcpu_mode __section_vgetcpu_mod struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = { - .lock = SEQLOCK_UNLOCKED, + .lock = __RAW_SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), .sysctl_enabled = 1, }; @@ -79,14 +74,40 @@ void update_vsyscall(struct timespec *wa unsigned long flags; write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + + if (likely(vsyscall_gtod_data.sysctl_enabled == 2)) { + struct timespec tmp = *(wall_time); + cycle_t (*vread)(void); + cycle_t now; + + vread = vsyscall_gtod_data.clock.vread; + if (likely(vread)) + now = vread(); + else + now = clock->read(); + + /* calculate interval: */ + now = (now - clock->cycle_last) & clock->mask; + /* convert to nsecs: */ + tmp.tv_nsec += ( now * clock->mult) >> clock->shift; + + while (tmp.tv_nsec >= NSEC_PER_SEC) { + tmp.tv_sec += 1; + tmp.tv_nsec -= NSEC_PER_SEC; + } + + vsyscall_gtod_data.wall_time_sec = tmp.tv_sec; + vsyscall_gtod_data.wall_time_nsec = tmp.tv_nsec; + } else { + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; + 
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; + } /* copy vsyscall data */ vsyscall_gtod_data.clock.vread = clock->vread; vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; vsyscall_gtod_data.clock.mask = clock->mask; vsyscall_gtod_data.clock.mult = clock->mult; vsyscall_gtod_data.clock.shift = clock->shift; - vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; - vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } @@ -102,7 +123,7 @@ static __always_inline void do_get_tz(st static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) { int ret; - asm volatile("vsysc2: syscall" + asm volatile("syscall" : "=a" (ret) : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); @@ -112,7 +133,7 @@ static __always_inline int gettimeofday( static __always_inline long time_syscall(long *t) { long secs; - asm volatile("vsysc1: syscall" + asm volatile("syscall" : "=a" (secs) : "0" (__NR_time),"D" (t) : __syscall_clobber); return secs; @@ -124,6 +145,26 @@ static __always_inline void do_vgettimeo unsigned seq; unsigned long mult, shift, nsec; cycle_t (*vread)(void); + + if (likely(__vsyscall_gtod_data.sysctl_enabled == 2)) { + struct timeval tmp; + + do { + barrier(); + tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; + tv->tv_usec = __vsyscall_gtod_data.wall_time_nsec; + barrier(); + tmp.tv_sec = __vsyscall_gtod_data.wall_time_sec; + tmp.tv_usec = __vsyscall_gtod_data.wall_time_nsec; + + } while (tmp.tv_usec != tv->tv_usec || + tmp.tv_sec != tv->tv_sec); + + tv->tv_usec /= NSEC_PER_MSEC; + tv->tv_usec *= USEC_PER_MSEC; + return; + } + do { seq = read_seqbegin(&__vsyscall_gtod_data.lock); @@ -132,7 +173,6 @@ static __always_inline void do_vgettimeo gettimeofday(tv,NULL); return; } - now = vread(); base = __vsyscall_gtod_data.clock.cycle_last; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; @@ -142,6 +182,7 @@ static __always_inline void do_vgettimeo nsec = __vsyscall_gtod_data.wall_time_nsec; } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + now = vread(); /* calculate interval: */ cycle_delta = (now - base) & mask; /* convert to nsecs: */ @@ -227,50 +268,10 @@ long __vsyscall(3) venosys_1(void) } #ifdef CONFIG_SYSCTL - -#define SYSCALL 0x050f -#define NOP2 0x9090 - -/* - * NOP out syscall in vsyscall page when not needed. - */ -static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - extern u16 vsysc1, vsysc2; - u16 __iomem *map1; - u16 __iomem *map2; - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - if (!write) - return ret; - /* gcc has some trouble with __va(__pa()), so just do it this - way. 
*/ - map1 = ioremap(__pa_vsymbol(&vsysc1), 2); - if (!map1) - return -ENOMEM; - map2 = ioremap(__pa_vsymbol(&vsysc2), 2); - if (!map2) { - ret = -ENOMEM; - goto out; - } - if (!vsyscall_gtod_data.sysctl_enabled) { - writew(SYSCALL, map1); - writew(SYSCALL, map2); - } else { - writew(NOP2, map1); - writew(NOP2, map2); - } - iounmap(map2); -out: - iounmap(map1); - return ret; -} - static ctl_table kernel_table2[] = { { .procname = "vsyscall64", .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = vsyscall_sysctl_change }, + .mode = 0644 }, {} }; @@ -279,7 +280,6 @@ static ctl_table kernel_root_table2[] = .child = kernel_table2 }, {} }; - #endif /* Assume __initcall executes before all user space. Hopefully kmod Index: linux-2.6.24.7-rt27/arch/m68knommu/Kconfig =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/Kconfig 2009-02-08 00:00:32.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/Kconfig 2009-02-08 00:02:07.000000000 -0500 @@ -29,6 +29,10 @@ config RWSEM_XCHGADD_ALGORITHM bool default n +config ASM_SEMAPHORES + bool + default y + config ARCH_HAS_ILOG2_U32 bool default n @@ -53,10 +57,22 @@ config GENERIC_CALIBRATE_DELAY bool default y +config GENERIC_TIME + bool + default y + +config GENERIC_CMOS_UPDATE + bool + default y + config TIME_LOW_RES bool default y +config GENERIC_CLOCKEVENTS + bool + default n + config NO_IOPORT def_bool y @@ -100,11 +116,14 @@ config M5206e config M520x bool "MCF520x" + select GENERIC_CLOCKEVENTS help Freescale Coldfire 5207/5208 processor support. config M523x bool "MCF523x" + select GENERIC_CLOCKEVENTS + select GENERIC_HARDIRQS_NO__DO_IRQ help Freescale Coldfire 5230/1/2/4/5 processor support @@ -130,6 +149,7 @@ config M5275 config M528x bool "MCF528x" + select GENERIC_CLOCKEVENTS help Motorola ColdFire 5280/5282 processor support. @@ -153,11 +173,13 @@ endchoice config M527x bool depends on (M5271 || M5275) + select GENERIC_CLOCKEVENTS default y config COLDFIRE bool depends on (M5206 || M5206e || M520x || M523x || M5249 || M527x || M5272 || M528x || M5307 || M532x || M5407) + select HAVE_FTRACE default y config CLOCK_SET @@ -658,6 +680,13 @@ config ROMKERNEL endchoice +config GENERIC_HARDIRQS_NO__DO_IRQ + bool "Force generic IRQ implementation" + +source "kernel/time/Kconfig" +if COLDFIRE +source "kernel/Kconfig.preempt" +endif source "mm/Kconfig" endmenu Index: linux-2.6.24.7-rt27/arch/m68knommu/Makefile =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/Makefile 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/Makefile 2009-02-08 00:00:48.000000000 -0500 @@ -61,17 +61,17 @@ MODEL := $(model-y) # for the selected cpu. ONLY need to define this for the non-base member # of the family. 
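Selecting GENERIC_CLOCKEVENTS for the ColdFire parts in the Kconfig hunk above obliges each platform timer to register a clock_event_device with the generic time core instead of driving do_timer() by hand. A rough sketch of the shape this takes against the 2.6.24 clockevents API (hypothetical driver, for illustration only; the actual ColdFire timer conversions are not shown in this hunk):

#include <linux/clockchips.h>
#include <linux/interrupt.h>
#include <linux/init.h>

/* Program or stop the hardware timer for the requested mode. */
static void cf_timer_set_mode(enum clock_event_mode mode,
			      struct clock_event_device *evt)
{
	/* hardware-specific: start the periodic tick or shut the timer down */
}

static struct clock_event_device cf_clockevent = {
	.name		= "cf-timer",
	.features	= CLOCK_EVT_FEAT_PERIODIC,
	.rating		= 250,
	.set_mode	= cf_timer_set_mode,
};

/* The timer ISR forwards every tick into the clockevents core. */
static irqreturn_t cf_timer_interrupt(int irq, void *dev_id)
{
	cf_clockevent.event_handler(&cf_clockevent);
	return IRQ_HANDLED;
}

static void __init cf_clockevent_init(void)
{
	cf_clockevent.cpumask = cpumask_of_cpu(0);
	clockevents_register_device(&cf_clockevent);
}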
# -cpuclass-$(CONFIG_M5206) := 5307 -cpuclass-$(CONFIG_M5206e) := 5307 -cpuclass-$(CONFIG_M520x) := 5307 -cpuclass-$(CONFIG_M523x) := 5307 -cpuclass-$(CONFIG_M5249) := 5307 -cpuclass-$(CONFIG_M527x) := 5307 -cpuclass-$(CONFIG_M5272) := 5307 -cpuclass-$(CONFIG_M528x) := 5307 -cpuclass-$(CONFIG_M5307) := 5307 -cpuclass-$(CONFIG_M532x) := 5307 -cpuclass-$(CONFIG_M5407) := 5307 +cpuclass-$(CONFIG_M5206) := coldfire +cpuclass-$(CONFIG_M5206e) := coldfire +cpuclass-$(CONFIG_M520x) := coldfire +cpuclass-$(CONFIG_M523x) := coldfire +cpuclass-$(CONFIG_M5249) := coldfire +cpuclass-$(CONFIG_M527x) := coldfire +cpuclass-$(CONFIG_M5272) := coldfire +cpuclass-$(CONFIG_M528x) := coldfire +cpuclass-$(CONFIG_M5307) := coldfire +cpuclass-$(CONFIG_M532x) := coldfire +cpuclass-$(CONFIG_M5407) := coldfire cpuclass-$(CONFIG_M68328) := 68328 cpuclass-$(CONFIG_M68EZ328) := 68328 cpuclass-$(CONFIG_M68VZ328) := 68328 @@ -90,13 +90,14 @@ export PLATFORM BOARD MODEL CPUCLASS cflags-$(CONFIG_M5206) := -m5200 cflags-$(CONFIG_M5206e) := -m5200 cflags-$(CONFIG_M520x) := -m5307 -cflags-$(CONFIG_M523x) := -m5307 +cflags-$(CONFIG_M523x) := $(call cc-option,-mcpu=523x,-m5307) cflags-$(CONFIG_M5249) := -m5200 -cflags-$(CONFIG_M527x) := -m5307 +cflags-$(CONFIG_M5271) := $(call cc-option,-mcpu=5271,-m5307) cflags-$(CONFIG_M5272) := -m5307 -cflags-$(CONFIG_M528x) := -m5307 +cflags-$(CONFIG_M5275) := $(call cc-option,-mcpu=5275,-m5307) +cflags-$(CONFIG_M528x) := $(call cc-option,-m528x,-m5307) cflags-$(CONFIG_M5307) := -m5307 -cflags-$(CONFIG_M532x) := -m5307 +cflags-$(CONFIG_M532x) := $(call cc-option,-mcpu=532x,-m5307) cflags-$(CONFIG_M5407) := -m5200 cflags-$(CONFIG_M68328) := -m68000 cflags-$(CONFIG_M68EZ328) := -m68000 Index: linux-2.6.24.7-rt27/arch/m68knommu/kernel/asm-offsets.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/kernel/asm-offsets.c 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/kernel/asm-offsets.c 2009-02-08 00:00:48.000000000 -0500 @@ -91,6 +91,7 @@ int main(void) DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_EXECDOMAIN, offsetof(struct thread_info, exec_domain)); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); + DEFINE(TI_PREEMPTCOUNT, offsetof(struct thread_info, preempt_count)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); return 0; Index: linux-2.6.24.7-rt27/arch/m68knommu/kernel/irq.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/kernel/irq.c 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/kernel/irq.c 2009-02-08 00:00:48.000000000 -0500 @@ -23,7 +23,7 @@ asmlinkage void do_IRQ(int irq, struct p struct pt_regs *oldregs = set_irq_regs(regs); irq_enter(); - __do_IRQ(irq); + generic_handle_irq(irq); irq_exit(); set_irq_regs(oldregs); @@ -34,12 +34,16 @@ void ack_bad_irq(unsigned int irq) printk(KERN_ERR "IRQ: unexpected irq=%d\n", irq); } +#ifndef CONFIG_M523x static struct irq_chip m_irq_chip = { .name = "M68K-INTC", .enable = enable_vector, .disable = disable_vector, .ack = ack_vector, }; +#else +void coldfire_init_irq_chip(void); +#endif void __init init_IRQ(void) { @@ -47,12 +51,16 @@ void __init init_IRQ(void) init_vectors(); +#ifndef CONFIG_M523x for (irq = 0; (irq < NR_IRQS); irq++) { irq_desc[irq].status = IRQ_DISABLED; irq_desc[irq].action = NULL; irq_desc[irq].depth = 1; irq_desc[irq].chip = &m_irq_chip; } +#else + coldfire_init_irq_chip(); +#endif } int show_interrupts(struct 
seq_file *p, void *v) @@ -79,4 +87,3 @@ int show_interrupts(struct seq_file *p, return 0; } - Index: linux-2.6.24.7-rt27/arch/m68knommu/kernel/setup.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/kernel/setup.c 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/kernel/setup.c 2009-02-08 00:00:48.000000000 -0500 @@ -165,7 +165,7 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "DragonEngine II board support by Georges Menie\n"); #endif #ifdef CONFIG_M5235EVB - printk(KERN_INFO "Motorola M5235EVB support (C)2005 Syn-tech Systems, Inc. (Jate Sujjavanich)"); + printk(KERN_INFO "Motorola M5235EVB support (C)2005 Syn-tech Systems, Inc. (Jate Sujjavanich)\n"); #endif #ifdef DEBUG Index: linux-2.6.24.7-rt27/arch/m68knommu/kernel/time.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/kernel/time.c 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/kernel/time.c 2009-02-08 00:00:48.000000000 -0500 @@ -22,7 +22,6 @@ #include #include -#include #include #define TICK_SIZE (tick_nsec / 1000) @@ -34,14 +33,13 @@ static inline int set_rtc_mmss(unsigned return -1; } +#ifndef CONFIG_GENERIC_CLOCKEVENTS /* * timer_interrupt() needs to keep up the real-time clock, * as well as call the "do_timer()" routine every clocktick */ irqreturn_t arch_timer_interrupt(int irq, void *dummy) { - /* last time the cmos clock got updated */ - static long last_rtc_update=0; write_seqlock(&xtime_lock); @@ -52,49 +50,12 @@ irqreturn_t arch_timer_interrupt(int irq if (current->pid) profile_tick(CPU_PROFILING); - /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - */ - if (ntp_synced() && - xtime.tv_sec > last_rtc_update + 660 && - (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 && - (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) { - if (set_rtc_mmss(xtime.tv_sec) == 0) - last_rtc_update = xtime.tv_sec; - else - last_rtc_update = xtime.tv_sec - 600; /* do it again in 60 s */ - } -#ifdef CONFIG_HEARTBEAT - /* use power LED as a heartbeat instead -- much more useful - for debugging -- based on the version for PReP by Cort */ - /* acts like an actual heart beat -- ie thump-thump-pause... */ - if (mach_heartbeat) { - static unsigned cnt = 0, period = 0, dist = 0; - - if (cnt == 0 || cnt == dist) - mach_heartbeat( 1 ); - else if (cnt == 7 || cnt == dist+7) - mach_heartbeat( 0 ); - - if (++cnt > period) { - cnt = 0; - /* The hyperbolic function below modifies the heartbeat period - * length in dependency of the current (5min) load. It goes - * through the points f(0)=126, f(1)=86, f(5)=51, - * f(inf)->30. */ - period = ((672<= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; + return read_rtc_mmss(); } -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) +int update_persistent_clock(struct timespec now) { - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - /* - * This is revolting. We need to set the xtime.tv_usec - * correctly. However, the value in this location is - * is value at the last tick. 
- * Discover what correction gettimeofday - * would have done, and then undo it! - */ - nsec -= (hw_timer_offset() * 1000); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + return set_rtc_mmss(now.tv_sec); +} - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; +void time_init(void) +{ + hw_timer_init(); } -EXPORT_SYMBOL(do_settimeofday); Index: linux-2.6.24.7-rt27/arch/m68knommu/kernel/traps.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/kernel/traps.c 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/kernel/traps.c 2009-02-08 00:00:48.000000000 -0500 @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -102,56 +103,79 @@ asmlinkage void buserr_c(struct frame *f force_sig(SIGSEGV, current); } +static void print_this_address(unsigned long addr, int i) +{ +#ifdef CONFIG_KALLSYMS + printk(KERN_EMERG " [%08lx] ", addr); + print_symbol(KERN_CONT "%s\n", addr); +#else + if (i % 5) + printk(KERN_CONT " [%08lx] ", addr); + else + printk(KERN_CONT "\n" KERN_EMERG " [%08lx] ", addr); + i++; +#endif +} int kstack_depth_to_print = 48; -void show_stack(struct task_struct *task, unsigned long *stack) +static void __show_stack(struct task_struct *task, unsigned long *stack) { unsigned long *endstack, addr; - extern char _start, _etext; +#ifdef CONFIG_FRAME_POINTER + unsigned long *last_stack; +#endif int i; - if (!stack) { - if (task) - stack = (unsigned long *)task->thread.ksp; - else - stack = (unsigned long *)&stack; - } + if (!stack) + stack = (unsigned long *)task->thread.ksp; addr = (unsigned long) stack; endstack = (unsigned long *) PAGE_ALIGN(addr); printk(KERN_EMERG "Stack from %08lx:", (unsigned long)stack); for (i = 0; i < kstack_depth_to_print; i++) { - if (stack + 1 > endstack) + if (stack + 1 + i > endstack) break; if (i % 8 == 0) printk("\n" KERN_EMERG " "); - printk(" %08lx", *stack++); + printk(" %08lx", *(stack + i)); } printk("\n"); - - printk(KERN_EMERG "Call Trace:"); i = 0; - while (stack + 1 <= endstack) { + +#ifdef CONFIG_FRAME_POINTER + printk(KERN_EMERG "Call Trace:\n"); + + last_stack = stack - 1; + while (stack <= endstack && stack > last_stack) { + + addr = *(stack + 1); + print_this_address(addr, i); + i++; + + last_stack = stack; + stack = (unsigned long *)*stack; + } + printk("\n"); +#else + printk(KERN_EMERG "Call Trace with CONFIG_FRAME_POINTER disabled:\n"); + while (stack <= endstack) { addr = *stack++; /* - * If the address is either in the text segment of the - * kernel, or in the region which contains vmalloc'ed - * memory, it *may* be the address of a calling - * routine; if so, print it so that someone tracing - * down the cause of the crash will be able to figure - * out the call path that was taken. + * If the address is either in the text segment of the kernel, + * or in a region which is occupied by a module then it *may* + * be the address of a calling routine; if so, print it so that + * someone tracing down the cause of the crash will be able to + * figure out the call path that was taken. 
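The CONFIG_FRAME_POINTER branch of __show_stack above walks the chain of saved frame pointers instead of scanning raw stack words; the stack > last_stack comparison aborts the walk as soon as the chain stops moving strictly upward, which catches corrupt or terminal frames. Reduced to its essentials (sketch only, assuming the frame layout used above: saved caller FP first, return address in the next slot):

static void walk_frames(unsigned long *fp, unsigned long *endstack)
{
	unsigned long *last = fp - 1;

	while (fp <= endstack && fp > last) {
		unsigned long ret = *(fp + 1);	/* return address slot */

		/* report 'ret' here, e.g. via print_this_address() */
		last = fp;
		fp = (unsigned long *)*fp;	/* follow the saved FP */
	}
}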
*/ - if (((addr >= (unsigned long) &_start) && - (addr <= (unsigned long) &_etext))) { - if (i % 4 == 0) - printk("\n" KERN_EMERG " "); - printk(" [<%08lx>]", addr); + if (__kernel_text_address(addr)) { + print_this_address(addr, i); i++; } } - printk("\n"); + printk(KERN_CONT "\n"); +#endif } void bad_super_trap(struct frame *fp) @@ -298,19 +322,47 @@ asmlinkage void set_esp0(unsigned long s current->thread.esp0 = ssp; } - /* * The architecture-independent backtrace generator */ void dump_stack(void) { - unsigned long stack; - - show_stack(current, &stack); + /* + * We need frame pointers for this little trick, which works as follows: + * + * +------------+ 0x00 + * | Next SP | -> 0x0c + * +------------+ 0x04 + * | Caller | + * +------------+ 0x08 + * | Local vars | -> our stack var + * +------------+ 0x0c + * | Next SP | -> 0x18, that is what we pass to show_stack() + * +------------+ 0x10 + * | Caller | + * +------------+ 0x14 + * | Local vars | + * +------------+ 0x18 + * | ... | + * +------------+ + */ + + unsigned long *stack; + + stack = (unsigned long *)&stack; + stack++; + __show_stack(current, stack); } - EXPORT_SYMBOL(dump_stack); +void show_stack(struct task_struct *task, unsigned long *stack) +{ + if (!stack && !task) + dump_stack(); + else + __show_stack(task, stack); +} + #ifdef CONFIG_M68KFPU_EMU asmlinkage void fpemu_signal(int signal, int code, void *addr) { Index: linux-2.6.24.7-rt27/arch/m68knommu/kernel/vmlinux.lds.S =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/kernel/vmlinux.lds.S 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/kernel/vmlinux.lds.S 2009-02-08 00:00:48.000000000 -0500 @@ -7,6 +7,8 @@ * run kernels. */ +#define OUTPUT_DATA_SECTION > DATA + #include #if defined(CONFIG_RAMKERNEL) @@ -34,7 +36,6 @@ #define DATA_ADDR #endif - OUTPUT_ARCH(m68k) ENTRY(_start) @@ -64,81 +65,32 @@ SECTIONS { _stext = . ; TEXT_TEXT SCHED_TEXT + LOCK_TEXT *(.text.lock) - . = ALIGN(16); /* Exception table */ - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - - *(.rodata) *(.rodata.*) - *(__vermagic) /* Kernel version magic */ - *(.rodata1) - *(.rodata.str1.1) - - /* Kernel symbol table: Normal symbols */ - . = ALIGN(4); - __start___ksymtab = .; - *(__ksymtab) - __stop___ksymtab = .; - - /* Kernel symbol table: GPL-only symbols */ - __start___ksymtab_gpl = .; - *(__ksymtab_gpl) - __stop___ksymtab_gpl = .; - - /* Kernel symbol table: Normal unused symbols */ - __start___ksymtab_unused = .; - *(__ksymtab_unused) - __stop___ksymtab_unused = .; - - /* Kernel symbol table: GPL-only unused symbols */ - __start___ksymtab_unused_gpl = .; - *(__ksymtab_unused_gpl) - __stop___ksymtab_unused_gpl = .; - - /* Kernel symbol table: GPL-future symbols */ - __start___ksymtab_gpl_future = .; - *(__ksymtab_gpl_future) - __stop___ksymtab_gpl_future = .; - - /* Kernel symbol table: Normal symbols */ - __start___kcrctab = .; - *(__kcrctab) - __stop___kcrctab = .; - - /* Kernel symbol table: GPL-only symbols */ - __start___kcrctab_gpl = .; - *(__kcrctab_gpl) - __stop___kcrctab_gpl = .; - - /* Kernel symbol table: GPL-future symbols */ - __start___kcrctab_gpl_future = .; - *(__kcrctab_gpl_future) - __stop___kcrctab_gpl_future = .; - - /* Kernel symbol table: strings */ - *(__ksymtab_strings) - - /* Built-in module parameters */ - . = ALIGN(4) ; - __start___param = .; - *(__param) - __stop___param = .; - . = ALIGN(4) ; - _etext = . ; } > TEXT + _etext = . ; + + RODATA + .data DATA_ADDR : { . 
= ALIGN(4); _sdata = . ; DATA_DATA + . = ALIGN(16); /* Exception table */ + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; . = ALIGN(8192) ; *(.data.init_task) _edata = . ; } > DATA + BUG_TABLE + PERCPU(4096) + .init : { . = ALIGN(4096); __init_begin = .; @@ -169,12 +121,6 @@ SECTIONS { __init_end = .; } > INIT - /DISCARD/ : { - *(.exit.text) - *(.exit.data) - *(.exitcall.exit) - } - .bss : { . = ALIGN(4); _sbss = . ; @@ -184,5 +130,11 @@ SECTIONS { _ebss = . ; } > BSS -} + _end = . ; + /DISCARD/ : { + *(.exit.text) + *(.exit.data) + *(.exitcall.exit) + } +} Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/5206/config.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/5206/config.c 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/platform/5206/config.c 2009-02-08 00:00:48.000000000 -0500 @@ -13,12 +13,11 @@ #include #include #include -#include +#include #include #include -#include #include -#include +#include /***************************************************************************/ @@ -26,15 +25,51 @@ void coldfire_reset(void); /***************************************************************************/ -/* - * DMA channel base address table. - */ -unsigned int dma_base_addr[MAX_M68K_DMA_CHANNELS] = { - MCF_MBAR + MCFDMA_BASE0, - MCF_MBAR + MCFDMA_BASE1, +static struct mcf_platform_uart m5206_uart_platform[] = { + { + .mapbase = MCF_MBAR + MCFUART_BASE1, + .irq = 73, + }, + { + .mapbase = MCF_MBAR + MCFUART_BASE2, + .irq = 74, + }, + { }, }; -unsigned int dma_device_address[MAX_M68K_DMA_CHANNELS]; +static struct platform_device m5206_uart = { + .name = "mcfuart", + .id = 0, + .dev.platform_data = m5206_uart_platform, +}; + +static struct platform_device *m5206_devices[] __initdata = { + &m5206_uart, +}; + +/***************************************************************************/ + +static void __init m5206_uart_init_line(int line, int irq) +{ + if (line == 0) { + writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR); + writeb(irq, MCFUART_BASE1 + MCFUART_UIVR); + mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART1); + } else if (line == 1) { + writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR); + writeb(irq, MCFUART_BASE2 + MCFUART_UIVR); + mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART2); + } +} + +static void __init m5206_uarts_init(void) +{ + const int nrlines = ARRAY_SIZE(m5206_uart_platform); + int line; + + for (line = 0; (line < nrlines); line++) + m5206_uart_init_line(line, m5206_uart_platform[line].irq); +} /***************************************************************************/ @@ -74,24 +109,21 @@ void mcf_settimericr(unsigned int timer, /***************************************************************************/ -int mcf_timerirqpending(int timer) +void __init config_BSP(char *commandp, int size) { - unsigned int imr = 0; - - switch (timer) { - case 1: imr = MCFSIM_IMR_TIMER1; break; - case 2: imr = MCFSIM_IMR_TIMER2; break; - default: break; - } - return (mcf_getipr() & imr); + mcf_setimr(MCFSIM_IMR_MASKALL); + mach_reset = coldfire_reset; } /***************************************************************************/ -void config_BSP(char *commandp, int size) +static int __init init_BSP(void) { - mcf_setimr(MCFSIM_IMR_MASKALL); - mach_reset = coldfire_reset; + m5206_uarts_init(); + platform_add_devices(m5206_devices, ARRAY_SIZE(m5206_devices)); + return 0; } +arch_initcall(init_BSP); + 
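This conversion (and each of the BSP conversions that follow) drops the exported DMA base-address tables in favour of registering the on-chip UARTs as a "mcfuart" platform device, passing an array of mapbase/irq records, terminated by an empty entry, through dev.platform_data. The consuming side would look roughly like this (hypothetical probe function; the real mcfuart driver is not part of this patch):

#include <linux/platform_device.h>

static int __devinit mcfuart_probe(struct platform_device *pdev)
{
	struct mcf_platform_uart *p = pdev->dev.platform_data;
	int line;

	/* the empty terminating entry has mapbase == 0 */
	for (line = 0; p[line].mapbase; line++) {
		/* ioremap p[line].mapbase, request p[line].irq,
		   register the port with the serial core */
	}
	return 0;
}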
/***************************************************************************/ Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/5206e/config.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/5206e/config.c 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/platform/5206e/config.c 2009-02-08 00:00:48.000000000 -0500 @@ -10,8 +10,9 @@ #include #include +#include #include -#include +#include #include #include #include @@ -23,15 +24,51 @@ void coldfire_reset(void); /***************************************************************************/ -/* - * DMA channel base address table. - */ -unsigned int dma_base_addr[MAX_M68K_DMA_CHANNELS] = { - MCF_MBAR + MCFDMA_BASE0, - MCF_MBAR + MCFDMA_BASE1, +static struct mcf_platform_uart m5206_uart_platform[] = { + { + .mapbase = MCF_MBAR + MCFUART_BASE1, + .irq = 73, + }, + { + .mapbase = MCF_MBAR + MCFUART_BASE2, + .irq = 74, + }, + { }, }; -unsigned int dma_device_address[MAX_M68K_DMA_CHANNELS]; +static struct platform_device m5206_uart = { + .name = "mcfuart", + .id = 0, + .dev.platform_data = m5206_uart_platform, +}; + +static struct platform_device *m5206_devices[] __initdata = { + &m5206_uart, +}; + +/***************************************************************************/ + +static void __init m5206_uart_init_line(int line, int irq) +{ + if (line == 0) { + writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR); + writeb(irq, MCFUART_BASE1 + MCFUART_UIVR); + mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART1); + } else if (line == 1) { + writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR); + writeb(irq, MCFUART_BASE2 + MCFUART_UIVR); + mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART2); + } +} + +static void __init m5206_uarts_init(void) +{ + const int nrlines = ARRAY_SIZE(m5206_uart_platform); + int line; + + for (line = 0; (line < nrlines); line++) + m5206_uart_init_line(line, m5206_uart_platform[line].irq); +} /***************************************************************************/ @@ -71,21 +108,7 @@ void mcf_settimericr(unsigned int timer, /***************************************************************************/ -int mcf_timerirqpending(int timer) -{ - unsigned int imr = 0; - - switch (timer) { - case 1: imr = MCFSIM_IMR_TIMER1; break; - case 2: imr = MCFSIM_IMR_TIMER2; break; - default: break; - } - return (mcf_getipr() & imr); -} - -/***************************************************************************/ - -void config_BSP(char *commandp, int size) +void __init config_BSP(char *commandp, int size) { mcf_setimr(MCFSIM_IMR_MASKALL); @@ -99,3 +122,14 @@ void config_BSP(char *commandp, int size } /***************************************************************************/ + +static int __init init_BSP(void) +{ + m5206_uarts_init(); + platform_add_devices(m5206_devices, ARRAY_SIZE(m5206_devices)); + return 0; +} + +arch_initcall(init_BSP); + +/***************************************************************************/ Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/520x/config.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/520x/config.c 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/platform/520x/config.c 2009-02-08 00:00:48.000000000 -0500 @@ -5,7 +5,7 @@ * * Copyright (C) 2005, Freescale (www.freescale.com) * Copyright (C) 2005, Intec Automation (mike@steroidmicros.com) - * Copyright (C) 1999-2003, 
Greg Ungerer (gerg@snapgear.com) + * Copyright (C) 1999-2007, Greg Ungerer (gerg@snapgear.com) * Copyright (C) 2001-2003, SnapGear Inc. (www.snapgear.com) */ @@ -13,21 +13,93 @@ #include #include +#include #include +#include #include -#include +#include +#include +#include /***************************************************************************/ -/* - * DMA channel base address table. - */ -unsigned int dma_base_addr[MAX_M68K_DMA_CHANNELS]; -unsigned int dma_device_address[MAX_M68K_DMA_CHANNELS]; +void coldfire_reset(void); /***************************************************************************/ -void coldfire_reset(void); +static struct mcf_platform_uart m520x_uart_platform[] = { + { + .mapbase = MCF_MBAR + MCFUART_BASE1, + .irq = MCFINT_VECBASE + MCFINT_UART0, + }, + { + .mapbase = MCF_MBAR + MCFUART_BASE2, + .irq = MCFINT_VECBASE + MCFINT_UART1, + }, + { + .mapbase = MCF_MBAR + MCFUART_BASE3, + .irq = MCFINT_VECBASE + MCFINT_UART2, + }, + { }, +}; + +static struct platform_device m520x_uart = { + .name = "mcfuart", + .id = 0, + .dev.platform_data = m520x_uart_platform, +}; + +static struct platform_device *m520x_devices[] __initdata = { + &m520x_uart, +}; + +/***************************************************************************/ + +#define INTC0 (MCF_MBAR + MCFICM_INTC0) + +static void __init m520x_uart_init_line(int line, int irq) +{ + u32 imr; + u16 par; + u8 par2; + + writeb(0x03, INTC0 + MCFINTC_ICR0 + MCFINT_UART0 + line); + + imr = readl(INTC0 + MCFINTC_IMRL); + imr &= ~((1 << (irq - MCFINT_VECBASE)) | 1); + writel(imr, INTC0 + MCFINTC_IMRL); + + switch (line) { + case 0: + par = readw(MCF_IPSBAR + MCF_GPIO_PAR_UART); + par |= MCF_GPIO_PAR_UART_PAR_UTXD0 | + MCF_GPIO_PAR_UART_PAR_URXD0; + writew(par, MCF_IPSBAR + MCF_GPIO_PAR_UART); + break; + case 1: + par = readw(MCF_IPSBAR + MCF_GPIO_PAR_UART); + par |= MCF_GPIO_PAR_UART_PAR_UTXD1 | + MCF_GPIO_PAR_UART_PAR_URXD1; + writew(par, MCF_IPSBAR + MCF_GPIO_PAR_UART); + break; + case 2: + par2 = readb(MCF_IPSBAR + MCF_GPIO_PAR_FECI2C); + par2 &= ~0x0F; + par2 |= MCF_GPIO_PAR_FECI2C_PAR_SCL_UTXD2 | + MCF_GPIO_PAR_FECI2C_PAR_SDA_URXD2; + writeb(par2, MCF_IPSBAR + MCF_GPIO_PAR_FECI2C); + break; + } +} + +static void __init m520x_uarts_init(void) +{ + const int nrlines = ARRAY_SIZE(m520x_uart_platform); + int line; + + for (line = 0; (line < nrlines); line++) + m520x_uart_init_line(line, m520x_uart_platform[line].irq); +} /***************************************************************************/ @@ -42,9 +114,20 @@ void mcf_autovector(unsigned int vec) /***************************************************************************/ -void config_BSP(char *commandp, int size) +void __init config_BSP(char *commandp, int size) { mach_reset = coldfire_reset; + m520x_uarts_init(); +} + +/***************************************************************************/ + +static int __init init_BSP(void) +{ + platform_add_devices(m520x_devices, ARRAY_SIZE(m520x_devices)); + return 0; } +arch_initcall(init_BSP); + /***************************************************************************/ Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/523x/config.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/523x/config.c 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/platform/523x/config.c 2009-02-08 00:00:48.000000000 -0500 @@ -16,11 +16,15 @@ #include #include #include -#include +#include #include #include #include -#include +#include + 
+#ifdef CONFIG_MTD +#include +#endif /***************************************************************************/ @@ -28,14 +32,58 @@ void coldfire_reset(void); /***************************************************************************/ -/* - * DMA channel base address table. - */ -unsigned int dma_base_addr[MAX_M68K_DMA_CHANNELS] = { - MCF_MBAR + MCFDMA_BASE0, +static struct mcf_platform_uart m523x_uart_platform[] = { + { + .mapbase = MCF_MBAR + MCFUART_BASE1, + .irq = MCFINT_VECBASE + MCFINT_UART0, + }, + { + .mapbase = MCF_MBAR + MCFUART_BASE2, + .irq = MCFINT_VECBASE + MCFINT_UART0 + 1, + }, + { + .mapbase = MCF_MBAR + MCFUART_BASE3, + .irq = MCFINT_VECBASE + MCFINT_UART0 + 2, + }, + { }, +}; + +static struct platform_device m523x_uart = { + .name = "mcfuart", + .id = 0, + .dev.platform_data = m523x_uart_platform, }; -unsigned int dma_device_address[MAX_M68K_DMA_CHANNELS]; +static struct platform_device *m523x_devices[] __initdata = { + &m523x_uart, +}; + +/***************************************************************************/ + +#define INTC0 (MCF_MBAR + MCFICM_INTC0) + +static void __init m523x_uart_init_line(int line, int irq) +{ + u32 imr; + + if ((line < 0) || (line > 2)) + return; + + writeb(0x30+line, (INTC0 + MCFINTC_ICR0 + MCFINT_UART0 + line)); + + imr = readl(INTC0 + MCFINTC_IMRL); + imr &= ~((1 << (irq - MCFINT_VECBASE)) | 1); + writel(imr, INTC0 + MCFINTC_IMRL); +} + +static void __init m523x_uarts_init(void) +{ + const int nrlines = ARRAY_SIZE(m523x_uart_platform); + int line; + + for (line = 0; (line < nrlines); line++) + m523x_uart_init_line(line, m523x_uart_platform[line].irq); +} /***************************************************************************/ @@ -49,15 +97,85 @@ void mcf_disableall(void) void mcf_autovector(unsigned int vec) { - /* Everything is auto-vectored on the 5272 */ + /* Everything is auto-vectored on the 523x */ } /***************************************************************************/ -void config_BSP(char *commandp, int size) +#if defined(CONFIG_SAVANT) + +/* + * Do special config for SAVANT BSP + */ +static void __init config_savantBSP(char *commandP, int size) +{ + /* setup BOOTPARAM_STRING */ + strncpy(commandP, "root=/dev/mtdblock1 ro rootfstype=romfs", size); + /* Look at Chatter DIP Switch, if CS3 is enabled */ + { + uint32_t *csmr3 = (uint32_t *) (MCF_IPSBAR + MCF523x_CSMR3); + uint32_t *csar3 = (uint32_t *) (MCF_IPSBAR + MCF523x_CSAR3); + uint16_t *dipsP = (uint16_t *) *csar3; + uint16_t dipSetOff = *dipsP & 0x0100; // switch #1 + uint16_t *btnPressP = (uint16_t *)(*csar3 + 0x10); + uint16_t shortButtonPress = *btnPressP & 0x8000; + if (*csmr3 & 1) { + /* CS3 enabled */ + if (!dipSetOff && shortButtonPress) { + /* switch on, so be quiet */ + strncat(commandP, " console=", size-strlen(commandP)-1); + } + } + } + commandP[size-1] = 0; + + /* Set on-chip peripheral space to user mode */ + { + uint8_t *gpacr = (uint8_t *) (MCF_IPSBAR + MCF523x_GPACR); + uint8_t *pacr1 = (uint8_t *) (MCF_IPSBAR + MCF523x_PACR1); + uint8_t *pacr4 = (uint8_t *) (MCF_IPSBAR + MCF523x_PACR4); + uint8_t *pacr7 = (uint8_t *) (MCF_IPSBAR + MCF523x_PACR7); + uint8_t *pacr8 = (uint8_t *) (MCF_IPSBAR + MCF523x_PACR8); + *gpacr = 0x04; + *pacr1 = 0x40; /* EIM required for Chip Select access */ + *pacr4 = 0x40; /* I2C */ + *pacr7 = 0x44; /* INTC0 & 1 handy for debug */ + *pacr8 = 0x40; /* FEC MAC */ + } + +#ifdef CONFIG_MTD + /* all board spins cannot access flash from linux unless we change the map here */ + { + uint32_t *csar0 = (uint32_t *) (MCF_IPSBAR 
+ MCF523x_CSAR0); + uint32_t start = *csar0; + uint32_t size = 0xffffFFFF - start + 1; + physmap_configure(start, size, CONFIG_MTD_PHYSMAP_BANKWIDTH, NULL); + } +#endif +} + +#endif /* CONFIG_SAVANT */ + +/***************************************************************************/ + +void __init config_BSP(char *commandp, int size) { mcf_disableall(); +#if defined(CONFIG_SAVANT) + config_savantBSP(commandp, size); +#endif /* CONFIG_SAVANT */ mach_reset = coldfire_reset; + m523x_uarts_init(); +} + +/***************************************************************************/ + +static int __init init_BSP(void) +{ + platform_add_devices(m523x_devices, ARRAY_SIZE(m523x_devices)); + return 0; } +arch_initcall(init_BSP); + /***************************************************************************/ Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/5249/config.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/5249/config.c 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/platform/5249/config.c 2009-02-08 00:00:48.000000000 -0500 @@ -12,11 +12,11 @@ #include #include #include -#include +#include #include #include #include -#include +#include /***************************************************************************/ @@ -24,17 +24,51 @@ void coldfire_reset(void); /***************************************************************************/ -/* - * DMA channel base address table. - */ -unsigned int dma_base_addr[MAX_M68K_DMA_CHANNELS] = { - MCF_MBAR + MCFDMA_BASE0, - MCF_MBAR + MCFDMA_BASE1, - MCF_MBAR + MCFDMA_BASE2, - MCF_MBAR + MCFDMA_BASE3, +static struct mcf_platform_uart m5249_uart_platform[] = { + { + .mapbase = MCF_MBAR + MCFUART_BASE1, + .irq = 73, + }, + { + .mapbase = MCF_MBAR + MCFUART_BASE2, + .irq = 74, + } +}; + +static struct platform_device m5249_uart = { + .name = "mcfuart", + .id = 0, + .dev.platform_data = m5249_uart_platform, }; -unsigned int dma_device_address[MAX_M68K_DMA_CHANNELS]; +static struct platform_device *m5249_devices[] __initdata = { + &m5249_uart, +}; + +/***************************************************************************/ + +static void __init m5249_uart_init_line(int line, int irq) +{ + if (line == 0) { + writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR); + writeb(irq, MCFUART_BASE1 + MCFUART_UIVR); + mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART1); + } else if (line == 1) { + writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR); + writeb(irq, MCFUART_BASE2 + MCFUART_UIVR); + mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART2); + } +} + +static void __init m5249_uarts_init(void) +{ + const int nrlines = ARRAY_SIZE(m5249_uart_platform); + int line; + + for (line = 0; (line < nrlines); line++) + m5249_uart_init_line(line, m5249_uart_platform[line].irq); +} + /***************************************************************************/ @@ -71,24 +105,21 @@ void mcf_settimericr(unsigned int timer, /***************************************************************************/ -int mcf_timerirqpending(int timer) +void __init config_BSP(char *commandp, int size) { - unsigned int imr = 0; - - switch (timer) { - case 1: imr = MCFSIM_IMR_TIMER1; break; - case 2: imr = MCFSIM_IMR_TIMER2; break; - default: break; - } - return (mcf_getipr() & imr); + mcf_setimr(MCFSIM_IMR_MASKALL); + mach_reset = coldfire_reset; } /***************************************************************************/ -void config_BSP(char *commandp, int size) 
+static int __init init_BSP(void) { - mcf_setimr(MCFSIM_IMR_MASKALL); - mach_reset = coldfire_reset; + m5249_uarts_init(); + platform_add_devices(m5249_devices, ARRAY_SIZE(m5249_devices)); + return 0; } +arch_initcall(init_BSP); + /***************************************************************************/ Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/5272/config.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/5272/config.c 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/platform/5272/config.c 2009-02-08 00:00:48.000000000 -0500 @@ -13,11 +13,11 @@ #include #include #include -#include +#include #include #include #include -#include +#include /***************************************************************************/ @@ -37,14 +37,57 @@ unsigned char ledbank = 0xff; /***************************************************************************/ -/* - * DMA channel base address table. - */ -unsigned int dma_base_addr[MAX_M68K_DMA_CHANNELS] = { - MCF_MBAR + MCFDMA_BASE0, +static struct mcf_platform_uart m5272_uart_platform[] = { + { + .mapbase = MCF_MBAR + MCFUART_BASE1, + .irq = 73, + }, + { + .mapbase = MCF_MBAR + MCFUART_BASE2, + .irq = 74, + }, + { }, }; -unsigned int dma_device_address[MAX_M68K_DMA_CHANNELS]; +static struct platform_device m5272_uart = { + .name = "mcfuart", + .id = 0, + .dev.platform_data = m5272_uart_platform, +}; + +static struct platform_device *m5272_devices[] __initdata = { + &m5272_uart, +}; + +/***************************************************************************/ + +static void __init m5272_uart_init_line(int line, int irq) +{ + u32 v; + + if ((line >= 0) && (line < 2)) { + v = (line) ? 0x0e000000 : 0xe0000000; + writel(v, MCF_MBAR + MCFSIM_ICR2); + + /* Enable the output lines for the serial ports */ + v = readl(MCF_MBAR + MCFSIM_PBCNT); + v = (v & ~0x000000ff) | 0x00000055; + writel(v, MCF_MBAR + MCFSIM_PBCNT); + + v = readl(MCF_MBAR + MCFSIM_PDCNT); + v = (v & ~0x000003fc) | 0x000002a8; + writel(v, MCF_MBAR + MCFSIM_PDCNT); + } +} + +static void __init m5272_uarts_init(void) +{ + const int nrlines = ARRAY_SIZE(m5272_uart_platform); + int line; + + for (line = 0; (line < nrlines); line++) + m5272_uart_init_line(line, m5272_uart_platform[line].irq); +} /***************************************************************************/ @@ -80,20 +123,7 @@ void mcf_settimericr(int timer, int leve /***************************************************************************/ -int mcf_timerirqpending(int timer) -{ - volatile unsigned long *icrp; - - if ((timer >= 1 ) && (timer <= 4)) { - icrp = (volatile unsigned long *) (MCF_MBAR + MCFSIM_ICR1); - return (*icrp & (0x8 << ((4 - timer) * 4))); - } - return 0; -} - -/***************************************************************************/ - -void config_BSP(char *commandp, int size) +void __init config_BSP(char *commandp, int size) { #if defined (CONFIG_MOD5272) volatile unsigned char *pivrp; @@ -109,10 +139,6 @@ void config_BSP(char *commandp, int size /* Copy command line from FLASH to local buffer... */ memcpy(commandp, (char *) 0xf0004000, size); commandp[size-1] = 0; -#elif defined(CONFIG_MTD_KeyTechnology) - /* Copy command line from FLASH to local buffer... */ - memcpy(commandp, (char *) 0xffe06000, size); - commandp[size-1] = 0; #elif defined(CONFIG_CANCam) /* Copy command line from FLASH to local buffer... 
*/ memcpy(commandp, (char *) 0xf0010000, size); @@ -125,3 +151,14 @@ void config_BSP(char *commandp, int size } /***************************************************************************/ + +static int __init init_BSP(void) +{ + m5272_uarts_init(); + platform_add_devices(m5272_devices, ARRAY_SIZE(m5272_devices)); + return 0; +} + +arch_initcall(init_BSP); + +/***************************************************************************/ Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/527x/config.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/527x/config.c 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/platform/527x/config.c 2009-02-08 00:00:48.000000000 -0500 @@ -16,11 +16,11 @@ #include #include #include -#include +#include #include #include #include -#include +#include /***************************************************************************/ @@ -28,14 +28,72 @@ void coldfire_reset(void); /***************************************************************************/ -/* - * DMA channel base address table. - */ -unsigned int dma_base_addr[MAX_M68K_DMA_CHANNELS] = { - MCF_MBAR + MCFDMA_BASE0, +static struct mcf_platform_uart m527x_uart_platform[] = { + { + .mapbase = MCF_MBAR + MCFUART_BASE1, + .irq = MCFINT_VECBASE + MCFINT_UART0, + }, + { + .mapbase = MCF_MBAR + MCFUART_BASE2, + .irq = MCFINT_VECBASE + MCFINT_UART1, + }, + { + .mapbase = MCF_MBAR + MCFUART_BASE3, + .irq = MCFINT_VECBASE + MCFINT_UART2, + }, + { }, }; -unsigned int dma_device_address[MAX_M68K_DMA_CHANNELS]; +static struct platform_device m527x_uart = { + .name = "mcfuart", + .id = 0, + .dev.platform_data = m527x_uart_platform, +}; + +static struct platform_device *m527x_devices[] __initdata = { + &m527x_uart, +}; + +/***************************************************************************/ + +#define INTC0 (MCF_MBAR + MCFICM_INTC0) + +static void __init m527x_uart_init_line(int line, int irq) +{ + u16 sepmask; + u32 imr; + + if ((line < 0) || (line > 2)) + return; + + /* level 6, line based priority */ + writeb(0x30+line, INTC0 + MCFINTC_ICR0 + MCFINT_UART0 + line); + + imr = readl(INTC0 + MCFINTC_IMRL); + imr &= ~((1 << (irq - MCFINT_VECBASE)) | 1); + writel(imr, INTC0 + MCFINTC_IMRL); + + /* + * External Pin Mask Setting & Enable External Pin for Interface + */ + sepmask = readw(MCF_IPSBAR + MCF_GPIO_PAR_UART); + if (line == 0) + sepmask |= UART0_ENABLE_MASK; + else if (line == 1) + sepmask |= UART1_ENABLE_MASK; + else if (line == 2) + sepmask |= UART2_ENABLE_MASK; + writew(sepmask, MCF_IPSBAR + MCF_GPIO_PAR_UART); +} + +static void __init m527x_uarts_init(void) +{ + const int nrlines = ARRAY_SIZE(m527x_uart_platform); + int line; + + for (line = 0; (line < nrlines); line++) + m527x_uart_init_line(line, m527x_uart_platform[line].irq); +} /***************************************************************************/ @@ -54,10 +112,21 @@ void mcf_autovector(unsigned int vec) /***************************************************************************/ -void config_BSP(char *commandp, int size) +void __init config_BSP(char *commandp, int size) { mcf_disableall(); mach_reset = coldfire_reset; } /***************************************************************************/ + +static int __init init_BSP(void) +{ + m527x_uarts_init(); + platform_add_devices(m527x_devices, ARRAY_SIZE(m527x_devices)); + return 0; +} + +arch_initcall(init_BSP); + 
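The m520x, m523x and m527x line-init helpers above (and m528x below) all repeat one unmask idiom against interrupt controller 0: read IMRL, clear the bit for the source (vector number minus MCFINT_VECBASE) together with bit 0, which is the controller's global mask-all bit, then write the register back. Factored out as a sketch (names hypothetical):

#include <linux/io.h>

static void cf_intc0_unmask(void __iomem *imrl, int irq, int vecbase)
{
	u32 imr = readl(imrl);

	/* clear the source bit and bit 0, the global mask-all bit */
	imr &= ~((1 << (irq - vecbase)) | 1);
	writel(imr, imrl);
}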
+/***************************************************************************/ Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/528x/config.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/528x/config.c 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/platform/528x/config.c 2009-02-08 00:00:48.000000000 -0500 @@ -16,26 +16,314 @@ #include #include #include -#include +#include +#include +#include +#include #include #include #include -#include +#include +#include + +#ifdef CONFIG_MTD_PARTITIONS +#include +#endif /***************************************************************************/ void coldfire_reset(void); +void coldfire_qspi_cs_control(u8 cs, u8 command); + +/***************************************************************************/ + +#if defined(CONFIG_SPI) + +#if defined(CONFIG_WILDFIRE) +#define SPI_NUM_CHIPSELECTS 0x02 +#define SPI_PAR_VAL 0x07 // Enable DIN, DOUT, CLK +#define SPI_CS_MASK 0x18 + +#define FLASH_BLOCKSIZE (1024*64) +#define FLASH_NUMBLOCKS 16 +#define FLASH_TYPE "m25p80" + +#define M25P80_CS 0 +#define MMC_CS 1 + +#ifdef CONFIG_MTD_PARTITIONS +static struct mtd_partition stm25p_partitions[] = { + /* sflash */ + [0] = { + .name = "stm25p80", + .offset = 0x00000000, + .size = FLASH_BLOCKSIZE * FLASH_NUMBLOCKS, + .mask_flags = 0 + } +}; + +#endif + +#elif defined(CONFIG_WILDFIREMOD) + +#define SPI_NUM_CHIPSELECTS 0x08 +#define SPI_PAR_VAL 0x07 // Enable DIN, DOUT, CLK +#define SPI_CS_MASK 0x78 + +#define FLASH_BLOCKSIZE (1024*64) +#define FLASH_NUMBLOCKS 64 +#define FLASH_TYPE "m25p32" +/* Reserve 1M for the kernel partition */ +#define FLASH_KERNEL_SIZE (1024 * 1024) + +#define M25P80_CS 5 +#define MMC_CS 6 + +#ifdef CONFIG_MTD_PARTITIONS +static struct mtd_partition stm25p_partitions[] = { + /* sflash */ + [0] = { + .name = "kernel", + .offset = FLASH_BLOCKSIZE * FLASH_NUMBLOCKS - FLASH_KERNEL_SIZE, + .size = FLASH_KERNEL_SIZE, + .mask_flags = 0 + }, + [1] = { + .name = "image", + .offset = 0x00000000, + .size = FLASH_BLOCKSIZE * FLASH_NUMBLOCKS - FLASH_KERNEL_SIZE, + .mask_flags = 0 + }, + [2] = { + .name = "all", + .offset = 0x00000000, + .size = FLASH_BLOCKSIZE * FLASH_NUMBLOCKS, + .mask_flags = 0 + } +}; +#endif + +#else +#define SPI_NUM_CHIPSELECTS 0x04 +#define SPI_PAR_VAL 0x7F // Enable DIN, DOUT, CLK, CS0 - CS4 +#endif + +#ifdef MMC_CS +static struct coldfire_spi_chip flash_chip_info = { + .mode = SPI_MODE_0, + .bits_per_word = 16, + .del_cs_to_clk = 17, + .del_after_trans = 1, + .void_write_data = 0 +}; + +static struct coldfire_spi_chip mmc_chip_info = { + .mode = SPI_MODE_0, + .bits_per_word = 16, + .del_cs_to_clk = 17, + .del_after_trans = 1, + .void_write_data = 0xFFFF +}; +#endif + +#ifdef M25P80_CS +static struct flash_platform_data stm25p80_platform_data = { + .name = "ST M25P80 SPI Flash chip", +#ifdef CONFIG_MTD_PARTITIONS + .parts = stm25p_partitions, + .nr_parts = sizeof(stm25p_partitions) / sizeof(*stm25p_partitions), +#endif + .type = FLASH_TYPE +}; +#endif + +static struct spi_board_info spi_board_info[] __initdata = { +#ifdef M25P80_CS + { + .modalias = "m25p80", + .max_speed_hz = 16000000, + .bus_num = 1, + .chip_select = M25P80_CS, + .platform_data = &stm25p80_platform_data, + .controller_data = &flash_chip_info + }, +#endif +#ifdef MMC_CS + { + .modalias = "mmc_spi", + .max_speed_hz = 16000000, + .bus_num = 1, + .chip_select = MMC_CS, + .controller_data = &mmc_chip_info + } +#endif +}; + +static struct coldfire_spi_master
coldfire_master_info = { + .bus_num = 1, + .num_chipselect = SPI_NUM_CHIPSELECTS, + .irq_source = MCF5282_QSPI_IRQ_SOURCE, + .irq_vector = MCF5282_QSPI_IRQ_VECTOR, + .irq_mask = ((0x01 << MCF5282_QSPI_IRQ_SOURCE) | 0x01), + .irq_lp = 0x2B, // Level 5 and Priority 3 + .par_val = SPI_PAR_VAL, + .cs_control = coldfire_qspi_cs_control, +}; + +static struct resource coldfire_spi_resources[] = { + [0] = { + .name = "qspi-par", + .start = MCF5282_QSPI_PAR, + .end = MCF5282_QSPI_PAR, + .flags = IORESOURCE_MEM + }, + + [1] = { + .name = "qspi-module", + .start = MCF5282_QSPI_QMR, + .end = MCF5282_QSPI_QMR + 0x18, + .flags = IORESOURCE_MEM + }, + + [2] = { + .name = "qspi-int-level", + .start = MCF5282_INTC0 + MCFINTC_ICR0 + MCF5282_QSPI_IRQ_SOURCE, + .end = MCF5282_INTC0 + MCFINTC_ICR0 + MCF5282_QSPI_IRQ_SOURCE, + .flags = IORESOURCE_MEM + }, + + [3] = { + .name = "qspi-int-mask", + .start = MCF5282_INTC0 + MCFINTC_IMRL, + .end = MCF5282_INTC0 + MCFINTC_IMRL, + .flags = IORESOURCE_MEM + } +}; + +static struct platform_device coldfire_spi = { + .name = "spi_coldfire", + .id = -1, + .resource = coldfire_spi_resources, + .num_resources = ARRAY_SIZE(coldfire_spi_resources), + .dev = { + .platform_data = &coldfire_master_info, + } +}; + +void coldfire_qspi_cs_control(u8 cs, u8 command) +{ + u8 cs_bit = ((0x01 << cs) << 3) & SPI_CS_MASK; + +#if defined(CONFIG_WILDFIRE) + u8 cs_mask = ~(((0x01 << cs) << 3) & SPI_CS_MASK); +#endif +#if defined(CONFIG_WILDFIREMOD) + u8 cs_mask = (cs << 3) & SPI_CS_MASK; +#endif + + /* + * Don't do anything if the chip select is not + * one of the port qs pins. + */ + if (command & QSPI_CS_INIT) { +#if defined(CONFIG_WILDFIRE) + MCF5282_GPIO_DDRQS |= cs_bit; + MCF5282_GPIO_PQSPAR &= ~cs_bit; +#endif + +#if defined(CONFIG_WILDFIREMOD) + MCF5282_GPIO_DDRQS |= SPI_CS_MASK; + MCF5282_GPIO_PQSPAR &= ~SPI_CS_MASK; +#endif + } + + if (command & QSPI_CS_ASSERT) { + MCF5282_GPIO_PORTQS &= ~SPI_CS_MASK; + MCF5282_GPIO_PORTQS |= cs_mask; + } else if (command & QSPI_CS_DROP) { + MCF5282_GPIO_PORTQS |= SPI_CS_MASK; + } +} + +static int __init spi_dev_init(void) +{ + int retval; + + retval = platform_device_register(&coldfire_spi); + if (retval < 0) + return retval; + + if (ARRAY_SIZE(spi_board_info)) + retval = spi_register_board_info(spi_board_info, ARRAY_SIZE(spi_board_info)); + + return retval; +} + +#endif /* CONFIG_SPI */ /***************************************************************************/ -/* - * DMA channel base address table. 
- */ -unsigned int dma_base_addr[MAX_M68K_DMA_CHANNELS] = { - MCF_MBAR + MCFDMA_BASE0, +static struct mcf_platform_uart m528x_uart_platform[] = { + { + .mapbase = MCF_MBAR + MCFUART_BASE1, + .irq = MCFINT_VECBASE + MCFINT_UART0, + }, + { + .mapbase = MCF_MBAR + MCFUART_BASE2, + .irq = MCFINT_VECBASE + MCFINT_UART0 + 1, + }, + { + .mapbase = MCF_MBAR + MCFUART_BASE3, + .irq = MCFINT_VECBASE + MCFINT_UART0 + 2, + }, + { }, +}; + +static struct platform_device m528x_uart = { + .name = "mcfuart", + .id = 0, + .dev.platform_data = m528x_uart_platform, +}; + +static struct platform_device *m528x_devices[] __initdata = { + &m528x_uart, }; -unsigned int dma_device_address[MAX_M68K_DMA_CHANNELS]; +/***************************************************************************/ + +#define INTC0 (MCF_MBAR + MCFICM_INTC0) + +static void __init m528x_uart_init_line(int line, int irq) +{ + u8 port; + u32 imr; + + if ((line < 0) || (line > 2)) + return; + + /* level 6, line based priority */ + writeb(0x30+line, INTC0 + MCFINTC_ICR0 + MCFINT_UART0 + line); + + imr = readl(INTC0 + MCFINTC_IMRL); + imr &= ~((1 << (irq - MCFINT_VECBASE)) | 1); + writel(imr, INTC0 + MCFINTC_IMRL); + + /* make sure PUAPAR is set for UART0 and UART1 */ + if (line < 2) { + port = readb(MCF_MBAR + MCF5282_GPIO_PUAPAR); + port |= (0x03 << (line * 2)); + writeb(port, MCF_MBAR + MCF5282_GPIO_PUAPAR); + } +} + +static void __init m528x_uarts_init(void) +{ + const int nrlines = ARRAY_SIZE(m528x_uart_platform); + int line; + + for (line = 0; (line < nrlines); line++) + m528x_uart_init_line(line, m528x_uart_platform[line].irq); +} /***************************************************************************/ @@ -54,10 +342,57 @@ void mcf_autovector(unsigned int vec) /***************************************************************************/ -void config_BSP(char *commandp, int size) +#ifdef CONFIG_WILDFIRE +void wildfire_halt (void) +{ + writeb(0, 0x30000007); + writeb(0x2, 0x30000007); +} +#endif + +#ifdef CONFIG_WILDFIREMOD +void wildfiremod_halt (void) +{ + printk("WildFireMod hibernating...\n"); + + /* Set portE.5 to Digital IO */ + MCF5282_GPIO_PEPAR &= ~(1 << (5 * 2)); + + /* Make portE.5 an output */ + MCF5282_GPIO_DDRE |= (1 << 5); + + /* Now toggle portE.5 from low to high */ + MCF5282_GPIO_PORTE &= ~(1 << 5); + MCF5282_GPIO_PORTE |= (1 << 5); + + printk("Failed to hibernate. 
Halting!\n"); +} +#endif + +void __init config_BSP(char *commandp, int size) { mcf_disableall(); - mach_reset = coldfire_reset; + +#ifdef CONFIG_WILDFIRE + mach_halt = wildfire_halt; +#endif +#ifdef CONFIG_WILDFIREMOD + mach_halt = wildfiremod_halt; +#endif +} + +/***************************************************************************/ + +static int __init init_BSP(void) +{ + m528x_uarts_init(); +#ifdef CONFIG_SPI + spi_dev_init(); +#endif + platform_add_devices(m528x_devices, ARRAY_SIZE(m528x_devices)); + return 0; } +arch_initcall(init_BSP); + /***************************************************************************/ Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/5307/Makefile =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/5307/Makefile 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/platform/5307/Makefile 2009-02-08 00:00:48.000000000 -0500 @@ -16,17 +16,5 @@ ifdef CONFIG_FULLDEBUG EXTRA_AFLAGS += -DDEBUGGER_COMPATIBLE_CACHE=1 endif -obj-$(CONFIG_COLDFIRE) += entry.o vectors.o -obj-$(CONFIG_M5206) += timers.o -obj-$(CONFIG_M5206e) += timers.o -obj-$(CONFIG_M520x) += pit.o -obj-$(CONFIG_M523x) += pit.o -obj-$(CONFIG_M5249) += timers.o -obj-$(CONFIG_M527x) += pit.o -obj-$(CONFIG_M5272) += timers.o -obj-$(CONFIG_M5307) += config.o timers.o -obj-$(CONFIG_M532x) += timers.o -obj-$(CONFIG_M528x) += pit.o -obj-$(CONFIG_M5407) += timers.o +obj-y += config.o -extra-y := head.o Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/5307/config.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/5307/config.c 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/platform/5307/config.c 2009-02-08 00:00:48.000000000 -0500 @@ -13,11 +13,11 @@ #include #include #include -#include +#include #include #include #include -#include +#include #include /***************************************************************************/ @@ -38,17 +38,51 @@ unsigned char ledbank = 0xff; /***************************************************************************/ -/* - * DMA channel base address table. 
- */ -unsigned int dma_base_addr[MAX_M68K_DMA_CHANNELS] = { - MCF_MBAR + MCFDMA_BASE0, - MCF_MBAR + MCFDMA_BASE1, - MCF_MBAR + MCFDMA_BASE2, - MCF_MBAR + MCFDMA_BASE3, +static struct mcf_platform_uart m5307_uart_platform[] = { + { + .mapbase = MCF_MBAR + MCFUART_BASE1, + .irq = 73, + }, + { + .mapbase = MCF_MBAR + MCFUART_BASE2, + .irq = 74, + }, + { }, +}; + +static struct platform_device m5307_uart = { + .name = "mcfuart", + .id = 0, + .dev.platform_data = m5307_uart_platform, }; -unsigned int dma_device_address[MAX_M68K_DMA_CHANNELS]; +static struct platform_device *m5307_devices[] __initdata = { + &m5307_uart, +}; + +/***************************************************************************/ + +static void __init m5307_uart_init_line(int line, int irq) +{ + if (line == 0) { + writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR); + writeb(irq, MCFUART_BASE1 + MCFUART_UIVR); + mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART1); + } else if (line == 1) { + writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR); + writeb(irq, MCFUART_BASE2 + MCFUART_UIVR); + mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART2); + } +} + +static void __init m5307_uarts_init(void) +{ + const int nrlines = ARRAY_SIZE(m5307_uart_platform); + int line; + + for (line = 0; (line < nrlines); line++) + m5307_uart_init_line(line, m5307_uart_platform[line].irq); +} /***************************************************************************/ @@ -85,27 +119,12 @@ void mcf_settimericr(unsigned int timer, /***************************************************************************/ -int mcf_timerirqpending(int timer) -{ - unsigned int imr = 0; - - switch (timer) { - case 1: imr = MCFSIM_IMR_TIMER1; break; - case 2: imr = MCFSIM_IMR_TIMER2; break; - default: break; - } - return (mcf_getipr() & imr); -} - -/***************************************************************************/ - -void config_BSP(char *commandp, int size) +void __init config_BSP(char *commandp, int size) { mcf_setimr(MCFSIM_IMR_MASKALL); #if defined(CONFIG_NETtel) || defined(CONFIG_eLIA) || \ - defined(CONFIG_DISKtel) || defined(CONFIG_SECUREEDGEMP3) || \ - defined(CONFIG_CLEOPATRA) + defined(CONFIG_SECUREEDGEMP3) || defined(CONFIG_CLEOPATRA) /* Copy command line from FLASH to local buffer... */ memcpy(commandp, (char *) 0xf0004000, size); commandp[size-1] = 0; @@ -117,7 +136,7 @@ void config_BSP(char *commandp, int size mach_reset = coldfire_reset; -#ifdef MCF_BDM_DISABLE +#ifdef CONFIG_BDM_DISABLE /* * Disable the BDM clocking. This also turns off most of the rest of * the BDM device. This is good for EMC reasons. This option is not @@ -128,3 +147,14 @@ void config_BSP(char *commandp, int size } /***************************************************************************/ + +static int __init init_BSP(void) +{ + m5307_uarts_init(); + platform_add_devices(m5307_devices, ARRAY_SIZE(m5307_devices)); + return 0; +} + +arch_initcall(init_BSP); + +/***************************************************************************/ Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/5307/entry.S =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/5307/entry.S 2009-02-08 00:00:33.000000000 -0500 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,235 +0,0 @@ -/* - * linux/arch/m68knommu/platform/5307/entry.S - * - * Copyright (C) 1999-2007, Greg Ungerer (gerg@snapgear.com) - * Copyright (C) 1998 D. 
Jeff Dionne , - * Kenneth Albanowski , - * Copyright (C) 2000 Lineo Inc. (www.lineo.com) - * Copyright (C) 2004-2006 Macq Electronique SA. (www.macqel.com) - * - * Based on: - * - * linux/arch/m68k/kernel/entry.S - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file README.legal in the main directory of this archive - * for more details. - * - * Linux/m68k support by Hamish Macdonald - * - * 68060 fixes by Jesper Skov - * ColdFire support by Greg Ungerer (gerg@snapgear.com) - * 5307 fixes by David W. Miller - * linux 2.4 support David McCullough - * Bug, speed and maintainability fixes by Philippe De Muyter - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -.bss - -sw_ksp: -.long 0 - -sw_usp: -.long 0 - -.text - -.globl system_call -.globl resume -.globl ret_from_exception -.globl ret_from_signal -.globl sys_call_table -.globl ret_from_interrupt -.globl inthandler -.globl fasthandler - -enosys: - mov.l #sys_ni_syscall,%d3 - bra 1f - -ENTRY(system_call) - SAVE_ALL - move #0x2000,%sr /* enable intrs again */ - - cmpl #NR_syscalls,%d0 - jcc enosys - lea sys_call_table,%a0 - lsll #2,%d0 /* movel %a0@(%d0:l:4),%d3 */ - movel %a0@(%d0),%d3 - jeq enosys - -1: - movel %sp,%d2 /* get thread_info pointer */ - andl #-THREAD_SIZE,%d2 /* at start of kernel stack */ - movel %d2,%a0 - movel %a0@,%a1 /* save top of frame */ - movel %sp,%a1@(TASK_THREAD+THREAD_ESP0) - btst #(TIF_SYSCALL_TRACE%8),%a0@(TI_FLAGS+(31-TIF_SYSCALL_TRACE)/8) - bnes 1f - - movel %d3,%a0 - jbsr %a0@ - movel %d0,%sp@(PT_D0) /* save the return value */ - jra ret_from_exception -1: - movel #-ENOSYS,%d2 /* strace needs -ENOSYS in PT_D0 */ - movel %d2,PT_D0(%sp) /* on syscall entry */ - subql #4,%sp - SAVE_SWITCH_STACK - jbsr syscall_trace - RESTORE_SWITCH_STACK - addql #4,%sp - movel %d3,%a0 - jbsr %a0@ - movel %d0,%sp@(PT_D0) /* save the return value */ - subql #4,%sp /* dummy return address */ - SAVE_SWITCH_STACK - jbsr syscall_trace - -ret_from_signal: - RESTORE_SWITCH_STACK - addql #4,%sp - -ret_from_exception: - btst #5,%sp@(PT_SR) /* check if returning to kernel */ - jeq Luser_return /* if so, skip resched, signals */ - -Lkernel_return: - moveml %sp@,%d1-%d5/%a0-%a2 - lea %sp@(32),%sp /* space for 8 regs */ - movel %sp@+,%d0 - addql #4,%sp /* orig d0 */ - addl %sp@+,%sp /* stk adj */ - rte - -Luser_return: - movel %sp,%d1 /* get thread_info pointer */ - andl #-THREAD_SIZE,%d1 /* at base of kernel stack */ - movel %d1,%a0 - movel %a0@(TI_FLAGS),%d1 /* get thread_info->flags */ - andl #_TIF_WORK_MASK,%d1 - jne Lwork_to_do /* still work to do */ - -Lreturn: - move #0x2700,%sr /* disable intrs */ - movel sw_usp,%a0 /* get usp */ - movel %sp@(PT_PC),%a0@- /* copy exception program counter */ - movel %sp@(PT_FORMATVEC),%a0@-/* copy exception format/vector/sr */ - moveml %sp@,%d1-%d5/%a0-%a2 - lea %sp@(32),%sp /* space for 8 regs */ - movel %sp@+,%d0 - addql #4,%sp /* orig d0 */ - addl %sp@+,%sp /* stk adj */ - addql #8,%sp /* remove exception */ - movel %sp,sw_ksp /* save ksp */ - subql #8,sw_usp /* set exception */ - movel sw_usp,%sp /* restore usp */ - rte - -Lwork_to_do: - movel %a0@(TI_FLAGS),%d1 /* get thread_info->flags */ - btst #TIF_NEED_RESCHED,%d1 - jne reschedule - - /* GERG: do we need something here for TRACEing?? 
*/ - -Lsignal_return: - subql #4,%sp /* dummy return address */ - SAVE_SWITCH_STACK - pea %sp@(SWITCH_STACK_SIZE) - clrl %sp@- - jsr do_signal - addql #8,%sp - RESTORE_SWITCH_STACK - addql #4,%sp - jmp Lreturn - -/* - * This is the generic interrupt handler (for all hardware interrupt - * sources). Calls upto high level code to do all the work. - */ -ENTRY(inthandler) - SAVE_ALL - moveq #-1,%d0 - movel %d0,%sp@(PT_ORIG_D0) - - movew %sp@(PT_FORMATVEC),%d0 /* put exception # in d0 */ - andl #0x03fc,%d0 /* mask out vector only */ - - movel %sp,%sp@- /* push regs arg */ - lsrl #2,%d0 /* calculate real vector # */ - movel %d0,%sp@- /* push vector number */ - jbsr do_IRQ /* call high level irq handler */ - lea %sp@(8),%sp /* pop args off stack */ - - bra ret_from_interrupt /* this was fallthrough */ - -/* - * This is the fast interrupt handler (for certain hardware interrupt - * sources). Unlike the normal interrupt handler it just uses the - * current stack (doesn't care if it is user or kernel). It also - * doesn't bother doing the bottom half handlers. - */ -ENTRY(fasthandler) - SAVE_LOCAL - - movew %sp@(PT_FORMATVEC),%d0 - andl #0x03fc,%d0 /* mask out vector only */ - - movel %sp,%sp@- /* push regs arg */ - lsrl #2,%d0 /* calculate real vector # */ - movel %d0,%sp@- /* push vector number */ - jbsr do_IRQ /* call high level irq handler */ - lea %sp@(8),%sp /* pop args off stack */ - - RESTORE_LOCAL - -ENTRY(ret_from_interrupt) - jeq 2f -1: - RESTORE_ALL -2: - moveb %sp@(PT_SR),%d0 - andl #0x7,%d0 - jhi 1b - - /* check if we need to do software interrupts */ - movel irq_stat+CPUSTAT_SOFTIRQ_PENDING,%d0 - jeq ret_from_exception - - pea ret_from_exception - jmp do_softirq - -/* - * Beware - when entering resume, prev (the current task) is - * in a0, next (the new task) is in a1,so don't change these - * registers until their contents are no longer needed. - * This is always called in supervisor mode, so don't bother to save - * and restore sr; user's process sr is actually in the stack. - */ -ENTRY(resume) - movel %a0, %d1 /* get prev thread in d1 */ - - movel sw_usp,%d0 /* save usp */ - movel %d0,%a0@(TASK_THREAD+THREAD_USP) - - SAVE_SWITCH_STACK - movel %sp,%a0@(TASK_THREAD+THREAD_KSP) /* save kernel stack pointer */ - movel %a1@(TASK_THREAD+THREAD_KSP),%sp /* restore new thread stack */ - RESTORE_SWITCH_STACK - - movel %a1@(TASK_THREAD+THREAD_USP),%a0 /* restore thread user stack */ - movel %a0, sw_usp - rts Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/5307/head.S =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/5307/head.S 2009-02-08 00:00:33.000000000 -0500 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,222 +0,0 @@ -/*****************************************************************************/ - -/* - * head.S -- common startup code for ColdFire CPUs. - * - * (C) Copyright 1999-2006, Greg Ungerer . - */ - -/*****************************************************************************/ - -#include -#include -#include -#include -#include -#include - -/*****************************************************************************/ - -/* - * If we don't have a fixed memory size, then lets build in code - * to auto detect the DRAM size. Obviously this is the prefered - * method, and should work for most boards. It won't work for those - * that do not have their RAM starting at address 0, and it only - * works on SDRAM (not boards fitted with SRAM). 
- */ -#if CONFIG_RAMSIZE != 0 -.macro GET_MEM_SIZE - movel #CONFIG_RAMSIZE,%d0 /* hard coded memory size */ -.endm - -#elif defined(CONFIG_M5206) || defined(CONFIG_M5206e) || \ - defined(CONFIG_M5249) || defined(CONFIG_M527x) || \ - defined(CONFIG_M528x) || defined(CONFIG_M5307) || \ - defined(CONFIG_M5407) -/* - * Not all these devices have exactly the same DRAM controller, - * but the DCMR register is virtually identical - give or take - * a couple of bits. The only exception is the 5272 devices, their - * DRAM controller is quite different. - */ -.macro GET_MEM_SIZE - movel MCF_MBAR+MCFSIM_DMR0,%d0 /* get mask for 1st bank */ - btst #0,%d0 /* check if region enabled */ - beq 1f - andl #0xfffc0000,%d0 - beq 1f - addl #0x00040000,%d0 /* convert mask to size */ -1: - movel MCF_MBAR+MCFSIM_DMR1,%d1 /* get mask for 2nd bank */ - btst #0,%d1 /* check if region enabled */ - beq 2f - andl #0xfffc0000, %d1 - beq 2f - addl #0x00040000,%d1 - addl %d1,%d0 /* total mem size in d0 */ -2: -.endm - -#elif defined(CONFIG_M5272) -.macro GET_MEM_SIZE - movel MCF_MBAR+MCFSIM_CSOR7,%d0 /* get SDRAM address mask */ - andil #0xfffff000,%d0 /* mask out chip select options */ - negl %d0 /* negate bits */ -.endm - -#elif defined(CONFIG_M520x) -.macro GET_MEM_SIZE - clrl %d0 - movel MCF_MBAR+MCFSIM_SDCS0, %d2 /* Get SDRAM chip select 0 config */ - andl #0x1f, %d2 /* Get only the chip select size */ - beq 3f /* Check if it is enabled */ - addql #1, %d2 /* Form exponent */ - moveql #1, %d0 - lsll %d2, %d0 /* 2 ^ exponent */ -3: - movel MCF_MBAR+MCFSIM_SDCS1, %d2 /* Get SDRAM chip select 1 config */ - andl #0x1f, %d2 /* Get only the chip select size */ - beq 4f /* Check if it is enabled */ - addql #1, %d2 /* Form exponent */ - moveql #1, %d1 - lsll %d2, %d1 /* 2 ^ exponent */ - addl %d1, %d0 /* Total size of SDRAM in d0 */ -4: -.endm - -#else -#error "ERROR: I don't know how to probe your boards memory size?" -#endif - -/*****************************************************************************/ - -/* - * Boards and platforms can do specific early hardware setup if - * they need to. Most don't need this, define away if not required. - */ -#ifndef PLATFORM_SETUP -#define PLATFORM_SETUP -#endif - -/*****************************************************************************/ - -.global _start -.global _rambase -.global _ramvec -.global _ramstart -.global _ramend - -/*****************************************************************************/ - -.data - -/* - * During startup we store away the RAM setup. These are not in the - * bss, since their values are determined and written before the bss - * has been cleared. - */ -_rambase: -.long 0 -_ramvec: -.long 0 -_ramstart: -.long 0 -_ramend: -.long 0 - -/*****************************************************************************/ - -.text - -/* - * This is the codes first entry point. This is where it all - * begins... - */ - -_start: - nop /* filler */ - movew #0x2700, %sr /* no interrupts */ - - /* - * Do any platform or board specific setup now. Most boards - * don't need anything. Those exceptions are define this in - * their board specific includes. - */ - PLATFORM_SETUP - - /* - * Create basic memory configuration. Set VBR accordingly, - * and size memory. 
- */ - movel #CONFIG_VECTORBASE,%a7 - movec %a7,%VBR /* set vectors addr */ - movel %a7,_ramvec - - movel #CONFIG_RAMBASE,%a7 /* mark the base of RAM */ - movel %a7,_rambase - - GET_MEM_SIZE /* macro code determines size */ - addl %a7,%d0 - movel %d0,_ramend /* set end ram addr */ - - /* - * Now that we know what the memory is, lets enable cache - * and get things moving. This is Coldfire CPU specific. - */ - CACHE_ENABLE /* enable CPU cache */ - - -#ifdef CONFIG_ROMFS_FS - /* - * Move ROM filesystem above bss :-) - */ - lea _sbss,%a0 /* get start of bss */ - lea _ebss,%a1 /* set up destination */ - movel %a0,%a2 /* copy of bss start */ - - movel 8(%a0),%d0 /* get size of ROMFS */ - addql #8,%d0 /* allow for rounding */ - andl #0xfffffffc, %d0 /* whole words */ - - addl %d0,%a0 /* copy from end */ - addl %d0,%a1 /* copy from end */ - movel %a1,_ramstart /* set start of ram */ - -_copy_romfs: - movel -(%a0),%d0 /* copy dword */ - movel %d0,-(%a1) - cmpl %a0,%a2 /* check if at end */ - bne _copy_romfs - -#else /* CONFIG_ROMFS_FS */ - lea _ebss,%a1 - movel %a1,_ramstart -#endif /* CONFIG_ROMFS_FS */ - - - /* - * Zero out the bss region. - */ - lea _sbss,%a0 /* get start of bss */ - lea _ebss,%a1 /* get end of bss */ - clrl %d0 /* set value */ -_clear_bss: - movel %d0,(%a0)+ /* clear each word */ - cmpl %a0,%a1 /* check if at end */ - bne _clear_bss - - /* - * Load the current task pointer and stack. - */ - lea init_thread_union,%a0 - lea THREAD_SIZE(%a0),%sp - - /* - * Assember start up done, start code proper. - */ - jsr start_kernel /* start Linux kernel */ - -_exit: - jmp _exit /* should never get here */ - -/*****************************************************************************/ Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/5307/pit.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/5307/pit.c 2009-02-08 00:00:33.000000000 -0500 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,97 +0,0 @@ -/***************************************************************************/ - -/* - * pit.c -- Freescale ColdFire PIT timer. Currently this type of - * hardware timer only exists in the Freescale ColdFire - * 5270/5271, 5282 and other CPUs. - * - * Copyright (C) 1999-2007, Greg Ungerer (gerg@snapgear.com) - * Copyright (C) 2001-2004, SnapGear Inc. (www.snapgear.com) - */ - -/***************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/***************************************************************************/ - -/* - * By default use timer1 as the system clock timer. 
- */ -#define TA(a) (MCF_IPSBAR + MCFPIT_BASE1 + (a)) - -/***************************************************************************/ - -static irqreturn_t hw_tick(int irq, void *dummy) -{ - unsigned short pcsr; - - /* Reset the ColdFire timer */ - pcsr = __raw_readw(TA(MCFPIT_PCSR)); - __raw_writew(pcsr | MCFPIT_PCSR_PIF, TA(MCFPIT_PCSR)); - - return arch_timer_interrupt(irq, dummy); -} - -/***************************************************************************/ - -static struct irqaction coldfire_pit_irq = { - .name = "timer", - .flags = IRQF_DISABLED | IRQF_TIMER, - .handler = hw_tick, -}; - -void hw_timer_init(void) -{ - volatile unsigned char *icrp; - volatile unsigned long *imrp; - - setup_irq(MCFINT_VECBASE + MCFINT_PIT1, &coldfire_pit_irq); - - icrp = (volatile unsigned char *) (MCF_IPSBAR + MCFICM_INTC0 + - MCFINTC_ICR0 + MCFINT_PIT1); - *icrp = ICR_INTRCONF; - - imrp = (volatile unsigned long *) (MCF_IPSBAR + MCFICM_INTC0 + MCFPIT_IMR); - *imrp &= ~MCFPIT_IMR_IBIT; - - /* Set up PIT timer 1 as poll clock */ - __raw_writew(MCFPIT_PCSR_DISABLE, TA(MCFPIT_PCSR)); - __raw_writew(((MCF_CLK / 2) / 64) / HZ, TA(MCFPIT_PMR)); - __raw_writew(MCFPIT_PCSR_EN | MCFPIT_PCSR_PIE | MCFPIT_PCSR_OVW | - MCFPIT_PCSR_RLD | MCFPIT_PCSR_CLK64, TA(MCFPIT_PCSR)); -} - -/***************************************************************************/ - -unsigned long hw_timer_offset(void) -{ - volatile unsigned long *ipr; - unsigned long pmr, pcntr, offset; - - ipr = (volatile unsigned long *) (MCF_IPSBAR + MCFICM_INTC0 + MCFPIT_IMR); - - pmr = __raw_readw(TA(MCFPIT_PMR)); - pcntr = __raw_readw(TA(MCFPIT_PCNTR)); - - /* - * If we are still in the first half of the upcount and a - * timer interrupt is pending, then add on a ticks worth of time. - */ - offset = ((pmr - pcntr) * (1000000 / HZ)) / pmr; - if ((offset < (1000000 / HZ / 2)) && (*ipr & MCFPIT_IMR_IBIT)) - offset += 1000000 / HZ; - return offset; -} - -/***************************************************************************/ Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/5307/timers.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/5307/timers.c 2009-02-08 00:00:33.000000000 -0500 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,155 +0,0 @@ -/***************************************************************************/ - -/* - * timers.c -- generic ColdFire hardware timer support. - * - * Copyright (C) 1999-2007, Greg Ungerer (gerg@snapgear.com) - */ - -/***************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/***************************************************************************/ - -/* - * By default use timer1 as the system clock timer. - */ -#define TA(a) (MCF_MBAR + MCFTIMER_BASE1 + (a)) - -/* - * Default the timer and vector to use for ColdFire. Some ColdFire - * CPU's and some boards may want different. Their sub-architecture - * startup code (in config.c) can change these if they want. - */ -unsigned int mcf_timervector = 29; -unsigned int mcf_profilevector = 31; -unsigned int mcf_timerlevel = 5; - -/* - * These provide the underlying interrupt vector support. - * Unfortunately it is a little different on each ColdFire. 
- */ -extern void mcf_settimericr(int timer, int level); -extern int mcf_timerirqpending(int timer); - -#if defined(CONFIG_M532x) -#define __raw_readtrr __raw_readl -#define __raw_writetrr __raw_writel -#else -#define __raw_readtrr __raw_readw -#define __raw_writetrr __raw_writew -#endif - -/***************************************************************************/ - -static irqreturn_t hw_tick(int irq, void *dummy) -{ - /* Reset the ColdFire timer */ - __raw_writeb(MCFTIMER_TER_CAP | MCFTIMER_TER_REF, TA(MCFTIMER_TER)); - - return arch_timer_interrupt(irq, dummy); -} - -/***************************************************************************/ - -static struct irqaction coldfire_timer_irq = { - .name = "timer", - .flags = IRQF_DISABLED | IRQF_TIMER, - .handler = hw_tick, -}; - -/***************************************************************************/ - -static int ticks_per_intr; - -void hw_timer_init(void) -{ - setup_irq(mcf_timervector, &coldfire_timer_irq); - - __raw_writew(MCFTIMER_TMR_DISABLE, TA(MCFTIMER_TMR)); - ticks_per_intr = (MCF_BUSCLK / 16) / HZ; - __raw_writetrr(ticks_per_intr - 1, TA(MCFTIMER_TRR)); - __raw_writew(MCFTIMER_TMR_ENORI | MCFTIMER_TMR_CLK16 | - MCFTIMER_TMR_RESTART | MCFTIMER_TMR_ENABLE, TA(MCFTIMER_TMR)); - - mcf_settimericr(1, mcf_timerlevel); - -#ifdef CONFIG_HIGHPROFILE - coldfire_profile_init(); -#endif -} - -/***************************************************************************/ - -unsigned long hw_timer_offset(void) -{ - unsigned long tcn, offset; - - tcn = __raw_readw(TA(MCFTIMER_TCN)); - offset = ((tcn + 1) * (1000000 / HZ)) / ticks_per_intr; - - /* Check if we just wrapped the counters and maybe missed a tick */ - if ((offset < (1000000 / HZ / 2)) && mcf_timerirqpending(1)) - offset += 1000000 / HZ; - return offset; -} - -/***************************************************************************/ -#ifdef CONFIG_HIGHPROFILE -/***************************************************************************/ - -/* - * By default use timer2 as the profiler clock timer. - */ -#define PA(a) (MCF_MBAR + MCFTIMER_BASE2 + (a)) - -/* - * Choose a reasonably fast profile timer. Make it an odd value to - * try and get good coverage of kernel operations. - */ -#define PROFILEHZ 1013 - -/* - * Use the other timer to provide high accuracy profiling info. 
- */ -irqreturn_t coldfire_profile_tick(int irq, void *dummy) -{ - /* Reset ColdFire timer2 */ - __raw_writeb(MCFTIMER_TER_CAP | MCFTIMER_TER_REF, PA(MCFTIMER_TER)); - if (current->pid) - profile_tick(CPU_PROFILING, regs); - return IRQ_HANDLED; -} - -/***************************************************************************/ - -void coldfire_profile_init(void) -{ - printk(KERN_INFO "PROFILE: lodging TIMER2 @ %dHz as profile timer\n", PROFILEHZ); - - /* Set up TIMER 2 as high speed profile clock */ - __raw_writew(MCFTIMER_TMR_DISABLE, PA(MCFTIMER_TMR)); - - __raw_writetrr(((MCF_CLK / 16) / PROFILEHZ), PA(MCFTIMER_TRR)); - __raw_writew(MCFTIMER_TMR_ENORI | MCFTIMER_TMR_CLK16 | - MCFTIMER_TMR_RESTART | MCFTIMER_TMR_ENABLE, PA(MCFTIMER_TMR)); - - request_irq(mcf_profilevector, coldfire_profile_tick, - (IRQF_DISABLED | IRQ_FLG_FAST), "profile timer", NULL); - mcf_settimericr(2, 7); -} - -/***************************************************************************/ -#endif /* CONFIG_HIGHPROFILE */ -/***************************************************************************/ Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/5307/vectors.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/5307/vectors.c 2009-02-08 00:00:33.000000000 -0500 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,105 +0,0 @@ -/***************************************************************************/ - -/* - * linux/arch/m68knommu/platform/5307/vectors.c - * - * Copyright (C) 1999-2007, Greg Ungerer - */ - -/***************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/***************************************************************************/ - -#ifdef TRAP_DBG_INTERRUPT - -asmlinkage void dbginterrupt_c(struct frame *fp) -{ - extern void dump(struct pt_regs *fp); - printk(KERN_DEBUG "%s(%d): BUS ERROR TRAP\n", __FILE__, __LINE__); - dump((struct pt_regs *) fp); - asm("halt"); -} - -#endif - -/***************************************************************************/ - -extern e_vector *_ramvec; - -void set_evector(int vecnum, void (*handler)(void)) -{ - if (vecnum >= 0 && vecnum <= 255) - _ramvec[vecnum] = handler; -} - -/***************************************************************************/ - -/* Assembler routines */ -asmlinkage void buserr(void); -asmlinkage void trap(void); -asmlinkage void system_call(void); -asmlinkage void inthandler(void); - -void __init init_vectors(void) -{ - int i; - - /* - * There is a common trap handler and common interrupt - * handler that handle almost every vector. We treat - * the system call and bus error special, they get their - * own first level handlers. 
- */ - for (i = 3; (i <= 23); i++) - _ramvec[i] = trap; - for (i = 33; (i <= 63); i++) - _ramvec[i] = trap; - for (i = 24; (i <= 31); i++) - _ramvec[i] = inthandler; - for (i = 64; (i < 255); i++) - _ramvec[i] = inthandler; - _ramvec[255] = 0; - - _ramvec[2] = buserr; - _ramvec[32] = system_call; - -#ifdef TRAP_DBG_INTERRUPT - _ramvec[12] = dbginterrupt; -#endif -} - -/***************************************************************************/ - -void enable_vector(unsigned int irq) -{ - /* Currently no action on ColdFire */ -} - -void disable_vector(unsigned int irq) -{ - /* Currently no action on ColdFire */ -} - -void ack_vector(unsigned int irq) -{ - /* Currently no action on ColdFire */ -} - -/***************************************************************************/ - -void coldfire_reset(void) -{ - HARD_RESET_NOW(); -} - -/***************************************************************************/ Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/532x/config.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/532x/config.c 2009-02-08 00:00:33.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/platform/532x/config.c 2009-02-08 00:00:48.000000000 -0500 @@ -21,10 +21,11 @@ #include #include #include -#include +#include #include #include #include +#include #include #include @@ -38,11 +39,75 @@ extern unsigned int mcf_timerlevel; /***************************************************************************/ -/* - * DMA channel base address table. - */ -unsigned int dma_base_addr[MAX_M68K_DMA_CHANNELS] = { }; -unsigned int dma_device_address[MAX_M68K_DMA_CHANNELS]; +int sys_clk_khz = 0; +int sys_clk_mhz = 0; + +void wtm_init(void); +void scm_init(void); +void gpio_init(void); +void fbcs_init(void); +void sdramc_init(void); +int clock_pll (int fsys, int flags); +int clock_limp (int); +int clock_exit_limp (void); +int get_sys_clock (void); + +/***************************************************************************/ + +static struct mcf_platform_uart m532x_uart_platform[] = { + { + .mapbase = MCF_MBAR + MCFUART_BASE1, + .irq = MCFINT_VECBASE + MCFINT_UART0, + }, + { + .mapbase = MCF_MBAR + MCFUART_BASE2, + .irq = MCFINT_VECBASE + MCFINT_UART1, + }, + { + .mapbase = MCF_MBAR + MCFUART_BASE3, + .irq = MCFINT_VECBASE + MCFINT_UART2, + }, + { }, +}; + +static struct platform_device m532x_uart = { + .name = "mcfuart", + .id = 0, + .dev.platform_data = m532x_uart_platform, +}; + +static struct platform_device *m532x_devices[] __initdata = { + &m532x_uart, +}; + +/***************************************************************************/ + +static void __init m532x_uart_init_line(int line, int irq) +{ + if (line == 0) { + MCF_INTC0_ICR26 = 0x3; + MCF_INTC0_CIMR = 26; + /* GPIO initialization */ + MCF_GPIO_PAR_UART |= 0x000F; + } else if (line == 1) { + MCF_INTC0_ICR27 = 0x3; + MCF_INTC0_CIMR = 27; + /* GPIO initialization */ + MCF_GPIO_PAR_UART |= 0x0FF0; + } else if (line == 2) { + MCF_INTC0_ICR28 = 0x3; + MCF_INTC0_CIMR = 28; + } +} + +static void __init m532x_uarts_init(void) +{ + const int nrlines = ARRAY_SIZE(m532x_uart_platform); + int line; + + for (line = 0; (line < nrlines); line++) + m532x_uart_init_line(line, m532x_uart_platform[line].irq); +} /***************************************************************************/ @@ -66,22 +131,11 @@ void mcf_settimericr(unsigned int timer, /***************************************************************************/ -int mcf_timerirqpending(int timer) +void 
__init config_BSP(char *commandp, int size)
 {
-	unsigned int imr = 0;
-
-	switch (timer) {
-	case 1: imr = 0x1; break;
-	case 2: imr = 0x2; break;
-	default: break;
-	}
-	return (mcf_getiprh() & imr);
-}
-
-/***************************************************************************/
+	sys_clk_khz = get_sys_clock();
+	sys_clk_mhz = sys_clk_khz/1000;
 
-void config_BSP(char *commandp, int size)
-{
 	mcf_setimr(MCFSIM_IMR_MASKALL);
 
 #if !defined(CONFIG_BOOTPARAM)
@@ -99,7 +153,7 @@ void config_BSP(char *commandp, int size
 	mcf_profilevector = 64+33;
 	mach_reset = coldfire_reset;
 
-#ifdef MCF_BDM_DISABLE
+#ifdef CONFIG_BDM_DISABLE
 	/*
 	 * Disable the BDM clocking. This also turns off most of the rest of
 	 * the BDM device. This is good for EMC reasons. This option is not
@@ -110,6 +164,17 @@ void config_BSP(char *commandp, int size
 }
 
 /***************************************************************************/
+
+static int __init init_BSP(void)
+{
+	m532x_uarts_init();
+	platform_add_devices(m532x_devices, ARRAY_SIZE(m532x_devices));
+	return 0;
+}
+
+arch_initcall(init_BSP);
+
+/***************************************************************************/
 /* Board initialization */
 /********************************************************************/
@@ -152,24 +217,9 @@ void config_BSP(char *commandp, int size
 
 #define NAND_FLASH_ADDRESS	(0xD0000000)
 
-int sys_clk_khz = 0;
-int sys_clk_mhz = 0;
-
-void wtm_init(void);
-void scm_init(void);
-void gpio_init(void);
-void fbcs_init(void);
-void sdramc_init(void);
-int clock_pll (int fsys, int flags);
-int clock_limp (int);
-int clock_exit_limp (void);
-int get_sys_clock (void);
 
 asmlinkage void __init sysinit(void)
 {
-	sys_clk_khz = clock_pll(0, 0);
-	sys_clk_mhz = sys_clk_khz/1000;
-
 	wtm_init();
 	scm_init();
 	gpio_init();
@@ -207,25 +257,61 @@ void scm_init(void)
 
 void fbcs_init(void)
 {
+#if defined(CONFIG_COBRA5329)
+	/* The COBRA5329 by senTec needs these settings */
+
+	/*
+	 * We need to give the LCD enough bandwidth
+	 */
+
+	MCF_XBS_PRS1 = MCF_XBS_PRIO_LCD(MCF_PRIO_LVL_1)
+		| MCF_XBS_PRIO_CORE(MCF_PRIO_LVL_2)
+		| MCF_XBS_PRIO_FEC(MCF_PRIO_LVL_3)
+		| MCF_XBS_PRIO_USBHOST(MCF_PRIO_LVL_4)
+		| MCF_XBS_PRIO_EDMA(MCF_PRIO_LVL_5)
+		| MCF_XBS_PRIO_USBOTG(MCF_PRIO_LVL_6)
+		| MCF_XBS_PRIO_FACTTEST(MCF_PRIO_LVL_7);
+
+	/* Boot Flash connected to FBCS0 */
+	MCF_FBCS0_CSAR = FLASH_ADDRESS;
+	MCF_FBCS0_CSCR = (MCF_FBCS_CSCR_PS_16
+			| MCF_FBCS_CSCR_BEM
+			| MCF_FBCS_CSCR_AA
+			| MCF_FBCS_CSCR_WS(8));
+
+	MCF_FBCS0_CSMR = (MCF_FBCS_CSMR_BAM_1G
+			| MCF_FBCS_CSMR_V);
+
+	/* Fix bug #10 in the errata */
+	MCF_FBCS1_CSAR = 0xC0000000;
+	MCF_FBCS1_CSCR = (MCF_FBCS_CSCR_PS_16
+			| MCF_FBCS_CSCR_BEM
+			| MCF_FBCS_CSCR_AA
+			| MCF_FBCS_CSCR_WS(8));
+
+	MCF_FBCS1_CSMR = (0x30000000
+			| MCF_FBCS_CSMR_V
+			| MCF_FBCS_CSMR_WP );
+#else
 	MCF_GPIO_PAR_CS = 0x0000003E;
 
 	/* Latch chip select */
 	MCF_FBCS1_CSAR = 0x10080000;
-	MCF_FBCS1_CSCR = 0x002A3780;
+	MCF_FBCS1_CSCR = 0x002A3580 | (MCF_FBCS1_CSCR&0x200);
 	MCF_FBCS1_CSMR = (MCF_FBCS_CSMR_BAM_2M | MCF_FBCS_CSMR_V);
 
 	/* Initialize latch to drive signals to inactive states */
-	*((u16 *)(0x10080000)) = 0xFFFF;
+	*((u16 *)(0x10080000)) = 0xD3FF;
 
-	/* External SRAM */
-	MCF_FBCS1_CSAR = EXT_SRAM_ADDRESS;
-	MCF_FBCS1_CSCR = (MCF_FBCS_CSCR_PS_16
-			| MCF_FBCS_CSCR_AA
-			| MCF_FBCS_CSCR_SBM
-			| MCF_FBCS_CSCR_WS(1));
-	MCF_FBCS1_CSMR = (MCF_FBCS_CSMR_BAM_512K
-			| MCF_FBCS_CSMR_V);
+//	/* External SRAM */
+//	MCF_FBCS1_CSAR = EXT_SRAM_ADDRESS;
//	MCF_FBCS1_CSCR = (MCF_FBCS_CSCR_PS_16
+//			| MCF_FBCS_CSCR_AA
+//			| MCF_FBCS_CSCR_SBM
+//			| MCF_FBCS_CSCR_WS(1));
+// MCF_FBCS1_CSMR = (MCF_FBCS_CSMR_BAM_512K +// | MCF_FBCS_CSMR_V); /* Boot Flash connected to FBCS0 */ MCF_FBCS0_CSAR = FLASH_ADDRESS; @@ -236,6 +322,7 @@ void fbcs_init(void) | MCF_FBCS_CSCR_WS(7)); MCF_FBCS0_CSMR = (MCF_FBCS_CSMR_BAM_32M | MCF_FBCS_CSMR_V); +#endif } void sdramc_init(void) Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/532x/spi-mcf532x.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/arch/m68knommu/platform/532x/spi-mcf532x.c 2009-02-08 00:00:48.000000000 -0500 @@ -0,0 +1,176 @@ +/***************************************************************************/ +/* + * linux/arch/m68knommu/platform/532x/spi-mcf532x.c + * + * Sub-architcture dependant initialization code for the Freescale + * 532x SPI module + * + * Yaroslav Vinogradov yaroslav.vinogradov@freescale.com + * Copyright Freescale Semiconductor, Inc 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ +/***************************************************************************/ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define SPI_NUM_CHIPSELECTS 0x04 +#define SPI_PAR_VAL 0xFFF0 /* Enable DIN, DOUT, CLK */ + +#define MCF532x_QSPI_IRQ_SOURCE (31) +#define MCF532x_QSPI_IRQ_VECTOR (64 + MCF532x_QSPI_IRQ_SOURCE) + +#define MCF532x_QSPI_PAR (0xFC0A405A) +#define MCF532x_QSPI_QMR (0xFC05C000) +#define MCF532x_INTC0_ICR (0xFC048040) +#define MCF532x_INTC0_IMRL (0xFC04800C) + +/* on 5329 EVB ADS7843 is connected to IRQ4 */ +#define ADS784x_IRQ_SOURCE 4 +#define ADS784x_IRQ_VECTOR (64+ADS784x_IRQ_SOURCE) +#define ADS7843_IRQ_LEVEL 2 + + +void coldfire_qspi_cs_control(u8 cs, u8 command) +{ +} + +#if defined(CONFIG_TOUCHSCREEN_ADS7843) +static struct coldfire_spi_chip ads784x_chip_info = { + .mode = SPI_MODE_0, + .bits_per_word = 8, + .del_cs_to_clk = 17, + .del_after_trans = 1, + .void_write_data = 0 +}; + +static struct ads7843_platform_data ads784x_platform_data = { + .model = 7843, + .vref_delay_usecs = 0, + .x_plate_ohms = 580, + .y_plate_ohms = 410 +}; +#endif + + +static struct spi_board_info spi_board_info[] = { +#if defined(CONFIG_TOUCHSCREEN_ADS7843) + { + .modalias = "ads7843", + .max_speed_hz = 125000 * 16, + .bus_num = 1, + .chip_select = 1, + .irq = ADS784x_IRQ_VECTOR, + .platform_data = &ads784x_platform_data, + .controller_data = &ads784x_chip_info + } +#endif +}; + +static struct coldfire_spi_master coldfire_master_info = { + .bus_num = 1, + .num_chipselect = SPI_NUM_CHIPSELECTS, + .irq_source = MCF532x_QSPI_IRQ_SOURCE, + .irq_vector = MCF532x_QSPI_IRQ_VECTOR, + .irq_mask = (0x01 << MCF532x_QSPI_IRQ_SOURCE), + .irq_lp = 0x5, /* Level */ + .par_val = 0, /* not used on 532x */ + .par_val16 = SPI_PAR_VAL, + .cs_control = coldfire_qspi_cs_control, +}; + +static struct resource coldfire_spi_resources[] = { + [0] = { + .name = "qspi-par", + .start = MCF532x_QSPI_PAR, + .end = MCF532x_QSPI_PAR, + .flags = IORESOURCE_MEM + }, + + [1] = { + .name = "qspi-module", + .start = MCF532x_QSPI_QMR, + .end = MCF532x_QSPI_QMR + 0x18, + .flags = IORESOURCE_MEM + }, + + [2] = { + .name = "qspi-int-level", + .start = MCF532x_INTC0_ICR + MCF532x_QSPI_IRQ_SOURCE, + .end = MCF532x_INTC0_ICR + 
MCF532x_QSPI_IRQ_SOURCE,
+		.flags = IORESOURCE_MEM
+	},
+
+	[3] = {
+		.name = "qspi-int-mask",
+		.start = MCF532x_INTC0_IMRL,
+		.end = MCF532x_INTC0_IMRL,
+		.flags = IORESOURCE_MEM
+	}
+};
+
+static struct platform_device coldfire_spi = {
+	.name = "coldfire-qspi",
+	.id = -1,
+	.resource = coldfire_spi_resources,
+	.num_resources = ARRAY_SIZE(coldfire_spi_resources),
+	.dev = {
+		.platform_data = &coldfire_master_info,
+	}
+};
+
+#if defined(CONFIG_TOUCHSCREEN_ADS7843)
+static int __init init_ads7843(void)
+{
+	/* GPIO initialization */
+	MCF_GPIO_PAR_IRQ = MCF_GPIO_PAR_IRQ_PAR_IRQ4(0);
+	/* EPORT initialization */
+	MCF_EPORT_EPPAR = MCF_EPORT_EPPAR_EPPA4(MCF_EPORT_EPPAR_FALLING);
+	MCF_EPORT_EPDDR = 0;
+	MCF_EPORT_EPIER = MCF_EPORT_EPIER_EPIE4;
+	/* enable interrupt source */
+	MCF_INTC0_ICR4 = ADS7843_IRQ_LEVEL;
+	MCF_INTC0_CIMR = ADS784x_IRQ_SOURCE;
+	return 0;	/* declared int; a return value was missing */
+}
+#endif
+
+static int __init spi_dev_init(void)
+{
+	int retval = 0;
+#if defined(CONFIG_TOUCHSCREEN_ADS7843)
+	init_ads7843();
+#endif
+
+	retval = platform_device_register(&coldfire_spi);
+	if (retval < 0)
+		goto out;
+
+	if (ARRAY_SIZE(spi_board_info))
+		retval = spi_register_board_info(spi_board_info, ARRAY_SIZE(spi_board_info));
+
+
+out:
+	return retval;
+}
+
+arch_initcall(spi_dev_init);
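Note for reviewers: spi_register_board_info() above only records the table. When
the "coldfire-qspi" master registers, the SPI core creates one spi_device per
entry and binds it to whichever protocol driver matches the entry's modalias. A
minimal sketch of such a consumer against the 2.6.24 SPI API follows; the driver
itself is hypothetical and not part of this patch, only the "ads7843" name ties
it to the board info above.

	#include <linux/init.h>
	#include <linux/module.h>
	#include <linux/spi/spi.h>

	/* hypothetical protocol driver; binds by name to the "ads7843"
	 * spi_board_info entry registered above */
	static int example_touch_probe(struct spi_device *spi)
	{
		/* max_speed_hz, chip_select and irq were filled in from
		 * the spi_board_info entry by the SPI core */
		dev_info(&spi->dev, "bound: %u Hz, cs %u, irq %d\n",
			 spi->max_speed_hz, spi->chip_select, spi->irq);
		return 0;
	}

	static struct spi_driver example_touch_driver = {
		.driver = {
			.name	= "ads7843",	/* matches .modalias above */
			.owner	= THIS_MODULE,
		},
		.probe	= example_touch_probe,
	};

	static int __init example_touch_init(void)
	{
		return spi_register_driver(&example_touch_driver);
	}
	module_init(example_touch_init);

Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/532x/usb-mcf532x.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.24.7-rt27/arch/m68knommu/platform/532x/usb-mcf532x.c	2009-02-08 00:00:48.000000000 -0500
@@ -0,0 +1,171 @@
+/***************************************************************************
+ * usb-mcf532x.c - Platform level (mcf532x) USB initialization.
+ *
+ * Andrey Butok Andrey.Butok@freescale.com.
+ * Copyright Freescale Semiconductor, Inc 2006
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ ***************************************************************************
+ * Changes:
+ *   v0.01	31 March 2006	Andrey Butok
+ *		Initial Release - developed on uClinux with 2.6.15.6 kernel
+ *
+ * WARNING: The MCF532x USB functionality was tested
+ *	    only with low-speed USB devices (because of HW bugs).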
+ */
+
+#undef	DEBUG
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* Start address of HC registers.*/
+#define MCF532x_USB_HOST_REG_START	(0xfc0b4000)
+/* End address of HC registers */
+#define MCF532x_USB_HOST_REG_END	(MCF532x_USB_HOST_REG_START+0x200)
+/* USB Host Interrupt number */
+#define MCF532x_USB_HOST_INT_NUMBER	(128+48)
+
+#ifdef CONFIG_USB_OTG
+/* Start address of OTG module registers.*/
+#define MCF532x_USB_OTG_REG_START	(0xfc0b0000)
+/* End address of OTG module registers */
+#define MCF532x_USB_OTG_REG_END		(MCF532x_USB_OTG_REG_START+0x200)
+/* USB OTG Interrupt number */
+#define MCF532x_USB_OTG_INT_NUMBER	(128+47)
+#endif
+
+/*-------------------------------------------------------------------------*/
+
+static void
+usb_release(struct device *dev)
+{
+	/* normally not freed */
+}
+
+/*
+ * USB Host module structures
+ */
+static struct resource ehci_host_resources[] = {
+	{
+		.start = MCF532x_USB_HOST_REG_START,
+		.end = MCF532x_USB_HOST_REG_END,
+		.flags = IORESOURCE_MEM,
+	},
+	{
+		.start = MCF532x_USB_HOST_INT_NUMBER,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device ehci_host_device = {
+	.name = "ehci",
+	.id = 1,
+	.dev = {
+		.release = usb_release,
+		.dma_mask = 0x0},
+	.num_resources = ARRAY_SIZE(ehci_host_resources),
+	.resource = ehci_host_resources,
+};
+
+/*
+ * USB OTG module structures.
+ */
+#ifdef CONFIG_USB_OTG
+static struct resource ehci_otg_resources[] = {
+	{
+		.start = MCF532x_USB_OTG_REG_START,
+		.end = MCF532x_USB_OTG_REG_END,
+		.flags = IORESOURCE_MEM,
+	},
+	{
+		.start = MCF532x_USB_OTG_INT_NUMBER,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device ehci_otg_device = {
+	.name = "ehci",
+	.id = 0,
+	.dev = {
+		.release = usb_release,
+		.dma_mask = 0x0},
+	.num_resources = ARRAY_SIZE(ehci_otg_resources),
+	.resource = ehci_otg_resources,
+};
+#endif
+
+typedef volatile u8 vuint8;	/*  8 bits */
+
+static int __init
+mcf532x_usb_init(void)
+{
+	int status;
+
+	/*
+	 * Initialize the clock divider for the USB:
+	 */
+#if CONFIG_CLOCK_FREQ == 240000000
+	/*
+	 * CPU operating at 240MHz (MISCCR[USBDIV]=1),
+	 * this is the default
+	 */
+	(*(volatile u16 *) (0xFC0A0010)) |= (0x0002);
+#elif CONFIG_CLOCK_FREQ == 180000000
+	/*
+	 * CPU operating at 180MHz (MISCCR[USBDIV]=0)
+	 */
+	(*(volatile u16 *) (0xFC0A0010)) &= ~(0x0002);
+#else
+	#error "CLOCK must be 240MHz or 180MHz"
+#endif
+	/*
+	 * Register USB Host device:
+	 */
+	status = platform_device_register(&ehci_host_device);
+	if (status) {
+		pr_info
+		    ("USB-MCF532x: Can't register MCF532x USB Host device, %d\n",
+		     status);
+		return -ENODEV;
+	}
+	pr_info("USB-MCF532x: MCF532x USB Host device is registered\n");
+
+#ifdef CONFIG_USB_OTG
+	/*
+	 * Register USB OTG device:
+	 * Only the USB Host part is done so far.
+	 * TODO: Device and OTG functionality.
+ */
+	status = platform_device_register(&ehci_otg_device);
+	if (status) {
+		pr_info
+		    ("USB-MCF532x: Can't register MCF532x USB OTG device, %d\n",
+		     status);
+		return -ENODEV;
+	}
+	pr_info("USB-MCF532x: MCF532x USB OTG device is registered\n");
+#endif
+
+	return 0;
+}
+
+subsys_initcall(mcf532x_usb_init);
Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/5407/config.c
===================================================================
--- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/5407/config.c	2009-02-08 00:00:33.000000000 -0500
+++ linux-2.6.24.7-rt27/arch/m68knommu/platform/5407/config.c	2009-02-08 00:00:48.000000000 -0500
@@ -13,11 +13,11 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
-#include
+#include
 
 /***************************************************************************/
 
@@ -29,17 +29,51 @@ extern unsigned int mcf_timerlevel;
 
 /***************************************************************************/
 
-/*
- *	DMA channel base address table.
- */
-unsigned int dma_base_addr[MAX_M68K_DMA_CHANNELS] = {
-	MCF_MBAR + MCFDMA_BASE0,
-	MCF_MBAR + MCFDMA_BASE1,
-	MCF_MBAR + MCFDMA_BASE2,
-	MCF_MBAR + MCFDMA_BASE3,
+static struct mcf_platform_uart m5407_uart_platform[] = {
+	{
+		.mapbase	= MCF_MBAR + MCFUART_BASE1,
+		.irq		= 73,
+	},
+	{
+		.mapbase	= MCF_MBAR + MCFUART_BASE2,
+		.irq		= 74,
+	},
+	{ },
 };
 
-unsigned int dma_device_address[MAX_M68K_DMA_CHANNELS];
+static struct platform_device m5407_uart = {
+	.name			= "mcfuart",
+	.id			= 0,
+	.dev.platform_data	= m5407_uart_platform,
+};
+
+static struct platform_device *m5407_devices[] __initdata = {
+	&m5407_uart,
+};
+
+/***************************************************************************/
+
+static void __init m5407_uart_init_line(int line, int irq)
+{
+	if (line == 0) {
+		writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR);
+		writeb(irq, MCFUART_BASE1 + MCFUART_UIVR);
+		mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART1);
+	} else if (line == 1) {
+		writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR);
+		writeb(irq, MCFUART_BASE2 + MCFUART_UIVR);
+		mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART2);
+	}
+}
+
+static void __init m5407_uarts_init(void)
+{
+	const int nrlines = ARRAY_SIZE(m5407_uart_platform);
+	int line;
+
+	for (line = 0; (line < nrlines); line++)
+		m5407_uart_init_line(line, m5407_uart_platform[line].irq);
+}
 
 /***************************************************************************/
 
@@ -76,21 +110,7 @@ void mcf_settimericr(unsigned int timer,
 
 /***************************************************************************/
 
-int mcf_timerirqpending(int timer)
-{
-	unsigned int imr = 0;
-
-	switch (timer) {
-	case 1: imr = MCFSIM_IMR_TIMER1; break;
-	case 2: imr = MCFSIM_IMR_TIMER2; break;
-	default: break;
-	}
-	return (mcf_getipr() & imr);
-}
-
-/***************************************************************************/
-
-void config_BSP(char *commandp, int size)
+void __init config_BSP(char *commandp, int size)
 {
 	mcf_setimr(MCFSIM_IMR_MASKALL);
 
@@ -105,3 +125,14 @@ void config_BSP(char *commandp, int size
 }
 
 /***************************************************************************/
+
+static int __init init_BSP(void)
+{
+	m5407_uarts_init();
+	platform_add_devices(m5407_devices, ARRAY_SIZE(m5407_devices));
+	return 0;
+}
+
+arch_initcall(init_BSP);
+
+/***************************************************************************/
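Note for reviewers: the 5307, 5407 and 532x BSP conversions in this series all
share one shape: a NULL-terminated UART platform-data table, one "mcfuart"
platform_device carrying it, and an arch_initcall that registers the devices. A
self-contained sketch of that shape follows; the myboard_* names and the
mapbase/irq values are hypothetical, only the "mcfuart" string and the
table-plus-initcall pattern come from the patch.

	#include <linux/kernel.h>
	#include <linux/init.h>
	#include <linux/platform_device.h>

	/* hypothetical stand-in for mcf_platform_uart */
	struct myboard_uart_data {
		unsigned long	mapbase;	/* physical base of the UART block */
		unsigned int	irq;		/* interrupt line for this port */
	};

	static struct myboard_uart_data myboard_uart_data[] = {
		{ .mapbase = 0x10000200, .irq = 73 },	/* hypothetical port 0 */
		{ },	/* empty entry terminates the table for the driver */
	};

	static struct platform_device myboard_uart = {
		.name			= "mcfuart",	/* must match the UART driver */
		.id			= 0,
		.dev.platform_data	= myboard_uart_data,
	};

	static struct platform_device *myboard_devices[] __initdata = {
		&myboard_uart,
	};

	static int __init myboard_init_BSP(void)
	{
		/* binds the "mcfuart" driver to the device(s) above */
		return platform_add_devices(myboard_devices,
					    ARRAY_SIZE(myboard_devices));
	}
	arch_initcall(myboard_init_BSP);

Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/68328/ints.c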
===================================================================
--- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/68328/ints.c	2009-02-08 00:00:33.000000000 -0500
+++ linux-2.6.24.7-rt27/arch/m68knommu/platform/68328/ints.c	2009-02-08 00:00:48.000000000 -0500
@@ -101,6 +101,8 @@ void __init init_vectors(void)
 	IMR = ~0;
 }
 
+void do_IRQ(int irq, struct pt_regs *fp);
+
 /* The 68k family did not have a good way to determine the source
  * of interrupts until later in the family.  The EC000 core does
  * not provide the vector number on the stack, we vector everything
Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/68328/timers.c
===================================================================
--- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/68328/timers.c	2009-02-08 00:00:33.000000000 -0500
+++ linux-2.6.24.7-rt27/arch/m68knommu/platform/68328/timers.c	2009-02-08 00:00:48.000000000 -0500
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -51,6 +52,19 @@
 #define TICKS_PER_JIFFY	10
 #endif
 
+static u32 m68328_tick_cnt;
+
+/***************************************************************************/
+
+static irqreturn_t hw_tick(int irq, void *dummy)
+{
+	/* Reset Timer1 */
+	TSTAT &= 0;
+
+	m68328_tick_cnt += TICKS_PER_JIFFY;
+	return arch_timer_interrupt(irq, dummy);
+}
+
 /***************************************************************************/
 
 static irqreturn_t hw_tick(int irq, void *dummy)
@@ -69,6 +83,33 @@ static struct irqaction m68328_timer_irq
 	.handler = hw_tick,
 };
 
+/***************************************************************************/
+
+static cycle_t m68328_read_clk(void)
+{
+	unsigned long flags;
+	u32 cycles;
+
+	local_irq_save(flags);
+	cycles = m68328_tick_cnt + TCN;
+	local_irq_restore(flags);
+
+	return cycles;
+}
+
+/***************************************************************************/
+
+static struct clocksource m68328_clk = {
+	.name	= "timer",
+	.rating	= 250,
+	.read	= m68328_read_clk,
+	.shift	= 20,
+	.mask	= CLOCKSOURCE_MASK(32),
+	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+/***************************************************************************/
+
 void hw_timer_init(void)
 {
 	/* disable timer 1 */
@@ -84,19 +125,8 @@ void hw_timer_init(void)
 
 	/* Enable timer 1 */
 	TCTL |= TCTL_TEN;
-}
-
-/***************************************************************************/
-
-unsigned long hw_timer_offset(void)
-{
-	unsigned long ticks = TCN, offset = 0;
-
-	/* check for pending interrupt */
-	if (ticks < (TICKS_PER_JIFFY >> 1) && (ISR & (1 << TMR_IRQ_NUM)))
-		offset = 1000000 / HZ;
-	ticks = (ticks * 1000000 / HZ) / TICKS_PER_JIFFY;
-	return ticks + offset;
+	m68328_clk.mult = clocksource_hz2mult(TICKS_PER_JIFFY*HZ, m68328_clk.shift);
+	clocksource_register(&m68328_clk);
 }
 
 /***************************************************************************/
Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/68360/config.c
===================================================================
--- linux-2.6.24.7-rt27.orig/arch/m68knommu/platform/68360/config.c	2009-02-08 00:00:34.000000000 -0500
+++ linux-2.6.24.7-rt27/arch/m68knommu/platform/68360/config.c	2009-02-08 00:00:48.000000000 -0500
@@ -103,11 +103,6 @@ void hw_timer_init(void)
 	pquicc->timer_tgcr = tgcr_save;
 }
 
-unsigned long hw_timer_offset(void)
-{
-	return 0;
-}
-
 void BSP_gettod (int *yearp, int *monp, int *dayp,
 		 int *hourp, int *minp, int *secp)
 {
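Note for reviewers: the 68328 conversion above replaces hw_timer_offset() with a
clocksource. The core converts counter ticks to nanoseconds as
ns = (cycles * mult) >> shift, with mult precomputed from the counter frequency.
A worked sketch of that arithmetic, assuming the hunk's shift of 20 and a tick
rate of TICKS_PER_JIFFY * HZ = 10 * 100 = 1000 Hz (both values are
config-dependent):

	#include <stdint.h>

	/* same idea as clocksource_hz2mult(): mult = (1e9 << shift) / hz */
	static uint64_t hz2mult(uint32_t hz, uint32_t shift)
	{
		uint64_t tmp = (uint64_t)1000000000 << shift;

		tmp += hz / 2;		/* round to nearest */
		return tmp / hz;
	}

	/*
	 * The core only ever feeds small masked deltas through this,
	 * which is what keeps the 64-bit multiply from overflowing.
	 */
	static uint64_t cyc2ns(uint64_t cycles, uint64_t mult, uint32_t shift)
	{
		return (cycles * mult) >> shift;
	}

	/*
	 * With hz = 1000 and shift = 20: mult ~ (1e9 << 20) / 1000, so
	 * cyc2ns(c, mult, 20) ~ c * 1000000. One tick comes out as 1 ms,
	 * as expected for a counter advancing TICKS_PER_JIFFY per 10 ms
	 * jiffy.
	 */

Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/Makefile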
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/Makefile	2009-02-08 00:00:48.000000000 -0500
@@ -0,0 +1,32 @@
+#
+# Makefile for the m68knommu kernel.
+#
+
+#
+# If you want to play with the HW breakpoints then you will
+# need to define this, which will give you a stack backtrace
+# on the console port whenever a DBG interrupt occurs. You have to
+# set up your HW breakpoints to trigger a DBG interrupt:
+#
+# EXTRA_CFLAGS += -DTRAP_DBG_INTERRUPT
+# EXTRA_AFLAGS += -DTRAP_DBG_INTERRUPT
+#
+
+ifdef CONFIG_FULLDEBUG
+AFLAGS += -DDEBUGGER_COMPATIBLE_CACHE=1
+endif
+
+obj-$(CONFIG_COLDFIRE)	+= dma.o entry.o vectors.o
+obj-$(CONFIG_M5206)	+= timers.o
+obj-$(CONFIG_M5206e)	+= timers.o
+obj-$(CONFIG_M520x)	+= pit.o
+obj-$(CONFIG_M523x)	+= pit.o dma_timer.o irq_chip.o
+obj-$(CONFIG_M5249)	+= timers.o
+obj-$(CONFIG_M527x)	+= pit.o
+obj-$(CONFIG_M5272)	+= timers.o
+obj-$(CONFIG_M528x)	+= pit.o
+obj-$(CONFIG_M5307)	+= timers.o
+obj-$(CONFIG_M532x)	+= timers.o
+obj-$(CONFIG_M5407)	+= timers.o
+
+extra-y := head.o
Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/dma.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/dma.c	2009-02-08 00:00:48.000000000 -0500
@@ -0,0 +1,39 @@
+/***************************************************************************/
+
+/*
+ *	dma.c -- Freescale ColdFire DMA support
+ *
+ *	Copyright (C) 2007, Greg Ungerer (gerg@snapgear.com)
+ */
+
+/***************************************************************************/
+
+#include
+#include
+#include
+#include
+#include
+
+/***************************************************************************/
+
+/*
+ *	DMA channel base address table.
+ */
+unsigned int dma_base_addr[MAX_M68K_DMA_CHANNELS] = {
+#ifdef MCFDMA_BASE0
+	MCF_MBAR + MCFDMA_BASE0,
+#endif
+#ifdef MCFDMA_BASE1
+	MCF_MBAR + MCFDMA_BASE1,
+#endif
+#ifdef MCFDMA_BASE2
+	MCF_MBAR + MCFDMA_BASE2,
+#endif
+#ifdef MCFDMA_BASE3
+	MCF_MBAR + MCFDMA_BASE3,
+#endif
+};
+
+unsigned int dma_device_address[MAX_M68K_DMA_CHANNELS];
+
+/***************************************************************************/
Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/dma_timer.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/dma_timer.c	2009-02-08 00:00:48.000000000 -0500
@@ -0,0 +1,84 @@
+/*
+ * dma_timer.c -- Freescale ColdFire DMA Timer.
+ *
+ * Copyright (C) 2007, Benedikt Spranger
+ * Copyright (C) 2008. 
Sebastian Siewior, Linutronix
+ *
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define DMA_TIMER_0	(0x00)
+#define DMA_TIMER_1	(0x40)
+#define DMA_TIMER_2	(0x80)
+#define DMA_TIMER_3	(0xc0)
+
+#define DTMR0	(MCF_IPSBAR + DMA_TIMER_0 + 0x400)
+#define DTXMR0	(MCF_IPSBAR + DMA_TIMER_0 + 0x402)
+#define DTER0	(MCF_IPSBAR + DMA_TIMER_0 + 0x403)
+#define DTRR0	(MCF_IPSBAR + DMA_TIMER_0 + 0x404)
+#define DTCR0	(MCF_IPSBAR + DMA_TIMER_0 + 0x408)
+#define DTCN0	(MCF_IPSBAR + DMA_TIMER_0 + 0x40c)
+
+#define DMA_FREQ	((MCF_CLK / 2) / 16)
+
+/* DTMR */
+#define DMA_DTMR_RESTART	(1 << 3)
+#define DMA_DTMR_CLK_DIV_1	(1 << 1)
+#define DMA_DTMR_CLK_DIV_16	(2 << 1)
+#define DMA_DTMR_ENABLE		(1 << 0)
+
+static cycle_t cf_dt_get_cycles(void)
+{
+	return __raw_readl(DTCN0);
+}
+
+static struct clocksource clocksource_cf_dt = {
+	.name		= "coldfire_dma_timer",
+	.rating		= 200,
+	.read		= cf_dt_get_cycles,
+	.mask		= CLOCKSOURCE_MASK(32),
+	.shift		= 20,
+	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static int __init init_cf_dt_clocksource(void)
+{
+	/*
+	 * We setup DMA timer 0 in free run mode. This incrementing counter is
+	 * used as a highly precise clock source. With MCF_CLOCK = 150 MHz we
+	 * get a ~213 ns resolution and the 32bit register will overflow almost
+	 * every 15 minutes.
+	 */
+	__raw_writeb(0x00, DTXMR0);
+	__raw_writeb(0x00, DTER0);
+	__raw_writel(0x00000000, DTRR0);
+	__raw_writew(DMA_DTMR_CLK_DIV_16 | DMA_DTMR_ENABLE, DTMR0);
+	clocksource_cf_dt.mult = clocksource_hz2mult(DMA_FREQ,
+						     clocksource_cf_dt.shift);
+	return clocksource_register(&clocksource_cf_dt);
+}
+
+arch_initcall(init_cf_dt_clocksource);
+
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen in tsc / x86 */
+#define CYC2NS_SCALE	((1000000 << CYC2NS_SCALE_FACTOR) / (DMA_FREQ / 1000))
+
+static unsigned long long cycles2ns(unsigned long cycl)
+{
+	return (unsigned long long) ((unsigned long long)cycl * CYC2NS_SCALE)
+		>> CYC2NS_SCALE_FACTOR;
+}
+
+unsigned long long sched_clock(void)
+{
+	unsigned long cycl = __raw_readl(DTCN0);
+
+	return cycles2ns(cycl);
+}
Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/entry.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/entry.S	2009-02-08 00:02:07.000000000 -0500
@@ -0,0 +1,274 @@
+/*
+ *  linux/arch/m68knommu/platform/5307/entry.S
+ *
+ *  Copyright (C) 1999-2007, Greg Ungerer (gerg@snapgear.com)
+ *  Copyright (C) 1998  D. Jeff Dionne ,
+ *                      Kenneth Albanowski ,
+ *  Copyright (C) 2000  Lineo Inc. (www.lineo.com)
+ *  Copyright (C) 2004-2006  Macq Electronique SA. (www.macqel.com)
+ *
+ * Based on:
+ *
+ * linux/arch/m68k/kernel/entry.S
+ *
+ * Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file README.legal in the main directory of this archive
+ * for more details.
+ *
+ * Linux/m68k support by Hamish Macdonald
+ *
+ * 68060 fixes by Jesper Skov
+ * ColdFire support by Greg Ungerer (gerg@snapgear.com)
+ * 5307 fixes by David W. 
Miller + * linux 2.4 support David McCullough + * Bug, speed and maintainability fixes by Philippe De Muyter + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +.bss + +sw_ksp: +.long 0 + +sw_usp: +.long 0 + +.text + +.globl system_call +.globl resume +.globl ret_from_exception +.globl ret_from_signal +.globl sys_call_table +.globl ret_from_interrupt +.globl inthandler +.globl fasthandler + +#ifdef CONFIG_FTRACE +ENTRY(_mcount) + linkw %fp, #0 + + moveal ftrace_trace_function, %a0 + movel #ftrace_stub, %d0 + cmpl %a0@, %d0 + + bnew do_mcount + + unlk %fp + rts + +do_mcount: + + movel %fp, %d0 + moveal %d0, %a1 + + moveal %a1@, %a0 + movel %a0@(4), %sp@- /* push parent ip */ + movel %a1@(4), %sp@- /* push ip */ + + moveal ftrace_trace_function, %a0 + jsr %a0@ + + unlk %fp + +.globl ftrace_stub +ftrace_stub: + rts +END(mcount) +#endif + +enosys: + mov.l #sys_ni_syscall,%d3 + bra 1f + +ENTRY(system_call) + SAVE_ALL + move #0x2000,%sr /* enable intrs again */ + + cmpl #NR_syscalls,%d0 + jcc enosys + lea sys_call_table,%a0 + lsll #2,%d0 /* movel %a0@(%d0:l:4),%d3 */ + movel %a0@(%d0),%d3 + jeq enosys + +1: + movel %sp,%d2 /* get thread_info pointer */ + andl #-THREAD_SIZE,%d2 /* at start of kernel stack */ + movel %d2,%a0 + movel %a0@,%a1 /* save top of frame */ + movel %sp,%a1@(TASK_THREAD+THREAD_ESP0) + btst #(TIF_SYSCALL_TRACE%8),%a0@(TI_FLAGS+(31-TIF_SYSCALL_TRACE)/8) + bnes 1f + + movel %d3,%a0 + jbsr %a0@ + movel %d0,%sp@(PT_D0) /* save the return value */ + jra ret_from_exception +1: + movel #-ENOSYS,%d2 /* strace needs -ENOSYS in PT_D0 */ + movel %d2,PT_D0(%sp) /* on syscall entry */ + subql #4,%sp + SAVE_SWITCH_STACK + jbsr syscall_trace + RESTORE_SWITCH_STACK + addql #4,%sp + movel %d3,%a0 + jbsr %a0@ + movel %d0,%sp@(PT_D0) /* save the return value */ + subql #4,%sp /* dummy return address */ + SAVE_SWITCH_STACK + jbsr syscall_trace + +ret_from_signal: + RESTORE_SWITCH_STACK + addql #4,%sp + +ret_from_exception: + move #0x2700,%sr /* disable intrs */ + btst #5,%sp@(PT_SR) /* check if returning to kernel */ + jeq Luser_return /* if so, skip resched, signals */ + +#ifdef CONFIG_PREEMPT + movel %sp,%d1 /* get thread_info pointer */ + andl #-THREAD_SIZE,%d1 /* at base of kernel stack */ + movel %d1,%a0 + movel %a0@(TI_FLAGS),%d1 /* get thread_info->flags */ + andl #_TIF_RESCHED_MASK,%d1 + jeq Lkernel_return + + movel %a0@(TI_PREEMPTCOUNT),%d1 + cmpl #0,%d1 + jne Lkernel_return + + pea Lkernel_return + jmp preempt_schedule_irq /* preempt the kernel */ +#endif + +Lkernel_return: + moveml %sp@,%d1-%d5/%a0-%a2 + lea %sp@(32),%sp /* space for 8 regs */ + movel %sp@+,%d0 + addql #4,%sp /* orig d0 */ + addl %sp@+,%sp /* stk adj */ + rte + +Luser_return: + movel %sp,%d1 /* get thread_info pointer */ + andl #-THREAD_SIZE,%d1 /* at base of kernel stack */ + movel %d1,%a0 + movel %a0@(TI_FLAGS),%d1 /* get thread_info->flags */ + andl #_TIF_WORK_MASK,%d1 + jne Lwork_to_do /* still work to do */ + +Lreturn: + move #0x2700,%sr /* disable intrs */ + movel sw_usp,%a0 /* get usp */ + movel %sp@(PT_PC),%a0@- /* copy exception program counter */ + movel %sp@(PT_FORMATVEC),%a0@-/* copy exception format/vector/sr */ + moveml %sp@,%d1-%d5/%a0-%a2 + lea %sp@(32),%sp /* space for 8 regs */ + movel %sp@+,%d0 + addql #4,%sp /* orig d0 */ + addl %sp@+,%sp /* stk adj */ + addql #8,%sp /* remove exception */ + movel %sp,sw_ksp /* save ksp */ + subql #8,sw_usp /* set exception */ + movel sw_usp,%sp /* restore usp */ + rte + +Lwork_to_do: + movel %a0@(TI_FLAGS),%d1 /* 
get thread_info->flags */
+	move	#0x2000,%sr		/* enable intrs again */
+	andl	#_TIF_RESCHED_MASK, %d1
+	jne	reschedule
+
+	/* GERG: do we need something here for TRACEing?? */
+
+Lsignal_return:
+	subql	#4,%sp			/* dummy return address */
+	SAVE_SWITCH_STACK
+	pea	%sp@(SWITCH_STACK_SIZE)
+	clrl	%sp@-
+	jsr	do_signal
+	addql	#8,%sp
+	RESTORE_SWITCH_STACK
+	addql	#4,%sp
+	jmp	Lreturn
+
+/*
+ * This is the generic interrupt handler (for all hardware interrupt
+ * sources). Calls up to high level code to do all the work.
+ */
+ENTRY(inthandler)
+	SAVE_ALL
+	moveq	#-1,%d0
+	movel	%d0,%sp@(PT_ORIG_D0)
+
+	movew	%sp@(PT_FORMATVEC),%d0	/* put exception # in d0 */
+	andl	#0x03fc,%d0		/* mask out vector only */
+
+	movel	%sp,%sp@-		/* push regs arg */
+	lsrl	#2,%d0			/* calculate real vector # */
+	movel	%d0,%sp@-		/* push vector number */
+	jbsr	do_IRQ			/* call high level irq handler */
+	lea	%sp@(8),%sp		/* pop args off stack */
+
+	bra	ret_from_interrupt	/* this was fallthrough */
+
+/*
+ * This is the fast interrupt handler (for certain hardware interrupt
+ * sources). Unlike the normal interrupt handler it just uses the
+ * current stack (doesn't care if it is user or kernel). It also
+ * doesn't bother doing the bottom half handlers.
+ */
+ENTRY(fasthandler)
+	SAVE_LOCAL
+
+	movew	%sp@(PT_FORMATVEC),%d0
+	andl	#0x03fc,%d0		/* mask out vector only */
+
+	movel	%sp,%sp@-		/* push regs arg */
+	lsrl	#2,%d0			/* calculate real vector # */
+	movel	%d0,%sp@-		/* push vector number */
+	jbsr	do_IRQ			/* call high level irq handler */
+	lea	%sp@(8),%sp		/* pop args off stack */
+
+	RESTORE_LOCAL
+
+ENTRY(ret_from_interrupt)
+	/* the fasthandler is confusing me, haven't seen any user */
+	jmp	ret_from_exception
+
+/*
+ * Beware - when entering resume, prev (the current task) is
+ * in a0, next (the new task) is in a1,so don't change these
+ * registers until their contents are no longer needed.
+ * This is always called in supervisor mode, so don't bother to save
+ * and restore sr; user's process sr is actually in the stack.
+ */
+ENTRY(resume)
+	movel	%a0, %d1			/* get prev thread in d1 */
+
+	movel	sw_usp,%d0			/* save usp */
+	movel	%d0,%a0@(TASK_THREAD+THREAD_USP)
+
+	SAVE_SWITCH_STACK
+	movel	%sp,%a0@(TASK_THREAD+THREAD_KSP) /* save kernel stack pointer */
+	movel	%a1@(TASK_THREAD+THREAD_KSP),%sp /* restore new thread stack */
+	RESTORE_SWITCH_STACK
+
+	movel	%a1@(TASK_THREAD+THREAD_USP),%a0 /* restore thread user stack */
+	movel	%a0, sw_usp
+	rts
Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/head.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/head.S	2009-02-08 00:00:48.000000000 -0500
@@ -0,0 +1,222 @@
+/*****************************************************************************/
+
+/*
+ *  head.S -- common startup code for ColdFire CPUs.
+ *
+ *  (C) Copyright 1999-2006, Greg Ungerer .
+ */
+
+/*****************************************************************************/
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*****************************************************************************/
+
+/*
+ * If we don't have a fixed memory size, then let's build in code
+ * to auto detect the DRAM size. Obviously this is the preferred
+ * method, and should work for most boards. It won't work for those
+ * that do not have their RAM starting at address 0, and it only
+ * works on SDRAM (not boards fitted with SRAM).
+ */
+#if CONFIG_RAMSIZE != 0
+.macro GET_MEM_SIZE
+	movel	#CONFIG_RAMSIZE,%d0	/* hard coded memory size */
+.endm
+
+#elif defined(CONFIG_M5206) || defined(CONFIG_M5206e) || \
+      defined(CONFIG_M5249) || defined(CONFIG_M527x) || \
+      defined(CONFIG_M528x) || defined(CONFIG_M5307) || \
+      defined(CONFIG_M5407)
+/*
+ * Not all these devices have exactly the same DRAM controller,
+ * but the DCMR register is virtually identical - give or take
+ * a couple of bits. The only exception is the 5272 devices, their
+ * DRAM controller is quite different.
+ */
+.macro GET_MEM_SIZE
+	movel	MCF_MBAR+MCFSIM_DMR0,%d0 /* get mask for 1st bank */
+	btst	#0,%d0			/* check if region enabled */
+	beq	1f
+	andl	#0xfffc0000,%d0
+	beq	1f
+	addl	#0x00040000,%d0		/* convert mask to size */
+1:
+	movel	MCF_MBAR+MCFSIM_DMR1,%d1 /* get mask for 2nd bank */
+	btst	#0,%d1			/* check if region enabled */
+	beq	2f
+	andl	#0xfffc0000, %d1
+	beq	2f
+	addl	#0x00040000,%d1
+	addl	%d1,%d0			/* total mem size in d0 */
+2:
+.endm
+
+#elif defined(CONFIG_M5272)
+.macro GET_MEM_SIZE
+	movel	MCF_MBAR+MCFSIM_CSOR7,%d0 /* get SDRAM address mask */
+	andil	#0xfffff000,%d0		/* mask out chip select options */
+	negl	%d0			/* negate bits */
+.endm
+
+#elif defined(CONFIG_M520x)
+.macro GET_MEM_SIZE
+	clrl	%d0
+	movel	MCF_MBAR+MCFSIM_SDCS0, %d2 /* Get SDRAM chip select 0 config */
+	andl	#0x1f, %d2		/* Get only the chip select size */
+	beq	3f			/* Check if it is enabled */
+	addql	#1, %d2			/* Form exponent */
+	moveql	#1, %d0
+	lsll	%d2, %d0		/* 2 ^ exponent */
+3:
+	movel	MCF_MBAR+MCFSIM_SDCS1, %d2 /* Get SDRAM chip select 1 config */
+	andl	#0x1f, %d2		/* Get only the chip select size */
+	beq	4f			/* Check if it is enabled */
+	addql	#1, %d2			/* Form exponent */
+	moveql	#1, %d1
+	lsll	%d2, %d1		/* 2 ^ exponent */
+	addl	%d1, %d0		/* Total size of SDRAM in d0 */
+4:
+.endm
+
+#else
+#error "ERROR: I don't know how to probe your board's memory size?"
+#endif
+
+/*****************************************************************************/
+
+/*
+ * Boards and platforms can do specific early hardware setup if
+ * they need to. Most don't need this, define away if not required.
+ */
+#ifndef PLATFORM_SETUP
+#define PLATFORM_SETUP
+#endif
+
+/*****************************************************************************/
+
+.global	_start
+.global _rambase
+.global _ramvec
+.global	_ramstart
+.global	_ramend
+
+/*****************************************************************************/
+
+.data
+
+/*
+ * During startup we store away the RAM setup. These are not in the
+ * bss, since their values are determined and written before the bss
+ * has been cleared.
+ */
+_rambase:
+.long	0
+_ramvec:
+.long	0
+_ramstart:
+.long	0
+_ramend:
+.long	0
+
+/*****************************************************************************/
+
+.text
+
+/*
+ * This is the code's first entry point. This is where it all
+ * begins...
+ */
+
+_start:
+	nop					/* filler */
+	movew	#0x2700, %sr			/* no interrupts */
+
+	/*
+	 * Do any platform or board specific setup now. Most boards
+	 * don't need anything. Those exceptions define this in
+	 * their board specific includes.
+	 */
+	PLATFORM_SETUP
+
+	/*
+	 * Create basic memory configuration. Set VBR accordingly,
+	 * and size memory.
+ */
+ movel #CONFIG_VECTORBASE,%a7
+ movec %a7,%VBR /* set vectors addr */
+ movel %a7,_ramvec
+
+ movel #CONFIG_RAMBASE,%a7 /* mark the base of RAM */
+ movel %a7,_rambase
+
+ GET_MEM_SIZE /* macro code determines size */
+ addl %a7,%d0
+ movel %d0,_ramend /* set end ram addr */
+
+ /*
+ * Now that we know what the memory is, let's enable the cache
+ * and get things moving. This is ColdFire CPU specific.
+ */
+ CACHE_ENABLE /* enable CPU cache */
+
+
+#ifdef CONFIG_ROMFS_FS
+ /*
+ * Move ROM filesystem above bss :-)
+ */
+ lea _sbss,%a0 /* get start of bss */
+ lea _ebss,%a1 /* set up destination */
+ movel %a0,%a2 /* copy of bss start */
+
+ movel 8(%a0),%d0 /* get size of ROMFS */
+ addql #8,%d0 /* allow for rounding */
+ andl #0xfffffffc, %d0 /* whole words */
+
+ addl %d0,%a0 /* copy from end */
+ addl %d0,%a1 /* copy from end */
+ movel %a1,_ramstart /* set start of ram */
+
+_copy_romfs:
+ movel -(%a0),%d0 /* copy dword */
+ movel %d0,-(%a1)
+ cmpl %a0,%a2 /* check if at end */
+ bne _copy_romfs
+
+#else /* CONFIG_ROMFS_FS */
+ lea _ebss,%a1
+ movel %a1,_ramstart
+#endif /* CONFIG_ROMFS_FS */
+
+
+ /*
+ * Zero out the bss region.
+ */
+ lea _sbss,%a0 /* get start of bss */
+ lea _ebss,%a1 /* get end of bss */
+ clrl %d0 /* set value */
+_clear_bss:
+ movel %d0,(%a0)+ /* clear each word */
+ cmpl %a0,%a1 /* check if at end */
+ bne _clear_bss
+
+ /*
+ * Load the current task pointer and stack.
+ */
+ lea init_thread_union,%a0
+ lea THREAD_SIZE(%a0),%sp
+
+ /*
+ * Assembler startup done, start code proper.
+ */
+ jsr start_kernel /* start Linux kernel */
+
+_exit:
+ jmp _exit /* should never get here */
+
+/*****************************************************************************/
Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/irq_chip.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/irq_chip.c 2009-02-08 00:00:48.000000000 -0500
@@ -0,0 +1,110 @@
+/*
+ * IRQ-Chip implementation for ColdFire
+ *
+ * Author: Sebastian Siewior 
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+static inline void *coldfire_irqnum_to_mem(unsigned int irq)
+{
+ u32 imrp;
+
+ imrp = MCF_IPSBAR;
+#if defined(MCFINT_INTC1_VECBASE)
+ if (irq > MCFINT_INTC1_VECBASE) {
+ imrp += MCFICM_INTC1;
+ irq -= MCFINT_PER_INTC;
+ } else
+#endif
+ imrp += MCFICM_INTC0;
+
+ irq -= MCFINT_VECBASE;
+
+ if (irq > 32)
+ imrp += MCFINTC_IMRH;
+ else
+ imrp += MCFINTC_IMRL;
+
+ return (void *)imrp;
+}
+
+static inline unsigned int coldfire_irqnum_to_bit(unsigned int irq)
+{
+ irq -= MCFINT_VECBASE;
+
+ if (irq > 32)
+ irq -= 32;
+
+ return irq;
+}
+
+static void coldfire_mask(unsigned int irq)
+{
+ volatile unsigned long *imrp;
+ u32 mask;
+ u32 irq_bit;
+
+ imrp = coldfire_irqnum_to_mem(irq);
+ irq_bit = coldfire_irqnum_to_bit(irq);
+
+ mask = 1 << irq_bit;
+ *imrp |= mask;
+}
+
+static void coldfire_unmask(unsigned int irq)
+{
+ volatile unsigned long *imrp;
+ u32 mask;
+ u32 irq_bit;
+
+ imrp = coldfire_irqnum_to_mem(irq);
+ irq_bit = coldfire_irqnum_to_bit(irq);
+
+ mask = 1 << irq_bit;
+ *imrp &= ~mask;
+}
+
+static void coldfire_nop(unsigned int irq)
+{
+}
+
+static struct irq_chip m_irq_chip = {
+ .name = "M68K-INTC",
+ .ack = coldfire_nop,
+ .mask = coldfire_mask,
+ .unmask = coldfire_unmask,
+};
+
+void __init coldfire_init_irq_chip(void)
+{
+ volatile u32 *imrp;
+ volatile u8 *icrp;
+ u32 irq;
+ u32 i;
+
+ for (irq = 0; irq < NR_IRQS; irq++)
+ set_irq_chip_and_handler_name(irq,
&m_irq_chip, + handle_level_irq, m_irq_chip.name); + + /* setup prios for interrupt sources (first field is reserved) */ + icrp = (u8 *)MCF_IPSBAR + MCFICM_INTC0 + MCFINTC_ICR0; + for (i = 1; i <= 63; i++) + icrp[i] = i; + + /* remove the disable all flag, disable all interrupt sources */ + imrp = coldfire_irqnum_to_mem(MCFINT_VECBASE); + *imrp = 0xfffffffe; + +#if defined(MCFINT_INTC1_VECBASE) + icrp = (u8 *)MCF_IPSBAR + MCFICM_INTC1 + MCFINTC_ICR0; + for (i = 1; i <= 63; i++) + icrp[i] = i; + + imrp = coldfire_irqnum_to_mem(MCFINT_INTC1_VECBASE); + *imrp = 0xfffffffe; +#endif +} Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/pit.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/pit.c 2009-02-08 00:01:57.000000000 -0500 @@ -0,0 +1,180 @@ +/***************************************************************************/ + +/* + * pit.c -- Freescale ColdFire PIT timer. Currently this type of + * hardware timer only exists in the Freescale ColdFire + * 5270/5271, 5282 and 5208 CPUs. No doubt newer ColdFire + * family members will probably use it too. + * + * Copyright (C) 1999-2008, Greg Ungerer (gerg@snapgear.com) + * Copyright (C) 2001-2004, SnapGear Inc. (www.snapgear.com) + */ + +/***************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/***************************************************************************/ + +/* + * By default use timer1 as the system clock timer. + */ +#define FREQ ((MCF_CLK / 2) / 64) +#define TA(a) (MCF_IPSBAR + MCFPIT_BASE1 + (a)) +#define INTC0 (MCF_IPSBAR + MCFICM_INTC0) +#define PIT_CYCLES_PER_JIFFY (FREQ / HZ) + +static u32 pit_cnt; + +/* + * Initialize the PIT timer. + * + * This is also called after resume to bring the PIT into operation again. 
+ */ + +static void init_cf_pit_timer(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + + __raw_writew(MCFPIT_PCSR_DISABLE, TA(MCFPIT_PCSR)); + __raw_writew(PIT_CYCLES_PER_JIFFY, TA(MCFPIT_PMR)); + __raw_writew(MCFPIT_PCSR_EN | MCFPIT_PCSR_PIE | \ + MCFPIT_PCSR_OVW | MCFPIT_PCSR_RLD | \ + MCFPIT_PCSR_CLK64, TA(MCFPIT_PCSR)); + break; + + case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_UNUSED: + + __raw_writew(MCFPIT_PCSR_DISABLE, TA(MCFPIT_PCSR)); + break; + + case CLOCK_EVT_MODE_ONESHOT: + + __raw_writew(MCFPIT_PCSR_DISABLE, TA(MCFPIT_PCSR)); + __raw_writew(MCFPIT_PCSR_EN | MCFPIT_PCSR_PIE | \ + MCFPIT_PCSR_OVW | MCFPIT_PCSR_CLK64, \ + TA(MCFPIT_PCSR)); + break; + + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ + break; + } +} + +/* + * Program the next event in oneshot mode + * + * Delta is given in PIT ticks + */ +static int cf_pit_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + __raw_writew(delta, TA(MCFPIT_PMR)); + return 0; +} + +struct clock_event_device cf_pit_clockevent = { + .name = "pit", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = init_cf_pit_timer, + .set_next_event = cf_pit_next_event, + .shift = 32, + .irq = MCFINT_VECBASE + MCFINT_PIT1, +}; + + + +/***************************************************************************/ + +static irqreturn_t pit_tick(int irq, void *dummy) +{ + struct clock_event_device *evt = &cf_pit_clockevent; + u16 pcsr; + + /* Reset the ColdFire timer */ + pcsr = __raw_readw(TA(MCFPIT_PCSR)); + __raw_writew(pcsr | MCFPIT_PCSR_PIF, TA(MCFPIT_PCSR)); + + pit_cnt += PIT_CYCLES_PER_JIFFY; + evt->event_handler(evt); + return IRQ_HANDLED; +} + +/***************************************************************************/ + +static struct irqaction pit_irq = { + .name = "timer", + .flags = IRQF_DISABLED | IRQF_TIMER | IRQF_NODELAY, + .handler = pit_tick, +}; + +/***************************************************************************/ + +static cycle_t pit_read_clk(void) +{ + unsigned long flags; + u32 cycles; + u16 pcntr; + + local_irq_save(flags); + pcntr = __raw_readw(TA(MCFPIT_PCNTR)); + cycles = pit_cnt; + local_irq_restore(flags); + + return cycles + PIT_CYCLES_PER_JIFFY - pcntr; +} + +/***************************************************************************/ + +static struct clocksource pit_clk = { + .name = "pit", + .rating = 100, + .read = pit_read_clk, + .shift = 20, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +/***************************************************************************/ + +void hw_timer_init(void) +{ + u32 imr; + + cf_pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); + cf_pit_clockevent.mult = div_sc(FREQ, NSEC_PER_SEC, 32); + cf_pit_clockevent.max_delta_ns = + clockevent_delta2ns(0xFFFF, &cf_pit_clockevent); + cf_pit_clockevent.min_delta_ns = + clockevent_delta2ns(0x3f, &cf_pit_clockevent); + clockevents_register_device(&cf_pit_clockevent); + + setup_irq(MCFINT_VECBASE + MCFINT_PIT1, &pit_irq); + +#if !defined(CONFIG_M523x) + __raw_writeb(ICR_INTRCONF, INTC0 + MCFINTC_ICR0 + MCFINT_PIT1); + imr = __raw_readl(INTC0 + MCFPIT_IMR); + imr &= ~MCFPIT_IMR_IBIT; + __raw_writel(imr, INTC0 + MCFPIT_IMR); + +#endif + pit_clk.mult = clocksource_hz2mult(FREQ, pit_clk.shift); + clocksource_register(&pit_clk); +} + +/***************************************************************************/ Index: 
linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/timers.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/timers.c 2009-02-08 00:00:48.000000000 -0500
@@ -0,0 +1,182 @@
+/***************************************************************************/
+
+/*
+ * timers.c -- generic ColdFire hardware timer support.
+ *
+ * Copyright (C) 1999-2008, Greg Ungerer 
+ */
+
+/***************************************************************************/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/***************************************************************************/
+
+/*
+ * By default use timer1 as the system clock timer.
+ */
+#define FREQ (MCF_BUSCLK / 16)
+#define TA(a) (MCF_MBAR + MCFTIMER_BASE1 + (a))
+
+/*
+ * Default the timer and vector to use for ColdFire. Some ColdFire
+ * CPUs and some boards may want something different. Their
+ * sub-architecture startup code (in config.c) can change these
+ * if they want.
+ */
+unsigned int mcf_timervector = 29;
+unsigned int mcf_profilevector = 31;
+unsigned int mcf_timerlevel = 5;
+
+/*
+ * These provide the underlying interrupt vector support.
+ * Unfortunately it is a little different on each ColdFire.
+ */
+extern void mcf_settimericr(int timer, int level);
+void coldfire_profile_init(void);
+
+#if defined(CONFIG_M532x)
+#define __raw_readtrr __raw_readl
+#define __raw_writetrr __raw_writel
+#else
+#define __raw_readtrr __raw_readw
+#define __raw_writetrr __raw_writew
+#endif
+
+static u32 mcftmr_cycles_per_jiffy;
+static u32 mcftmr_cnt;
+
+/***************************************************************************/
+
+static irqreturn_t mcftmr_tick(int irq, void *dummy)
+{
+ /* Reset the ColdFire timer */
+ __raw_writeb(MCFTIMER_TER_CAP | MCFTIMER_TER_REF, TA(MCFTIMER_TER));
+
+ mcftmr_cnt += mcftmr_cycles_per_jiffy;
+ return arch_timer_interrupt(irq, dummy);
+}
+
+/***************************************************************************/
+
+static struct irqaction mcftmr_timer_irq = {
+ .name = "timer",
+ .flags = IRQF_DISABLED | IRQF_TIMER,
+ .handler = mcftmr_tick,
+};
+
+/***************************************************************************/
+
+static cycle_t mcftmr_read_clk(void)
+{
+ unsigned long flags;
+ u32 cycles;
+ u16 tcn;
+
+ local_irq_save(flags);
+ tcn = __raw_readw(TA(MCFTIMER_TCN));
+ cycles = mcftmr_cnt;
+ local_irq_restore(flags);
+
+ return cycles + tcn;
+}
+
+/***************************************************************************/
+
+static struct clocksource mcftmr_clk = {
+ .name = "tmr",
+ .rating = 250,
+ .read = mcftmr_read_clk,
+ .shift = 20,
+ .mask = CLOCKSOURCE_MASK(32),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+/***************************************************************************/
+
+void hw_timer_init(void)
+{
+ setup_irq(mcf_timervector, &mcftmr_timer_irq);
+
+ __raw_writew(MCFTIMER_TMR_DISABLE, TA(MCFTIMER_TMR));
+ mcftmr_cycles_per_jiffy = FREQ / HZ;
+ __raw_writetrr(mcftmr_cycles_per_jiffy, TA(MCFTIMER_TRR));
+ __raw_writew(MCFTIMER_TMR_ENORI | MCFTIMER_TMR_CLK16 |
+ MCFTIMER_TMR_RESTART | MCFTIMER_TMR_ENABLE, TA(MCFTIMER_TMR));
+
+ mcftmr_clk.mult = clocksource_hz2mult(FREQ, mcftmr_clk.shift);
+ clocksource_register(&mcftmr_clk);
+
+ mcf_settimericr(1, mcf_timerlevel);
+
+#ifdef CONFIG_HIGHPROFILE
+ coldfire_profile_init();
+#endif
+}
+
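(A side note on the mult/shift pair registered above: the clocksource core converts raw timer ticks to nanoseconds as ns = (cycles * mult) >> shift, and clocksource_hz2mult() picks mult to make that come out right. Below is a minimal userspace sketch of that fixed-point arithmetic; the 75 MHz bus clock is an assumed example value, not taken from this patch.)

/* Sketch of the cycles-to-nanoseconds scaling behind mcftmr_clk.
 * hz2mult() mirrors the rounding of the kernel's clocksource_hz2mult():
 * mult = ((NSEC_PER_SEC << shift) + hz / 2) / hz. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

static uint32_t hz2mult(uint32_t hz, uint32_t shift)
{
	uint64_t tmp = (NSEC_PER_SEC << shift) + hz / 2;

	return (uint32_t)(tmp / hz);
}

int main(void)
{
	uint32_t freq = 75000000 / 16;	/* assumed MCF_BUSCLK of 75 MHz */
	uint32_t shift = 20;		/* same shift as mcftmr_clk above */
	uint32_t mult = hz2mult(freq, shift);
	uint64_t cycles = freq;		/* one second worth of timer ticks */

	/* Prints ~1000000000 ns, within fixed-point rounding error. */
	printf("%llu ns\n", (unsigned long long)((cycles * mult) >> shift));
	return 0;
}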
+/***************************************************************************/
+#ifdef CONFIG_HIGHPROFILE
+/***************************************************************************/
+
+/*
+ * By default use timer2 as the profiler clock timer.
+ */
+#define PA(a) (MCF_MBAR + MCFTIMER_BASE2 + (a))
+
+/*
+ * Choose a reasonably fast profile timer. Make it an odd value to
+ * try and get good coverage of kernel operations.
+ */
+#define PROFILEHZ 1013
+
+/*
+ * Use the other timer to provide high accuracy profiling info.
+ */
+irqreturn_t coldfire_profile_tick(int irq, void *dummy)
+{
+ /* Reset ColdFire timer2 */
+ __raw_writeb(MCFTIMER_TER_CAP | MCFTIMER_TER_REF, PA(MCFTIMER_TER));
+ if (current->pid)
+ profile_tick(CPU_PROFILING);
+ return IRQ_HANDLED;
+}
+
+/***************************************************************************/
+
+static struct irqaction coldfire_profile_irq = {
+ .name = "profile timer",
+ .flags = IRQF_DISABLED | IRQF_TIMER,
+ .handler = coldfire_profile_tick,
+};
+
+void coldfire_profile_init(void)
+{
+ printk(KERN_INFO "PROFILE: lodging TIMER2 @ %dHz as profile timer\n",
+ PROFILEHZ);
+
+ setup_irq(mcf_profilevector, &coldfire_profile_irq);
+
+ /* Set up TIMER 2 as high speed profile clock */
+ __raw_writew(MCFTIMER_TMR_DISABLE, PA(MCFTIMER_TMR));
+
+ __raw_writetrr(((MCF_BUSCLK / 16) / PROFILEHZ), PA(MCFTIMER_TRR));
+ __raw_writew(MCFTIMER_TMR_ENORI | MCFTIMER_TMR_CLK16 |
+ MCFTIMER_TMR_RESTART | MCFTIMER_TMR_ENABLE, PA(MCFTIMER_TMR));
+
+ mcf_settimericr(2, 7);
+}
+
+/***************************************************************************/
+#endif /* CONFIG_HIGHPROFILE */
+/***************************************************************************/
Index: linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/vectors.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.24.7-rt27/arch/m68knommu/platform/coldfire/vectors.c 2009-02-08 00:00:48.000000000 -0500
@@ -0,0 +1,105 @@
+/***************************************************************************/
+
+/*
+ * linux/arch/m68knommu/platform/coldfire/vectors.c
+ *
+ * Copyright (C) 1999-2007, Greg Ungerer 
+ */
+
+/***************************************************************************/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/***************************************************************************/
+
+#ifdef TRAP_DBG_INTERRUPT
+
+asmlinkage void dbginterrupt_c(struct frame *fp)
+{
+ extern void dump(struct pt_regs *fp);
+ printk(KERN_DEBUG "%s(%d): BUS ERROR TRAP\n", __FILE__, __LINE__);
+ dump((struct pt_regs *) fp);
+ asm("halt");
+}
+
+#endif
+
+/***************************************************************************/
+
+extern e_vector *_ramvec;
+
+void set_evector(int vecnum, void (*handler)(void))
+{
+ if (vecnum >= 0 && vecnum <= 255)
+ _ramvec[vecnum] = handler;
+}
+
+/***************************************************************************/
+
+/* Assembler routines */
+asmlinkage void buserr(void);
+asmlinkage void trap(void);
+asmlinkage void system_call(void);
+asmlinkage void inthandler(void);
+
+void __init init_vectors(void)
+{
+ int i;
+
+ /*
+ * There is a common trap handler and common interrupt
+ * handler that handle almost every vector. We treat
+ * the system call and bus error specially; they get their
+ * own first-level handlers.
+ */ + for (i = 3; (i <= 23); i++) + _ramvec[i] = trap; + for (i = 33; (i <= 63); i++) + _ramvec[i] = trap; + for (i = 24; (i <= 31); i++) + _ramvec[i] = inthandler; + for (i = 64; (i < 255); i++) + _ramvec[i] = inthandler; + _ramvec[255] = 0; + + _ramvec[2] = buserr; + _ramvec[32] = system_call; + +#ifdef TRAP_DBG_INTERRUPT + _ramvec[12] = dbginterrupt; +#endif +} + +/***************************************************************************/ + +void enable_vector(unsigned int irq) +{ + /* Currently no action on ColdFire */ +} + +void disable_vector(unsigned int irq) +{ + /* Currently no action on ColdFire */ +} + +void ack_vector(unsigned int irq) +{ + /* Currently no action on ColdFire */ +} + +/***************************************************************************/ + +void coldfire_reset(void) +{ + HARD_RESET_NOW(); +} + +/***************************************************************************/ Index: linux-2.6.24.7-rt27/drivers/net/fec.c =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/net/fec.c 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/net/fec.c 2009-02-08 00:00:48.000000000 -0500 @@ -2,12 +2,6 @@ * Fast Ethernet Controller (FEC) driver for Motorola MPC8xx. * Copyright (c) 1997 Dan Malek (dmalek@jlc.net) * - * This version of the driver is specific to the FADS implementation, - * since the board contains control registers external to the processor - * for the control of the LevelOne LXT970 transceiver. The MPC860T manual - * describes connections using the internal parallel port I/O, which - * is basically all of Port D. - * * Right now, I am very wasteful with the buffers. I allocate memory * pages and then divide them into 2K frame buffers. This way I know I * have buffers large enough to hold one frame within one buffer descriptor. @@ -49,17 +43,9 @@ #include #include -#if defined(CONFIG_M523x) || defined(CONFIG_M527x) || \ - defined(CONFIG_M5272) || defined(CONFIG_M528x) || \ - defined(CONFIG_M520x) || defined(CONFIG_M532x) #include #include #include "fec.h" -#else -#include -#include -#include "commproc.h" -#endif #if defined(CONFIG_FEC2) #define FEC_MAX_PORTS 2 @@ -67,6 +53,7 @@ #define FEC_MAX_PORTS 1 #endif + /* * Define the fixed address of the FEC hardware. 
*/ @@ -79,15 +66,15 @@ static unsigned int fec_hw[] = { #elif defined(CONFIG_M523x) || defined(CONFIG_M528x) (MCF_MBAR + 0x1000), #elif defined(CONFIG_M520x) - (MCF_MBAR+0x30000), + (MCF_MBAR + 0x30000), #elif defined(CONFIG_M532x) - (MCF_MBAR+0xfc030000), + (MCF_MBAR + 0xfc030000), #else - &(((immap_t *)IMAP_ADDR)->im_cpm.cp_fec), + &(((immap_t *) IMAP_ADDR)->im_cpm.cp_fec), #endif }; -static unsigned char fec_mac_default[] = { +static unsigned char fec_mac_default[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; @@ -101,20 +88,20 @@ static unsigned char fec_mac_default[] = #define FEC_FLASHMAC 0xf0006000 #elif defined(CONFIG_CANCam) #define FEC_FLASHMAC 0xf0020000 -#elif defined (CONFIG_M5272C3) +#elif defined(CONFIG_M5272C3) #define FEC_FLASHMAC (0xffe04000 + 4) #elif defined(CONFIG_MOD5272) -#define FEC_FLASHMAC 0xffc0406b +#define FEC_FLASHMAC 0xffc0406b #else #define FEC_FLASHMAC 0 #endif /* Forward declarations of some structures to support different PHYs */ - +typedef void (mii_func)(uint val, struct net_device *dev); typedef struct { uint mii_data; - void (*funct)(uint mii_reg, struct net_device *dev); + mii_func *funct; } phy_cmd_t; typedef struct { @@ -165,7 +152,6 @@ typedef struct { #define PKT_MINBUF_SIZE 64 #define PKT_MAXBLR_SIZE 1520 - /* * The 5270/5271/5280/5282/532x RX control register also contains maximum frame * size bits. Other FEC hardware does not, so we need to take that into @@ -188,75 +174,67 @@ typedef struct { */ struct fec_enet_private { /* Hardware registers of the FEC device */ - volatile fec_t *hwp; + volatile fec_t *hwp; struct net_device *netdev; /* The saved address of a sent-in-place packet/buffer, for skfree(). */ unsigned char *tx_bounce[TX_RING_SIZE]; - struct sk_buff* tx_skbuff[TX_RING_SIZE]; - ushort skb_cur; - ushort skb_dirty; + struct sk_buff *tx_skbuff[TX_RING_SIZE]; + ushort skb_cur; + ushort skb_dirty; /* CPM dual port RAM relative addresses. - */ - cbd_t *rx_bd_base; /* Address of Rx and Tx buffers. */ - cbd_t *tx_bd_base; - cbd_t *cur_rx, *cur_tx; /* The next free ring entry */ - cbd_t *dirty_tx; /* The ring entries to be free()ed. */ - uint tx_full; - spinlock_t lock; - - uint phy_id; - uint phy_id_done; - uint phy_status; - uint phy_speed; - phy_info_t const *phy; + */ + cbd_t *rx_bd_base; /* Address of Rx and Tx buffers. */ + cbd_t *tx_bd_base; + cbd_t *cur_rx, *cur_tx; /* The next free ring entry */ + cbd_t *dirty_tx; /* The ring entries to be free()ed. 
*/
+ uint tx_full;
+ /* held while accessing the HW, e.g. the tx/rx ring buffers, but not the MAC */
+ spinlock_t hw_lock;
+ /* held while accessing the mii_list_t elements */
+ spinlock_t mii_lock;
+
+ uint phy_id;
+ uint phy_id_done;
+ uint phy_status;
+ uint phy_speed;
+ phy_info_t const *phy;
 struct work_struct phy_task;
- uint sequence_done;
- uint mii_phy_task_queued;
+ uint sequence_done;
+ uint mii_phy_task_queued;
+
+ uint phy_addr;
 
- uint phy_addr;
+ int index;
+ int opened;
+ int link;
+ int old_link;
+ int full_duplex;
+};
 
- int index;
- int opened;
- int link;
- int old_link;
- int full_duplex;
-};
-
-static int fec_enet_open(struct net_device *dev);
-static int fec_enet_start_xmit(struct sk_buff *skb, struct net_device *dev);
-static void fec_enet_mii(struct net_device *dev);
-static irqreturn_t fec_enet_interrupt(int irq, void * dev_id);
-static void fec_enet_tx(struct net_device *dev);
-static void fec_enet_rx(struct net_device *dev);
-static int fec_enet_close(struct net_device *dev);
-static void set_multicast_list(struct net_device *dev);
 static void fec_restart(struct net_device *dev, int duplex);
 static void fec_stop(struct net_device *dev);
-static void fec_set_mac_address(struct net_device *dev);
 
-
 /* MII processing. We keep this as simple as possible. Requests are
 * placed on the list (if there is room). When the request is finished
 * by the MII, an optional function may be called.
 */
 typedef struct mii_list {
- uint mii_regval;
- void (*mii_func)(uint val, struct net_device *dev);
- struct mii_list *mii_next;
+ uint mii_regval;
+ void (*mii_func)(uint val, struct net_device *dev);
+ struct mii_list *mii_next;
 } mii_list_t;
 
-#define NMII 20
-static mii_list_t mii_cmds[NMII];
-static mii_list_t *mii_free;
-static mii_list_t *mii_head;
-static mii_list_t *mii_tail;
+#define NMII 20
+static mii_list_t mii_cmds[NMII];
+static mii_list_t *mii_free;
+static mii_list_t *mii_head;
+static mii_list_t *mii_tail;
 
-static int mii_queue(struct net_device *dev, int request,
- void (*func)(uint, struct net_device *));
+static int mii_queue(struct net_device *dev, int request, mii_func *funct);
 
 /* Make MII read/write commands for the FEC.
 */
@@ -272,52 +250,52 @@ static int mii_queue(struct net_device *
 
 /* Register definitions for the PHY.
 */
-#define MII_REG_CR 0 /* Control Register */
-#define MII_REG_SR 1 /* Status Register */
-#define MII_REG_PHYIR1 2 /* PHY Identification Register 1 */
-#define MII_REG_PHYIR2 3 /* PHY Identification Register 2 */
-#define MII_REG_ANAR 4 /* A-N Advertisement Register */
-#define MII_REG_ANLPAR 5 /* A-N Link Partner Ability Register */
-#define MII_REG_ANER 6 /* A-N Expansion Register */
-#define MII_REG_ANNPTR 7 /* A-N Next Page Transmit Register */
-#define MII_REG_ANLPRNPR 8 /* A-N Link Partner Received Next Page Reg. */
+#define MII_REG_CR 0 /* Control Register */
+#define MII_REG_SR 1 /* Status Register */
+#define MII_REG_PHYIR1 2 /* PHY Identification Register 1 */
+#define MII_REG_PHYIR2 3 /* PHY Identification Register 2 */
+#define MII_REG_ANAR 4 /* A-N Advertisement Register */
+#define MII_REG_ANLPAR 5 /* A-N Link Partner Ability Register */
+#define MII_REG_ANER 6 /* A-N Expansion Register */
+#define MII_REG_ANNPTR 7 /* A-N Next Page Transmit Register */
+#define MII_REG_ANLPRNPR 8 /* A-N Link Partner Received Next Page Reg.
*/ /* values for phy_status */ -#define PHY_CONF_ANE 0x0001 /* 1 auto-negotiation enabled */ -#define PHY_CONF_LOOP 0x0002 /* 1 loopback mode enabled */ -#define PHY_CONF_SPMASK 0x00f0 /* mask for speed */ -#define PHY_CONF_10HDX 0x0010 /* 10 Mbit half duplex supported */ -#define PHY_CONF_10FDX 0x0020 /* 10 Mbit full duplex supported */ -#define PHY_CONF_100HDX 0x0040 /* 100 Mbit half duplex supported */ -#define PHY_CONF_100FDX 0x0080 /* 100 Mbit full duplex supported */ - -#define PHY_STAT_LINK 0x0100 /* 1 up - 0 down */ -#define PHY_STAT_FAULT 0x0200 /* 1 remote fault */ -#define PHY_STAT_ANC 0x0400 /* 1 auto-negotiation complete */ -#define PHY_STAT_SPMASK 0xf000 /* mask for speed */ -#define PHY_STAT_10HDX 0x1000 /* 10 Mbit half duplex selected */ -#define PHY_STAT_10FDX 0x2000 /* 10 Mbit full duplex selected */ -#define PHY_STAT_100HDX 0x4000 /* 100 Mbit half duplex selected */ -#define PHY_STAT_100FDX 0x8000 /* 100 Mbit full duplex selected */ +#define PHY_CONF_ANE 0x0001 /* 1 auto-negotiation enabled */ +#define PHY_CONF_LOOP 0x0002 /* 1 loopback mode enabled */ +#define PHY_CONF_SPMASK 0x00f0 /* mask for speed */ +#define PHY_CONF_10HDX 0x0010 /* 10 Mbit half duplex supported */ +#define PHY_CONF_10FDX 0x0020 /* 10 Mbit full duplex supported */ +#define PHY_CONF_100HDX 0x0040 /* 100 Mbit half duplex supported */ +#define PHY_CONF_100FDX 0x0080 /* 100 Mbit full duplex supported */ + +#define PHY_STAT_LINK 0x0100 /* 1 up - 0 down */ +#define PHY_STAT_FAULT 0x0200 /* 1 remote fault */ +#define PHY_STAT_ANC 0x0400 /* 1 auto-negotiation complete */ +#define PHY_STAT_SPMASK 0xf000 /* mask for speed */ +#define PHY_STAT_10HDX 0x1000 /* 10 Mbit half duplex selected */ +#define PHY_STAT_10FDX 0x2000 /* 10 Mbit full duplex selected */ +#define PHY_STAT_100HDX 0x4000 /* 100 Mbit half duplex selected */ +#define PHY_STAT_100FDX 0x8000 /* 100 Mbit full duplex selected */ - -static int -fec_enet_start_xmit(struct sk_buff *skb, struct net_device *dev) +static int fec_enet_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct fec_enet_private *fep; - volatile fec_t *fecp; - volatile cbd_t *bdp; - unsigned short status; + volatile fec_t *fecp; + volatile cbd_t *bdp; + unsigned short status; + unsigned long flags; fep = netdev_priv(dev); - fecp = (volatile fec_t*)dev->base_addr; + fecp = (volatile fec_t *)dev->base_addr; if (!fep->link) { /* Link is down or autonegotiation is in progress. */ return 1; } + spin_lock_irqsave(&fep->hw_lock, flags); /* Fill in a Tx ring entry */ bdp = fep->cur_tx; @@ -328,6 +306,7 @@ fec_enet_start_xmit(struct sk_buff *skb, * This should not happen, since dev->tbusy should be set. */ printk("%s: tx queue full!.\n", dev->name); + spin_unlock_irqrestore(&fep->hw_lock, flags); return 1; } #endif @@ -337,28 +316,29 @@ fec_enet_start_xmit(struct sk_buff *skb, status &= ~BD_ENET_TX_STATS; /* Set buffer length and buffer pointer. - */ + */ bdp->cbd_bufaddr = __pa(skb->data); bdp->cbd_datlen = skb->len; /* - * On some FEC implementations data must be aligned on - * 4-byte boundaries. Use bounce buffers to copy data - * and get it aligned. Ugh. + * On some FEC implementations data must be aligned on + * 4-byte boundaries. Use bounce buffers to copy data + * and get it aligned. Ugh. 
*/ if (bdp->cbd_bufaddr & 0x3) { unsigned int index; index = bdp - fep->tx_bd_base; - memcpy(fep->tx_bounce[index], (void *) bdp->cbd_bufaddr, bdp->cbd_datlen); + memcpy(fep->tx_bounce[index], (void *)bdp->cbd_bufaddr, + bdp->cbd_datlen); bdp->cbd_bufaddr = __pa(fep->tx_bounce[index]); } /* Save skb pointer. - */ + */ fep->tx_skbuff[fep->skb_cur] = skb; dev->stats.tx_bytes += skb->len; - fep->skb_cur = (fep->skb_cur+1) & TX_RING_MOD_MASK; + fep->skb_cur = (fep->skb_cur + 1) & TX_RING_MOD_MASK; /* Push the data cache so the CPM does not get stale memory * data. @@ -366,14 +346,13 @@ fec_enet_start_xmit(struct sk_buff *skb, flush_dcache_range((unsigned long)skb->data, (unsigned long)skb->data + skb->len); - spin_lock_irq(&fep->lock); /* Send it on its way. Tell FEC it's ready, interrupt when done, * it's the last BD of the frame, and to put the CRC on the end. */ status |= (BD_ENET_TX_READY | BD_ENET_TX_INTR - | BD_ENET_TX_LAST | BD_ENET_TX_TC); + | BD_ENET_TX_LAST | BD_ENET_TX_TC); bdp->cbd_sc = status; dev->trans_start = jiffies; @@ -382,7 +361,7 @@ fec_enet_start_xmit(struct sk_buff *skb, fecp->fec_x_des_active = 0; /* If this was the last BD in the ring, start at the beginning again. - */ + */ if (status & BD_ENET_TX_WRAP) { bdp = fep->tx_bd_base; } else { @@ -394,15 +373,14 @@ fec_enet_start_xmit(struct sk_buff *skb, netif_stop_queue(dev); } - fep->cur_tx = (cbd_t *)bdp; + fep->cur_tx = (cbd_t *) bdp; - spin_unlock_irq(&fep->lock); + spin_unlock_irqrestore(&fep->hw_lock, flags); return 0; } -static void -fec_timeout(struct net_device *dev) +static void fec_timeout(struct net_device *dev) { struct fec_enet_private *fep = netdev_priv(dev); @@ -410,115 +388,200 @@ fec_timeout(struct net_device *dev) dev->stats.tx_errors++; #ifndef final_version { - int i; - cbd_t *bdp; + int i; + cbd_t *bdp; - printk("Ring data dump: cur_tx %lx%s, dirty_tx %lx cur_rx: %lx\n", - (unsigned long)fep->cur_tx, fep->tx_full ? " (full)" : "", - (unsigned long)fep->dirty_tx, - (unsigned long)fep->cur_rx); + printk + ("Ring data dump: cur_tx %lx%s, dirty_tx %lx cur_rx: %lx\n", + (unsigned long)fep->cur_tx, fep->tx_full ? " (full)" : "", + (unsigned long)fep->dirty_tx, (unsigned long)fep->cur_rx); - bdp = fep->tx_bd_base; - printk(" tx: %u buffers\n", TX_RING_SIZE); - for (i = 0 ; i < TX_RING_SIZE; i++) { - printk(" %08x: %04x %04x %08x\n", - (uint) bdp, - bdp->cbd_sc, - bdp->cbd_datlen, - (int) bdp->cbd_bufaddr); - bdp++; - } + bdp = fep->tx_bd_base; + printk(" tx: %u buffers\n", TX_RING_SIZE); + for (i = 0; i < TX_RING_SIZE; i++) { + printk(" %08x: %04x %04x %08x\n", + (uint) bdp, + bdp->cbd_sc, + bdp->cbd_datlen, (int)bdp->cbd_bufaddr); + bdp++; + } - bdp = fep->rx_bd_base; - printk(" rx: %lu buffers\n", (unsigned long) RX_RING_SIZE); - for (i = 0 ; i < RX_RING_SIZE; i++) { - printk(" %08x: %04x %04x %08x\n", - (uint) bdp, - bdp->cbd_sc, - bdp->cbd_datlen, - (int) bdp->cbd_bufaddr); - bdp++; - } + bdp = fep->rx_bd_base; + printk(" rx: %lu buffers\n", (unsigned long)RX_RING_SIZE); + for (i = 0; i < RX_RING_SIZE; i++) { + printk(" %08x: %04x %04x %08x\n", + (uint) bdp, + bdp->cbd_sc, + bdp->cbd_datlen, (int)bdp->cbd_bufaddr); + bdp++; + } } #endif fec_restart(dev, fep->full_duplex); netif_wake_queue(dev); } -/* The interrupt handler. - * This is called from the MPC core interrupt. +/* During a receive, the cur_rx points to the current incoming buffer. 
+ * When we update through the ring, if the next incoming buffer has + * not been given to the system, we just set the empty indicator, + * effectively tossing the packet. */ -static irqreturn_t -fec_enet_interrupt(int irq, void * dev_id) +static void fec_enet_rx(struct net_device *dev) { - struct net_device *dev = dev_id; - volatile fec_t *fecp; - uint int_events; - int handled = 0; + struct fec_enet_private *fep; + volatile fec_t *fecp; + volatile cbd_t *bdp; + unsigned short status; + struct sk_buff *skb; + ushort pkt_len; + __u8 *data; - fecp = (volatile fec_t*)dev->base_addr; +#ifdef CONFIG_M532x + flush_cache_all(); +#endif - /* Get the interrupt events that caused us to be here. - */ - while ((int_events = fecp->fec_ievent) != 0) { - fecp->fec_ievent = int_events; + fep = netdev_priv(dev); + spin_lock_irq(&fep->hw_lock); + fecp = (volatile fec_t *)dev->base_addr; - /* Handle receive event in its own function. + /* First, grab all of the stats for the incoming packet. + * These get messed up if we get called due to a busy condition. + */ + bdp = fep->cur_rx; + + while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) { + +#ifndef final_version + /* Since we have allocated space to hold a complete frame, + * the last indicator should be set. */ - if (int_events & FEC_ENET_RXF) { - handled = 1; - fec_enet_rx(dev); + if ((status & BD_ENET_RX_LAST) == 0) + printk("FEC ENET: rcv is not +last\n"); +#endif + + if (!fep->opened) + goto rx_processing_done; + + /* Check for errors. */ + if (status & (BD_ENET_RX_LG | BD_ENET_RX_SH | BD_ENET_RX_NO | + BD_ENET_RX_CR | BD_ENET_RX_OV)) { + dev->stats.rx_errors++; + if (status & (BD_ENET_RX_LG | BD_ENET_RX_SH)) { + /* Frame too long or too short. */ + dev->stats.rx_length_errors++; + } + if (status & BD_ENET_RX_NO) /* Frame alignment */ + dev->stats.rx_frame_errors++; + if (status & BD_ENET_RX_CR) /* CRC Error */ + dev->stats.rx_crc_errors++; + if (status & BD_ENET_RX_OV) /* FIFO overrun */ + dev->stats.rx_fifo_errors++; } - /* Transmit OK, or non-fatal error. Update the buffer - descriptors. FEC handles all errors, we just discover - them as part of the transmit process. - */ - if (int_events & FEC_ENET_TXF) { - handled = 1; - fec_enet_tx(dev); + /* Report late collisions as a frame error. + * On this error, the BD is closed, but we don't know what we + * have in the buffer. So, just drop this frame on the floor. + */ + if (status & BD_ENET_RX_CL) { + dev->stats.rx_errors++; + dev->stats.rx_frame_errors++; + goto rx_processing_done; } - if (int_events & FEC_ENET_MII) { - handled = 1; - fec_enet_mii(dev); + /* Process the incoming frame. + */ + dev->stats.rx_packets++; + pkt_len = bdp->cbd_datlen; + dev->stats.rx_bytes += pkt_len; + data = (__u8 *) __va(bdp->cbd_bufaddr); + + /* This does 16 byte alignment, exactly what we need. + * The packet length includes FCS, but we don't want to + * include that when passing upstream as it messes up + * bridging applications. + */ + skb = dev_alloc_skb(pkt_len - 4); + + if (skb == NULL) { + printk("%s: Memory squeeze, dropping packet.\n", + dev->name); + dev->stats.rx_dropped++; + } else { + skb_put(skb, pkt_len - 4); /* Make room */ + skb_copy_to_linear_data(skb, data, pkt_len - 4); + skb->protocol = eth_type_trans(skb, dev); + netif_rx(skb); } +rx_processing_done: - } - return IRQ_RETVAL(handled); -} + /* Clear the status flags for this buffer. + */ + status &= ~BD_ENET_RX_STATS; + /* Mark the buffer empty. 
+ */ + status |= BD_ENET_RX_EMPTY; + bdp->cbd_sc = status; -static void -fec_enet_tx(struct net_device *dev) + /* Update BD pointer to next entry. + */ + if (status & BD_ENET_RX_WRAP) + bdp = fep->rx_bd_base; + else + bdp++; + +#if 1 + /* Doing this here will keep the FEC running while we process + * incoming frames. On a heavily loaded network, we should be + * able to keep up at the expense of system resources. + */ + fecp->fec_r_des_active = 0; +#endif + } /* while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) */ + fep->cur_rx = (cbd_t *) bdp; + +#if 0 + /* Doing this here will allow us to process all frames in the + * ring before the FEC is allowed to put more there. On a heavily + * loaded network, some frames may be lost. Unfortunately, this + * increases the interrupt overhead since we can potentially work + * our way back to the interrupt return only to come right back + * here. + */ + fecp->fec_r_des_active = 0; +#endif + spin_unlock_irq(&fep->hw_lock); +} + +static void fec_enet_tx(struct net_device *dev) { - struct fec_enet_private *fep; - volatile cbd_t *bdp; + struct fec_enet_private *fep; + volatile cbd_t *bdp; unsigned short status; - struct sk_buff *skb; + struct sk_buff *skb; fep = netdev_priv(dev); - spin_lock(&fep->lock); + spin_lock_irq(&fep->hw_lock); bdp = fep->dirty_tx; while (((status = bdp->cbd_sc) & BD_ENET_TX_READY) == 0) { - if (bdp == fep->cur_tx && fep->tx_full == 0) break; + if (bdp == fep->cur_tx && fep->tx_full == 0) + break; skb = fep->tx_skbuff[fep->skb_dirty]; /* Check for errors. */ if (status & (BD_ENET_TX_HB | BD_ENET_TX_LC | - BD_ENET_TX_RL | BD_ENET_TX_UN | - BD_ENET_TX_CSL)) { + BD_ENET_TX_RL | BD_ENET_TX_UN | BD_ENET_TX_CSL)) { dev->stats.tx_errors++; - if (status & BD_ENET_TX_HB) /* No heartbeat */ + if (status & BD_ENET_TX_HB) /* No heartbeat */ dev->stats.tx_heartbeat_errors++; - if (status & BD_ENET_TX_LC) /* Late collision */ + if (status & BD_ENET_TX_LC) /* Late collision */ dev->stats.tx_window_errors++; - if (status & BD_ENET_TX_RL) /* Retrans limit */ + if (status & BD_ENET_TX_RL) /* Retrans limit */ dev->stats.tx_aborted_errors++; - if (status & BD_ENET_TX_UN) /* Underrun */ + if (status & BD_ENET_TX_UN) /* Underrun */ dev->stats.tx_fifo_errors++; - if (status & BD_ENET_TX_CSL) /* Carrier lost */ + if (status & BD_ENET_TX_CSL) /* Carrier lost */ dev->stats.tx_carrier_errors++; } else { dev->stats.tx_packets++; @@ -556,164 +619,32 @@ fec_enet_tx(struct net_device *dev) netif_wake_queue(dev); } } - fep->dirty_tx = (cbd_t *)bdp; - spin_unlock(&fep->lock); -} - - -/* During a receive, the cur_rx points to the current incoming buffer. - * When we update through the ring, if the next incoming buffer has - * not been given to the system, we just set the empty indicator, - * effectively tossing the packet. - */ -static void -fec_enet_rx(struct net_device *dev) -{ - struct fec_enet_private *fep; - volatile fec_t *fecp; - volatile cbd_t *bdp; - unsigned short status; - struct sk_buff *skb; - ushort pkt_len; - __u8 *data; - -#ifdef CONFIG_M532x - flush_cache_all(); -#endif - - fep = netdev_priv(dev); - fecp = (volatile fec_t*)dev->base_addr; - - /* First, grab all of the stats for the incoming packet. - * These get messed up if we get called due to a busy condition. - */ - bdp = fep->cur_rx; - -while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) { - -#ifndef final_version - /* Since we have allocated space to hold a complete frame, - * the last indicator should be set. 
- */ - if ((status & BD_ENET_RX_LAST) == 0) - printk("FEC ENET: rcv is not +last\n"); -#endif - - if (!fep->opened) - goto rx_processing_done; - - /* Check for errors. */ - if (status & (BD_ENET_RX_LG | BD_ENET_RX_SH | BD_ENET_RX_NO | - BD_ENET_RX_CR | BD_ENET_RX_OV)) { - dev->stats.rx_errors++; - if (status & (BD_ENET_RX_LG | BD_ENET_RX_SH)) { - /* Frame too long or too short. */ - dev->stats.rx_length_errors++; - } - if (status & BD_ENET_RX_NO) /* Frame alignment */ - dev->stats.rx_frame_errors++; - if (status & BD_ENET_RX_CR) /* CRC Error */ - dev->stats.rx_crc_errors++; - if (status & BD_ENET_RX_OV) /* FIFO overrun */ - dev->stats.rx_fifo_errors++; - } - - /* Report late collisions as a frame error. - * On this error, the BD is closed, but we don't know what we - * have in the buffer. So, just drop this frame on the floor. - */ - if (status & BD_ENET_RX_CL) { - dev->stats.rx_errors++; - dev->stats.rx_frame_errors++; - goto rx_processing_done; - } - - /* Process the incoming frame. - */ - dev->stats.rx_packets++; - pkt_len = bdp->cbd_datlen; - dev->stats.rx_bytes += pkt_len; - data = (__u8*)__va(bdp->cbd_bufaddr); - - /* This does 16 byte alignment, exactly what we need. - * The packet length includes FCS, but we don't want to - * include that when passing upstream as it messes up - * bridging applications. - */ - skb = dev_alloc_skb(pkt_len-4); - - if (skb == NULL) { - printk("%s: Memory squeeze, dropping packet.\n", dev->name); - dev->stats.rx_dropped++; - } else { - skb_put(skb,pkt_len-4); /* Make room */ - skb_copy_to_linear_data(skb, data, pkt_len-4); - skb->protocol=eth_type_trans(skb,dev); - netif_rx(skb); - } - rx_processing_done: - - /* Clear the status flags for this buffer. - */ - status &= ~BD_ENET_RX_STATS; - - /* Mark the buffer empty. - */ - status |= BD_ENET_RX_EMPTY; - bdp->cbd_sc = status; - - /* Update BD pointer to next entry. - */ - if (status & BD_ENET_RX_WRAP) - bdp = fep->rx_bd_base; - else - bdp++; - -#if 1 - /* Doing this here will keep the FEC running while we process - * incoming frames. On a heavily loaded network, we should be - * able to keep up at the expense of system resources. - */ - fecp->fec_r_des_active = 0; -#endif - } /* while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) */ - fep->cur_rx = (cbd_t *)bdp; - -#if 0 - /* Doing this here will allow us to process all frames in the - * ring before the FEC is allowed to put more there. On a heavily - * loaded network, some frames may be lost. Unfortunately, this - * increases the interrupt overhead since we can potentially work - * our way back to the interrupt return only to come right back - * here. 
- */ - fecp->fec_r_des_active = 0; -#endif + fep->dirty_tx = (cbd_t *) bdp; + spin_unlock_irq(&fep->hw_lock); } - /* called from interrupt context */ -static void -fec_enet_mii(struct net_device *dev) +static void fec_enet_mii(struct net_device *dev) { - struct fec_enet_private *fep; - volatile fec_t *ep; - mii_list_t *mip; - uint mii_reg; + struct fec_enet_private *fep; + volatile fec_t *ep; + mii_list_t *mip; + uint mii_reg; + mii_func *mii_func = NULL; fep = netdev_priv(dev); + spin_lock_irq(&fep->mii_lock); + ep = fep->hwp; mii_reg = ep->fec_mii_data; - spin_lock(&fep->lock); - if ((mip = mii_head) == NULL) { printk("MII and no head!\n"); goto unlock; } if (mip->mii_func != NULL) - (*(mip->mii_func))(mii_reg, dev); + mii_func = *(mip->mii_func); mii_head = mip->mii_next; mip->mii_next = mii_free; @@ -723,26 +654,71 @@ fec_enet_mii(struct net_device *dev) ep->fec_mii_data = mip->mii_regval; unlock: - spin_unlock(&fep->lock); + spin_unlock_irq(&fep->mii_lock); + if (mii_func) + mii_func(mii_reg, dev); } -static int -mii_queue(struct net_device *dev, int regval, void (*func)(uint, struct net_device *)) +/* The interrupt handler. + * This is called from the MPC core interrupt. + */ +static irqreturn_t fec_enet_interrupt(int irq, void *dev_id) +{ + struct net_device *dev = dev_id; + volatile fec_t *fecp; + uint int_events; + irqreturn_t ret = IRQ_NONE; + + fecp = (volatile fec_t *)dev->base_addr; + + /* Get the interrupt events that caused us to be here. + */ + do { + int_events = fecp->fec_ievent; + fecp->fec_ievent = int_events; + + /* Handle receive event in its own function. + */ + if (int_events & FEC_ENET_RXF) { + ret = IRQ_HANDLED; + fec_enet_rx(dev); + } + + /* Transmit OK, or non-fatal error. Update the buffer + descriptors. FEC handles all errors, we just discover + them as part of the transmit process. + */ + if (int_events & FEC_ENET_TXF) { + ret = IRQ_HANDLED; + fec_enet_tx(dev); + } + + if (int_events & FEC_ENET_MII) { + ret = IRQ_HANDLED; + fec_enet_mii(dev); + } + + } while (int_events); + + return ret; +} + + +static int mii_queue(struct net_device *dev, int regval, mii_func *func) { struct fec_enet_private *fep; - unsigned long flags; - mii_list_t *mip; - int retval; + unsigned long flags; + mii_list_t *mip; + int retval; /* Add PHY address to register command. 
- */ + */ fep = netdev_priv(dev); - regval |= fep->phy_addr << 23; + spin_lock_irqsave(&fep->mii_lock, flags); + regval |= fep->phy_addr << 23; retval = 0; - spin_lock_irqsave(&fep->lock,flags); - if ((mip = mii_free) != NULL) { mii_free = mip->mii_next; mip->mii_regval = regval; @@ -759,14 +735,13 @@ mii_queue(struct net_device *dev, int re retval = 1; } - spin_unlock_irqrestore(&fep->lock,flags); - - return(retval); + spin_unlock_irqrestore(&fep->mii_lock, flags); + return retval; } static void mii_do_cmd(struct net_device *dev, const phy_cmd_t *c) { - if(!c) + if (!c) return; for (; c->mii_data != mk_mii_end; c++) @@ -827,11 +802,11 @@ static void mii_parse_anar(uint mii_reg, /* ------------------------------------------------------------------------- */ /* The Level one LXT970 is used by many boards */ -#define MII_LXT970_MIRROR 16 /* Mirror register */ -#define MII_LXT970_IER 17 /* Interrupt Enable Register */ -#define MII_LXT970_ISR 18 /* Interrupt Status Register */ -#define MII_LXT970_CONFIG 19 /* Configuration Register */ -#define MII_LXT970_CSR 20 /* Chip Status Register */ +#define MII_LXT970_MIRROR 16 /* Mirror register */ +#define MII_LXT970_IER 17 /* Interrupt Enable Register */ +#define MII_LXT970_ISR 18 /* Interrupt Status Register */ +#define MII_LXT970_CONFIG 19 /* Configuration Register */ +#define MII_LXT970_CSR 20 /* Chip Status Register */ static void mii_parse_lxt970_csr(uint mii_reg, struct net_device *dev) { @@ -855,28 +830,28 @@ static void mii_parse_lxt970_csr(uint mi } static phy_cmd_t const phy_cmd_lxt970_config[] = { - { mk_mii_read(MII_REG_CR), mii_parse_cr }, - { mk_mii_read(MII_REG_ANAR), mii_parse_anar }, - { mk_mii_end, } - }; -static phy_cmd_t const phy_cmd_lxt970_startup[] = { /* enable interrupts */ - { mk_mii_write(MII_LXT970_IER, 0x0002), NULL }, - { mk_mii_write(MII_REG_CR, 0x1200), NULL }, /* autonegotiate */ - { mk_mii_end, } - }; + {mk_mii_read(MII_REG_CR), mii_parse_cr}, + {mk_mii_read(MII_REG_ANAR), mii_parse_anar}, + {mk_mii_end,} +}; +static phy_cmd_t const phy_cmd_lxt970_startup[] = { /* enable interrupts */ + {mk_mii_write(MII_LXT970_IER, 0x0002), NULL}, + {mk_mii_write(MII_REG_CR, 0x1200), NULL}, /* autonegotiate */ + {mk_mii_end,} +}; static phy_cmd_t const phy_cmd_lxt970_ack_int[] = { - /* read SR and ISR to acknowledge */ - { mk_mii_read(MII_REG_SR), mii_parse_sr }, - { mk_mii_read(MII_LXT970_ISR), NULL }, - - /* find out the current status */ - { mk_mii_read(MII_LXT970_CSR), mii_parse_lxt970_csr }, - { mk_mii_end, } - }; -static phy_cmd_t const phy_cmd_lxt970_shutdown[] = { /* disable interrupts */ - { mk_mii_write(MII_LXT970_IER, 0x0000), NULL }, - { mk_mii_end, } - }; + /* read SR and ISR to acknowledge */ + {mk_mii_read(MII_REG_SR), mii_parse_sr}, + {mk_mii_read(MII_LXT970_ISR), NULL}, + + /* find out the current status */ + {mk_mii_read(MII_LXT970_CSR), mii_parse_lxt970_csr}, + {mk_mii_end,} +}; +static phy_cmd_t const phy_cmd_lxt970_shutdown[] = { /* disable interrupts */ + {mk_mii_write(MII_LXT970_IER, 0x0000), NULL}, + {mk_mii_end,} +}; static phy_info_t const phy_info_lxt970 = { .id = 0x07810000, .name = "LXT970", @@ -891,12 +866,12 @@ static phy_info_t const phy_info_lxt970 /* register definitions for the 971 */ -#define MII_LXT971_PCR 16 /* Port Control Register */ -#define MII_LXT971_SR2 17 /* Status Register 2 */ -#define MII_LXT971_IER 18 /* Interrupt Enable Register */ -#define MII_LXT971_ISR 19 /* Interrupt Status Register */ -#define MII_LXT971_LCR 20 /* LED Control Register */ -#define MII_LXT971_TCR 30 /* Transmit 
Control Register */ +#define MII_LXT971_PCR 16 /* Port Control Register */ +#define MII_LXT971_SR2 17 /* Status Register 2 */ +#define MII_LXT971_IER 18 /* Interrupt Enable Register */ +#define MII_LXT971_ISR 19 /* Interrupt Status Register */ +#define MII_LXT971_LCR 20 /* LED Control Register */ +#define MII_LXT971_TCR 30 /* Transmit Control Register */ /* * I had some nice ideas of running the MDIO faster... @@ -938,35 +913,35 @@ static void mii_parse_lxt971_sr2(uint mi } static phy_cmd_t const phy_cmd_lxt971_config[] = { - /* limit to 10MBit because my prototype board - * doesn't work with 100. */ - { mk_mii_read(MII_REG_CR), mii_parse_cr }, - { mk_mii_read(MII_REG_ANAR), mii_parse_anar }, - { mk_mii_read(MII_LXT971_SR2), mii_parse_lxt971_sr2 }, - { mk_mii_end, } - }; -static phy_cmd_t const phy_cmd_lxt971_startup[] = { /* enable interrupts */ - { mk_mii_write(MII_LXT971_IER, 0x00f2), NULL }, - { mk_mii_write(MII_REG_CR, 0x1200), NULL }, /* autonegotiate */ - { mk_mii_write(MII_LXT971_LCR, 0xd422), NULL }, /* LED config */ - /* Somehow does the 971 tell me that the link is down - * the first read after power-up. - * read here to get a valid value in ack_int */ - { mk_mii_read(MII_REG_SR), mii_parse_sr }, - { mk_mii_end, } - }; + /* limit to 10MBit because my prototype board + * doesn't work with 100. */ + {mk_mii_read(MII_REG_CR), mii_parse_cr}, + {mk_mii_read(MII_REG_ANAR), mii_parse_anar}, + {mk_mii_read(MII_LXT971_SR2), mii_parse_lxt971_sr2}, + {mk_mii_end,} +}; +static phy_cmd_t const phy_cmd_lxt971_startup[] = { /* enable interrupts */ + {mk_mii_write(MII_LXT971_IER, 0x00f2), NULL}, + {mk_mii_write(MII_REG_CR, 0x1200), NULL}, /* autonegotiate */ + {mk_mii_write(MII_LXT971_LCR, 0xd422), NULL}, /* LED config */ + /* Somehow does the 971 tell me that the link is down + * the first read after power-up. + * read here to get a valid value in ack_int */ + {mk_mii_read(MII_REG_SR), mii_parse_sr}, + {mk_mii_end,} +}; static phy_cmd_t const phy_cmd_lxt971_ack_int[] = { - /* acknowledge the int before reading status ! */ - { mk_mii_read(MII_LXT971_ISR), NULL }, - /* find out the current status */ - { mk_mii_read(MII_REG_SR), mii_parse_sr }, - { mk_mii_read(MII_LXT971_SR2), mii_parse_lxt971_sr2 }, - { mk_mii_end, } - }; -static phy_cmd_t const phy_cmd_lxt971_shutdown[] = { /* disable interrupts */ - { mk_mii_write(MII_LXT971_IER, 0x0000), NULL }, - { mk_mii_end, } - }; + /* acknowledge the int before reading status ! */ + {mk_mii_read(MII_LXT971_ISR), NULL}, + /* find out the current status */ + {mk_mii_read(MII_REG_SR), mii_parse_sr}, + {mk_mii_read(MII_LXT971_SR2), mii_parse_lxt971_sr2}, + {mk_mii_end,} +}; +static phy_cmd_t const phy_cmd_lxt971_shutdown[] = { /* disable interrupts */ + {mk_mii_write(MII_LXT971_IER, 0x0000), NULL}, + {mk_mii_end,} +}; static phy_info_t const phy_info_lxt971 = { .id = 0x0001378e, .name = "LXT971", @@ -981,12 +956,12 @@ static phy_info_t const phy_info_lxt971 /* register definitions */ -#define MII_QS6612_MCR 17 /* Mode Control Register */ -#define MII_QS6612_FTR 27 /* Factory Test Register */ -#define MII_QS6612_MCO 28 /* Misc. Control Register */ -#define MII_QS6612_ISR 29 /* Interrupt Source Register */ -#define MII_QS6612_IMR 30 /* Interrupt Mask Register */ -#define MII_QS6612_PCR 31 /* 100BaseTx PHY Control Reg. */ +#define MII_QS6612_MCR 17 /* Mode Control Register */ +#define MII_QS6612_FTR 27 /* Factory Test Register */ +#define MII_QS6612_MCO 28 /* Misc. 
Control Register */ +#define MII_QS6612_ISR 29 /* Interrupt Source Register */ +#define MII_QS6612_IMR 30 /* Interrupt Mask Register */ +#define MII_QS6612_PCR 31 /* 100BaseTx PHY Control Reg. */ static void mii_parse_qs6612_pcr(uint mii_reg, struct net_device *dev) { @@ -996,46 +971,54 @@ static void mii_parse_qs6612_pcr(uint mi status = *s & ~(PHY_STAT_SPMASK); - switch((mii_reg >> 2) & 7) { - case 1: status |= PHY_STAT_10HDX; break; - case 2: status |= PHY_STAT_100HDX; break; - case 5: status |= PHY_STAT_10FDX; break; - case 6: status |= PHY_STAT_100FDX; break; -} + switch ((mii_reg >> 2) & 7) { + case 1: + status |= PHY_STAT_10HDX; + break; + case 2: + status |= PHY_STAT_100HDX; + break; + case 5: + status |= PHY_STAT_10FDX; + break; + case 6: + status |= PHY_STAT_100FDX; + break; + } *s = status; } static phy_cmd_t const phy_cmd_qs6612_config[] = { - /* The PHY powers up isolated on the RPX, - * so send a command to allow operation. - */ - { mk_mii_write(MII_QS6612_PCR, 0x0dc0), NULL }, + /* The PHY powers up isolated on the RPX, + * so send a command to allow operation. + */ + {mk_mii_write(MII_QS6612_PCR, 0x0dc0), NULL}, - /* parse cr and anar to get some info */ - { mk_mii_read(MII_REG_CR), mii_parse_cr }, - { mk_mii_read(MII_REG_ANAR), mii_parse_anar }, - { mk_mii_end, } - }; -static phy_cmd_t const phy_cmd_qs6612_startup[] = { /* enable interrupts */ - { mk_mii_write(MII_QS6612_IMR, 0x003a), NULL }, - { mk_mii_write(MII_REG_CR, 0x1200), NULL }, /* autonegotiate */ - { mk_mii_end, } - }; + /* parse cr and anar to get some info */ + {mk_mii_read(MII_REG_CR), mii_parse_cr}, + {mk_mii_read(MII_REG_ANAR), mii_parse_anar}, + {mk_mii_end,} +}; +static phy_cmd_t const phy_cmd_qs6612_startup[] = { /* enable interrupts */ + {mk_mii_write(MII_QS6612_IMR, 0x003a), NULL}, + {mk_mii_write(MII_REG_CR, 0x1200), NULL}, /* autonegotiate */ + {mk_mii_end,} +}; static phy_cmd_t const phy_cmd_qs6612_ack_int[] = { - /* we need to read ISR, SR and ANER to acknowledge */ - { mk_mii_read(MII_QS6612_ISR), NULL }, - { mk_mii_read(MII_REG_SR), mii_parse_sr }, - { mk_mii_read(MII_REG_ANER), NULL }, - - /* read pcr to get info */ - { mk_mii_read(MII_QS6612_PCR), mii_parse_qs6612_pcr }, - { mk_mii_end, } - }; -static phy_cmd_t const phy_cmd_qs6612_shutdown[] = { /* disable interrupts */ - { mk_mii_write(MII_QS6612_IMR, 0x0000), NULL }, - { mk_mii_end, } - }; + /* we need to read ISR, SR and ANER to acknowledge */ + {mk_mii_read(MII_QS6612_ISR), NULL}, + {mk_mii_read(MII_REG_SR), mii_parse_sr}, + {mk_mii_read(MII_REG_ANER), NULL}, + + /* read pcr to get info */ + {mk_mii_read(MII_QS6612_PCR), mii_parse_qs6612_pcr}, + {mk_mii_end,} +}; +static phy_cmd_t const phy_cmd_qs6612_shutdown[] = { /* disable interrupts */ + {mk_mii_write(MII_QS6612_IMR, 0x0000), NULL}, + {mk_mii_end,} +}; static phy_info_t const phy_info_qs6612 = { .id = 0x00181440, .name = "QS6612", @@ -1050,13 +1033,13 @@ static phy_info_t const phy_info_qs6612 /* register definitions for the 874 */ -#define MII_AM79C874_MFR 16 /* Miscellaneous Feature Register */ -#define MII_AM79C874_ICSR 17 /* Interrupt/Status Register */ -#define MII_AM79C874_DR 18 /* Diagnostic Register */ -#define MII_AM79C874_PMLR 19 /* Power and Loopback Register */ -#define MII_AM79C874_MCR 21 /* ModeControl Register */ -#define MII_AM79C874_DC 23 /* Disconnect Counter */ -#define MII_AM79C874_REC 24 /* Recieve Error Counter */ +#define MII_AM79C874_MFR 16 /* Miscellaneous Feature Register */ +#define MII_AM79C874_ICSR 17 /* Interrupt/Status Register */ +#define 
MII_AM79C874_DR 18 /* Diagnostic Register */
+#define MII_AM79C874_PMLR 19 /* Power and Loopback Register */
+#define MII_AM79C874_MCR 21 /* Mode Control Register */
+#define MII_AM79C874_DC 23 /* Disconnect Counter */
+#define MII_AM79C874_REC 24 /* Receive Error Counter */
 
 static void mii_parse_am79c874_dr(uint mii_reg, struct net_device *dev)
 {
@@ -1069,37 +1052,39 @@ static void mii_parse_am79c874_dr(uint m
 if (mii_reg & 0x0080)
 status |= PHY_STAT_ANC;
 if (mii_reg & 0x0400)
- status |= ((mii_reg & 0x0800) ? PHY_STAT_100FDX : PHY_STAT_100HDX);
+ status |=
+ ((mii_reg & 0x0800) ? PHY_STAT_100FDX : PHY_STAT_100HDX);
 else
- status |= ((mii_reg & 0x0800) ? PHY_STAT_10FDX : PHY_STAT_10HDX);
+ status |=
+ ((mii_reg & 0x0800) ? PHY_STAT_10FDX : PHY_STAT_10HDX);
 
 *s = status;
 }
 
 static phy_cmd_t const phy_cmd_am79c874_config[] = {
- { mk_mii_read(MII_REG_CR), mii_parse_cr },
- { mk_mii_read(MII_REG_ANAR), mii_parse_anar },
- { mk_mii_read(MII_AM79C874_DR), mii_parse_am79c874_dr },
- { mk_mii_end, }
- };
-static phy_cmd_t const phy_cmd_am79c874_startup[] = { /* enable interrupts */
- { mk_mii_write(MII_AM79C874_ICSR, 0xff00), NULL },
- { mk_mii_write(MII_REG_CR, 0x1200), NULL }, /* autonegotiate */
- { mk_mii_read(MII_REG_SR), mii_parse_sr },
- { mk_mii_end, }
- };
+ {mk_mii_read(MII_REG_CR), mii_parse_cr},
+ {mk_mii_read(MII_REG_ANAR), mii_parse_anar},
+ {mk_mii_read(MII_AM79C874_DR), mii_parse_am79c874_dr},
+ {mk_mii_end,}
+};
+static phy_cmd_t const phy_cmd_am79c874_startup[] = { /* enable interrupts */
+ {mk_mii_write(MII_AM79C874_ICSR, 0xff00), NULL},
+ {mk_mii_write(MII_REG_CR, 0x1200), NULL}, /* autonegotiate */
+ {mk_mii_read(MII_REG_SR), mii_parse_sr},
+ {mk_mii_end,}
+};
 static phy_cmd_t const phy_cmd_am79c874_ack_int[] = {
- /* find out the current status */
- { mk_mii_read(MII_REG_SR), mii_parse_sr },
- { mk_mii_read(MII_AM79C874_DR), mii_parse_am79c874_dr },
- /* we only need to read ISR to acknowledge */
- { mk_mii_read(MII_AM79C874_ICSR), NULL },
- { mk_mii_end, }
- };
-static phy_cmd_t const phy_cmd_am79c874_shutdown[] = { /* disable interrupts */
- { mk_mii_write(MII_AM79C874_ICSR, 0x0000), NULL },
- { mk_mii_end, }
- };
+ /* find out the current status */
+ {mk_mii_read(MII_REG_SR), mii_parse_sr},
+ {mk_mii_read(MII_AM79C874_DR), mii_parse_am79c874_dr},
+ /* we only need to read ISR to acknowledge */
+ {mk_mii_read(MII_AM79C874_ICSR), NULL},
+ {mk_mii_end,}
+};
+static phy_cmd_t const phy_cmd_am79c874_shutdown[] = { /* disable interrupts */
+ {mk_mii_write(MII_AM79C874_ICSR, 0x0000), NULL},
+ {mk_mii_end,}
+};
 static phy_info_t const phy_info_am79c874 = {
 .id = 0x00022561,
 .name = "AM79C874",
@@ -1109,7 +1094,6 @@ static phy_info_t const phy_info_am79c87
 .shutdown = phy_cmd_am79c874_shutdown
 };
 
-
 /* ------------------------------------------------------------------------- */
 /* Kendin KS8721BL phy */
 
@@ -1120,27 +1104,27 @@ static phy_info_t const phy_info_am79c87
 #define MII_KS8721BL_PHYCR 31
 
 static phy_cmd_t const phy_cmd_ks8721bl_config[] = {
- { mk_mii_read(MII_REG_CR), mii_parse_cr },
- { mk_mii_read(MII_REG_ANAR), mii_parse_anar },
- { mk_mii_end, }
- };
-static phy_cmd_t const phy_cmd_ks8721bl_startup[] = { /* enable interrupts */
- { mk_mii_write(MII_KS8721BL_ICSR, 0xff00), NULL },
- { mk_mii_write(MII_REG_CR, 0x1200), NULL }, /* autonegotiate */
- { mk_mii_read(MII_REG_SR), mii_parse_sr },
- { mk_mii_end, }
- };
+ {mk_mii_read(MII_REG_CR), mii_parse_cr},
+ {mk_mii_read(MII_REG_ANAR), mii_parse_anar},
+ {mk_mii_end,}
+};
+static phy_cmd_t const phy_cmd_ks8721bl_startup[] =
{ /* enable interrupts */
+ {mk_mii_write(MII_KS8721BL_ICSR, 0xff00), NULL},
+ {mk_mii_write(MII_REG_CR, 0x1200), NULL}, /* autonegotiate */
+ {mk_mii_read(MII_REG_SR), mii_parse_sr},
+ {mk_mii_end,}
+};
 static phy_cmd_t const phy_cmd_ks8721bl_ack_int[] = {
- /* find out the current status */
- { mk_mii_read(MII_REG_SR), mii_parse_sr },
- /* we only need to read ISR to acknowledge */
- { mk_mii_read(MII_KS8721BL_ICSR), NULL },
- { mk_mii_end, }
- };
-static phy_cmd_t const phy_cmd_ks8721bl_shutdown[] = { /* disable interrupts */
- { mk_mii_write(MII_KS8721BL_ICSR, 0x0000), NULL },
- { mk_mii_end, }
- };
+ /* find out the current status */
+ {mk_mii_read(MII_REG_SR), mii_parse_sr},
+ /* we only need to read ISR to acknowledge */
+ {mk_mii_read(MII_KS8721BL_ICSR), NULL},
+ {mk_mii_end,}
+};
+static phy_cmd_t const phy_cmd_ks8721bl_shutdown[] = { /* disable interrupts */
+ {mk_mii_write(MII_KS8721BL_ICSR, 0x0000), NULL},
+ {mk_mii_end,}
+};
 static phy_info_t const phy_info_ks8721bl = {
 .id = 0x00022161,
 .name = "KS8721BL",
@@ -1153,7 +1137,7 @@ static phy_info_t const phy_info_ks8721b
 /* ------------------------------------------------------------------------- */
 /* register definitions for the DP83848 */
 
-#define MII_DP8384X_PHYSTST 16 /* PHY Status Register */
+#define MII_DP8384X_PHYSTST 16 /* PHY Status Register */
 
 static void mii_parse_dp8384x_sr2(uint mii_reg, struct net_device *dev)
 {
@@ -1169,15 +1153,19 @@ static void mii_parse_dp8384x_sr2(uint m
 } else
 fep->link = 0; /* Status of link */
 
- if (mii_reg & 0x0010) /* Autonegotioation complete */
+ if (mii_reg & 0x0010) /* Autonegotiation complete */
 *s |= PHY_STAT_ANC;
- if (mii_reg & 0x0002) { /* 10MBps? */
- if (mii_reg & 0x0004) /* Full Duplex? */
+ /* 10MBps? */
+ if (mii_reg & 0x0002) {
+ /* Full Duplex? */
+ if (mii_reg & 0x0004)
 *s |= PHY_STAT_10FDX;
 else
 *s |= PHY_STAT_10HDX;
- } else { /* 100 Mbps? */
- if (mii_reg & 0x0004) /* Full Duplex? */
+ } else {
+ /* otherwise 100 Mbps */
+ /* Full Duplex?
*/ + if (mii_reg & 0x0004) *s |= PHY_STAT_100FDX; else *s |= PHY_STAT_100HDX; @@ -1186,32 +1174,33 @@ static void mii_parse_dp8384x_sr2(uint m *s |= PHY_STAT_FAULT; } -static phy_info_t phy_info_dp83848= { +static phy_info_t phy_info_dp83848 = { 0x020005c9, "DP83848", - (const phy_cmd_t []) { /* config */ - { mk_mii_read(MII_REG_CR), mii_parse_cr }, - { mk_mii_read(MII_REG_ANAR), mii_parse_anar }, - { mk_mii_read(MII_DP8384X_PHYSTST), mii_parse_dp8384x_sr2 }, - { mk_mii_end, } + (const phy_cmd_t[]){ /* config */ + {mk_mii_read(MII_REG_CR), mii_parse_cr}, + {mk_mii_read(MII_REG_ANAR), mii_parse_anar}, + {mk_mii_read(MII_DP8384X_PHYSTST), + mii_parse_dp8384x_sr2}, + {mk_mii_end,} }, - (const phy_cmd_t []) { /* startup - enable interrupts */ - { mk_mii_write(MII_REG_CR, 0x1200), NULL }, /* autonegotiate */ - { mk_mii_read(MII_REG_SR), mii_parse_sr }, - { mk_mii_end, } + (const phy_cmd_t[]){ /* startup - enable interrupts */ + {mk_mii_write(MII_REG_CR, 0x1200), NULL}, /* autonegotiate */ + {mk_mii_read(MII_REG_SR), mii_parse_sr}, + {mk_mii_end,} }, - (const phy_cmd_t []) { /* ack_int - never happens, no interrupt */ - { mk_mii_end, } + (const phy_cmd_t[]){ /* ack_int - never happens, no interrupt */ + {mk_mii_end,} }, - (const phy_cmd_t []) { /* shutdown */ - { mk_mii_end, } + (const phy_cmd_t[]){ /* shutdown */ + {mk_mii_end,} }, }; /* ------------------------------------------------------------------------- */ -static phy_info_t const * const phy_info[] = { +static phy_info_t const *const phy_info[] = { &phy_info_lxt970, &phy_info_lxt971, &phy_info_qs6612, @@ -1221,22 +1210,38 @@ static phy_info_t const * const phy_info NULL }; -/* ------------------------------------------------------------------------- */ -#if !defined(CONFIG_M532x) -#ifdef CONFIG_RPXCLASSIC -static void -mii_link_interrupt(void *dev_id); -#else -static irqreturn_t -mii_link_interrupt(int irq, void * dev_id); -#endif +#if defined(CONFIG_M5272) +static void fec_phy_ack_intr(void) +{ + volatile unsigned long *icrp; + /* Acknowledge the interrupt */ + icrp = (volatile unsigned long *)(MCF_MBAR + MCFSIM_ICR1); + *icrp = 0x0d000000; +} + +/* This interrupt occurs when the PHY detects a link change. +*/ +static irqreturn_t mii_link_interrupt(int irq, void *dev_id) +{ + struct net_device *dev = dev_id; + struct fec_enet_private *fep = netdev_priv(dev); + + fec_phy_ack_intr(); + +#if 0 + disable_irq(fep->mii_irq); /* disable now, enable later */ #endif -#if defined(CONFIG_M5272) + mii_do_cmd(dev, fep->phy->ack_int); + mii_do_cmd(dev, phy_cmd_relink); /* restart and display status */ + + return IRQ_HANDLED; +} + /* * Code specific to Coldfire 5272 setup. */ -static void __inline__ fec_request_intrs(struct net_device *dev) +static void __init fec_request_intrs(struct net_device *dev) { volatile unsigned long *icrp; static const struct idesc { @@ -1244,27 +1249,36 @@ static void __inline__ fec_request_intrs unsigned short irq; irq_handler_t handler; } *idp, id[] = { - { "fec(RX)", 86, fec_enet_interrupt }, - { "fec(TX)", 87, fec_enet_interrupt }, - { "fec(OTHER)", 88, fec_enet_interrupt }, - { "fec(MII)", 66, mii_link_interrupt }, - { NULL }, + /* + * Available but not allocated because not handled: + * fec(OTHER) 88 + */ + { "fec(RX)", 86, fec_enet_interrupt}, + { "fec(TX)", 87, fec_enet_interrupt}, + { "fec(MII)", 66, mii_link_interrupt}, + { NULL, 0 }, }; /* Setup interrupt handlers. 
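
A note on the pattern used throughout this driver: each SoC variant keeps its interrupt wiring in a small NULL-terminated descriptor table and walks it once at init time, so adding or dropping a source is a one-line table edit. A minimal self-contained sketch of the same idiom (the table contents are illustrative; only the 5272 RX/TX vectors come from this patch):

    struct irq_entry {
            const char *name;       /* label shown in /proc/interrupts */
            unsigned short irq;     /* vector number */
            irq_handler_t handler;  /* ISR to install */
    };

    static void install_irq_table(struct net_device *dev)
    {
            static const struct irq_entry id[] = {
                    { "fec(RX)", 86, fec_enet_interrupt },
                    { "fec(TX)", 87, fec_enet_interrupt },
                    { NULL, 0, NULL },      /* terminator ends the walk */
            };
            const struct irq_entry *idp;

            for (idp = id; idp->name; idp++) {
                    int ret = request_irq(idp->irq, idp->handler,
                                          IRQF_DISABLED, idp->name, dev);
                    if (ret)
                            printk("FEC: Could not allocate %s IRQ(%d)!\n",
                                   idp->name, idp->irq);
            }
    }
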
*/ for (idp = id; idp->name; idp++) { - if (request_irq(idp->irq, idp->handler, 0, idp->name, dev) != 0) - printk("FEC: Could not allocate %s IRQ(%d)!\n", idp->name, idp->irq); + int ret; + + ret =request_irq(idp->irq, idp->handler, IRQF_DISABLED, idp->name, + dev); + if (ret) + printk("FEC: Could not allocate %s IRQ(%d)!\n", + idp->name, idp->irq); } /* Unmask interrupt at ColdFire 5272 SIM */ - icrp = (volatile unsigned long *) (MCF_MBAR + MCFSIM_ICR3); + icrp = (volatile unsigned long *)(MCF_MBAR + MCFSIM_ICR3); *icrp = 0x00000ddd; - icrp = (volatile unsigned long *) (MCF_MBAR + MCFSIM_ICR1); + icrp = (volatile unsigned long *)(MCF_MBAR + MCFSIM_ICR1); *icrp = 0x0d000000; } -static void __inline__ fec_set_mii(struct net_device *dev, struct fec_enet_private *fep) +static void __init fec_set_mii(struct net_device *dev, + struct fec_enet_private *fep) { volatile fec_t *fecp; @@ -1282,7 +1296,7 @@ static void __inline__ fec_set_mii(struc fec_restart(dev, 0); } -static void __inline__ fec_get_mac(struct net_device *dev) +static void __init fec_get_mac(struct net_device *dev) { struct fec_enet_private *fep = netdev_priv(dev); volatile fec_t *fecp; @@ -1303,8 +1317,8 @@ static void __inline__ fec_get_mac(struc (iap[3] == 0xff) && (iap[4] == 0xff) && (iap[5] == 0xff)) iap = fec_mac_default; } else { - *((unsigned long *) &tmpaddr[0]) = fecp->fec_addr_low; - *((unsigned short *) &tmpaddr[4]) = (fecp->fec_addr_high >> 16); + *((unsigned long *)&tmpaddr[0]) = fecp->fec_addr_low; + *((unsigned short *)&tmpaddr[4]) = (fecp->fec_addr_high >> 16); iap = &tmpaddr[0]; } @@ -1312,36 +1326,29 @@ static void __inline__ fec_get_mac(struc /* Adjust MAC if using default MAC address */ if (iap == fec_mac_default) - dev->dev_addr[ETH_ALEN-1] = fec_mac_default[ETH_ALEN-1] + fep->index; + dev->dev_addr[ETH_ALEN - 1] = + fec_mac_default[ETH_ALEN - 1] + fep->index; } -static void __inline__ fec_enable_phy_intr(void) +static void fec_enable_phy_intr(void) { } -static void __inline__ fec_disable_phy_intr(void) +static void fec_disable_phy_intr(void) { volatile unsigned long *icrp; - icrp = (volatile unsigned long *) (MCF_MBAR + MCFSIM_ICR1); + icrp = (volatile unsigned long *)(MCF_MBAR + MCFSIM_ICR1); *icrp = 0x08000000; } -static void __inline__ fec_phy_ack_intr(void) -{ - volatile unsigned long *icrp; - /* Acknowledge the interrupt */ - icrp = (volatile unsigned long *) (MCF_MBAR + MCFSIM_ICR1); - *icrp = 0x0d000000; -} - -static void __inline__ fec_localhw_setup(void) +static void fec_localhw_setup(void) { } /* * Do not need to make region uncached on 5272. */ -static void __inline__ fec_uncache(unsigned long addr) +static void __init fec_uncache(unsigned long addr) { } @@ -1353,7 +1360,7 @@ static void __inline__ fec_uncache(unsig * Code specific to Coldfire 5230/5231/5232/5234/5235, * the 5270/5271/5274/5275 and 5280/5282 setups. 
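
For reference, the 5272 unmasking above boils down to two word writes into the SIM's interrupt control registers: one for the three FEC event sources, one for the MII/PHY source. A sketch of that sequence in isolation, using the same constants and level codes as the patch:

    static void fec_5272_unmask_irqs(void)
    {
            volatile unsigned long *icrp;

            /* level codes for the RX/TX/OTHER sources live in ICR3 */
            icrp = (volatile unsigned long *)(MCF_MBAR + MCFSIM_ICR3);
            *icrp = 0x00000ddd;

            /* the MII/PHY link-change source lives in ICR1 */
            icrp = (volatile unsigned long *)(MCF_MBAR + MCFSIM_ICR1);
            *icrp = 0x0d000000;
    }
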
*/ -static void __inline__ fec_request_intrs(struct net_device *dev) +static void __init fec_request_intrs(struct net_device *dev) { struct fec_enet_private *fep; int b; @@ -1361,20 +1368,16 @@ static void __inline__ fec_request_intrs char *name; unsigned short irq; } *idp, id[] = { - { "fec(TXF)", 23 }, - { "fec(TXB)", 24 }, - { "fec(TXFIFO)", 25 }, - { "fec(TXCR)", 26 }, - { "fec(RXF)", 27 }, - { "fec(RXB)", 28 }, - { "fec(MII)", 29 }, - { "fec(LC)", 30 }, - { "fec(HBERR)", 31 }, - { "fec(GRA)", 32 }, - { "fec(EBERR)", 33 }, - { "fec(BABT)", 34 }, - { "fec(BABR)", 35 }, - { NULL }, + /* + * Available but not allocated because not handled: + * fec(TXB) 24, fec(TXFIFO) 25, fec(TXCR) 26, fec(RXB) 28, + * fec(LC) 30, fec(HBERR) 31, fec(GRA) 32, fec(EBERR) 33, + * fec(BABT) 34, fec(BABR), 35 + */ + { "fec(TXF)", 23}, + { "fec(RXF)", 27}, + { "fec(MII)", 29}, + { NULL, 0}, }; fep = netdev_priv(dev); @@ -1382,43 +1385,47 @@ static void __inline__ fec_request_intrs /* Setup interrupt handlers. */ for (idp = id; idp->name; idp++) { - if (request_irq(b+idp->irq, fec_enet_interrupt, 0, idp->name, dev) != 0) - printk("FEC: Could not allocate %s IRQ(%d)!\n", idp->name, b+idp->irq); - } + int ret; + ret = request_irq(b + idp->irq, fec_enet_interrupt, IRQF_DISABLED, + idp->name, dev); + if (ret) + printk("FEC: Could not allocate %s IRQ(%d)!\n", + idp->name, b + idp->irq); + } +#if defined(CONFIG_M527x) || defined(CONFIG_M528x) /* Unmask interrupts at ColdFire 5280/5282 interrupt controller */ { - volatile unsigned char *icrp; - volatile unsigned long *imrp; + volatile unsigned char *icrp; + volatile unsigned long *imrp; int i, ilip; b = (fep->index) ? MCFICM_INTC1 : MCFICM_INTC0; - icrp = (volatile unsigned char *) (MCF_IPSBAR + b + - MCFINTC_ICR0); + icrp = (volatile unsigned char *)(MCF_IPSBAR + b + + MCFINTC_ICR0); for (i = 23, ilip = 0x28; (i < 36); i++) icrp[i] = ilip--; - imrp = (volatile unsigned long *) (MCF_IPSBAR + b + - MCFINTC_IMRH); + imrp = (volatile unsigned long *)(MCF_IPSBAR + b + + MCFINTC_IMRH); *imrp &= ~0x0000000f; - imrp = (volatile unsigned long *) (MCF_IPSBAR + b + - MCFINTC_IMRL); + imrp = (volatile unsigned long *)(MCF_IPSBAR + b + + MCFINTC_IMRL); *imrp &= ~0xff800001; } - +#endif #if defined(CONFIG_M528x) /* Set up gpio outputs for MII lines */ { volatile u16 *gpio_paspar; volatile u8 *gpio_pehlpar; - gpio_paspar = (volatile u16 *) (MCF_IPSBAR + 0x100056); - gpio_pehlpar = (volatile u16 *) (MCF_IPSBAR + 0x100058); + gpio_paspar = (volatile u16 *)(MCF_IPSBAR + 0x100056); + gpio_pehlpar = (volatile u16 *)(MCF_IPSBAR + 0x100058); *gpio_paspar |= 0x0f00; *gpio_pehlpar = 0xc0; } #endif - #if defined(CONFIG_M527x) /* Set up gpio outputs for MII lines */ { @@ -1443,7 +1450,8 @@ static void __inline__ fec_request_intrs #endif /* CONFIG_M527x */ } -static void __inline__ fec_set_mii(struct net_device *dev, struct fec_enet_private *fep) +static void __init fec_set_mii(struct net_device *dev, + struct fec_enet_private *fep) { volatile fec_t *fecp; @@ -1461,7 +1469,7 @@ static void __inline__ fec_set_mii(struc fec_restart(dev, 0); } -static void __inline__ fec_get_mac(struct net_device *dev) +static void __init fec_get_mac(struct net_device *dev) { struct fec_enet_private *fep = netdev_priv(dev); volatile fec_t *fecp; @@ -1482,8 +1490,8 @@ static void __inline__ fec_get_mac(struc (iap[3] == 0xff) && (iap[4] == 0xff) && (iap[5] == 0xff)) iap = fec_mac_default; } else { - *((unsigned long *) &tmpaddr[0]) = fecp->fec_addr_low; - *((unsigned short *) &tmpaddr[4]) = (fecp->fec_addr_high >> 
16); + *((unsigned long *)&tmpaddr[0]) = fecp->fec_addr_low; + *((unsigned short *)&tmpaddr[4]) = (fecp->fec_addr_high >> 16); iap = &tmpaddr[0]; } @@ -1491,29 +1499,26 @@ static void __inline__ fec_get_mac(struc /* Adjust MAC if using default MAC address */ if (iap == fec_mac_default) - dev->dev_addr[ETH_ALEN-1] = fec_mac_default[ETH_ALEN-1] + fep->index; + dev->dev_addr[ETH_ALEN - 1] = + fec_mac_default[ETH_ALEN - 1] + fep->index; } -static void __inline__ fec_enable_phy_intr(void) +static void fec_enable_phy_intr(void) { } -static void __inline__ fec_disable_phy_intr(void) +static void fec_disable_phy_intr(void) { } -static void __inline__ fec_phy_ack_intr(void) -{ -} - -static void __inline__ fec_localhw_setup(void) +static void fec_localhw_setup(void) { } /* * Do not need to make region uncached on 5272. */ -static void __inline__ fec_uncache(unsigned long addr) +static void __init fec_uncache(unsigned long addr) { } @@ -1524,7 +1529,7 @@ static void __inline__ fec_uncache(unsig /* * Code specific to Coldfire 520x */ -static void __inline__ fec_request_intrs(struct net_device *dev) +static void __init fec_request_intrs(struct net_device *dev) { struct fec_enet_private *fep; int b; @@ -1532,20 +1537,16 @@ static void __inline__ fec_request_intrs char *name; unsigned short irq; } *idp, id[] = { - { "fec(TXF)", 23 }, - { "fec(TXB)", 24 }, - { "fec(TXFIFO)", 25 }, - { "fec(TXCR)", 26 }, - { "fec(RXF)", 27 }, - { "fec(RXB)", 28 }, - { "fec(MII)", 29 }, - { "fec(LC)", 30 }, - { "fec(HBERR)", 31 }, - { "fec(GRA)", 32 }, - { "fec(EBERR)", 33 }, - { "fec(BABT)", 34 }, - { "fec(BABR)", 35 }, - { NULL }, + /* + * Available but not allocated because not handled: + * fec(TXB) 24, fec(TXFIFO) 25, fec(TXCR) 26, fec(RXB) 28, + * fec(LC) 30, fec(HBERR) 31, fec(GRA) 32, fec(EBERR) 33, + * fec(BABT) 34, fec(BABR) 35 + */ + { "fec(TXF)", 23}, + { "fec(RXF)", 27}, + { "fec(MII)", 29}, + { NULL, 0}, }; fep = netdev_priv(dev); @@ -1553,28 +1554,34 @@ static void __inline__ fec_request_intrs /* Setup interrupt handlers. 
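
The fec_get_mac() variants in this patch all apply the same sanity rule to a flash-provided address: all-zero and all-ones values are treated as unprogrammed and replaced with fec_mac_default, whose last byte is then offset by the port index so multi-port boards stay unique. A standalone sketch of that check (the helper name is illustrative, not from the patch):

    static int mac_is_unprogrammed(const unsigned char *p)
    {
            int i, all_zero = 1, all_ones = 1;

            for (i = 0; i < 6; i++) {
                    if (p[i] != 0x00)
                            all_zero = 0;
                    if (p[i] != 0xff)
                            all_ones = 0;
            }
            return all_zero || all_ones;
    }
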
*/ for (idp = id; idp->name; idp++) { - if (request_irq(b+idp->irq,fec_enet_interrupt,0,idp->name,dev)!=0) - printk("FEC: Could not allocate %s IRQ(%d)!\n", idp->name, b+idp->irq); + int ret; + + ret = request_irq(b + idp->irq, fec_enet_interrupt, IRQF_DISABLED, + idp->name, dev); + if (ret) + printk("FEC: Could not allocate %s IRQ(%d)!\n", + idp->name, b + idp->irq); } /* Unmask interrupts at ColdFire interrupt controller */ { - volatile unsigned char *icrp; - volatile unsigned long *imrp; + volatile unsigned char *icrp; + volatile unsigned long *imrp; - icrp = (volatile unsigned char *) (MCF_IPSBAR + MCFICM_INTC0 + - MCFINTC_ICR0); + icrp = (volatile unsigned char *)(MCF_IPSBAR + MCFICM_INTC0 + + MCFINTC_ICR0); for (b = 36; (b < 49); b++) icrp[b] = 0x04; - imrp = (volatile unsigned long *) (MCF_IPSBAR + MCFICM_INTC0 + - MCFINTC_IMRH); + imrp = (volatile unsigned long *)(MCF_IPSBAR + MCFICM_INTC0 + + MCFINTC_IMRH); *imrp &= ~0x0001FFF0; } *(volatile unsigned char *)(MCF_IPSBAR + MCF_GPIO_PAR_FEC) |= 0xf0; *(volatile unsigned char *)(MCF_IPSBAR + MCF_GPIO_PAR_FECI2C) |= 0x0f; } -static void __inline__ fec_set_mii(struct net_device *dev, struct fec_enet_private *fep) +static void __init fec_set_mii(struct net_device *dev, + struct fec_enet_private *fep) { volatile fec_t *fecp; @@ -1592,7 +1599,7 @@ static void __inline__ fec_set_mii(struc fec_restart(dev, 0); } -static void __inline__ fec_get_mac(struct net_device *dev) +static void __init fec_get_mac(struct net_device *dev) { struct fec_enet_private *fep = netdev_priv(dev); volatile fec_t *fecp; @@ -1607,14 +1614,14 @@ static void __inline__ fec_get_mac(struc */ iap = FEC_FLASHMAC; if ((iap[0] == 0) && (iap[1] == 0) && (iap[2] == 0) && - (iap[3] == 0) && (iap[4] == 0) && (iap[5] == 0)) + (iap[3] == 0) && (iap[4] == 0) && (iap[5] == 0)) iap = fec_mac_default; if ((iap[0] == 0xff) && (iap[1] == 0xff) && (iap[2] == 0xff) && - (iap[3] == 0xff) && (iap[4] == 0xff) && (iap[5] == 0xff)) + (iap[3] == 0xff) && (iap[4] == 0xff) && (iap[5] == 0xff)) iap = fec_mac_default; } else { - *((unsigned long *) &tmpaddr[0]) = fecp->fec_addr_low; - *((unsigned short *) &tmpaddr[4]) = (fecp->fec_addr_high >> 16); + *((unsigned long *)&tmpaddr[0]) = fecp->fec_addr_low; + *((unsigned short *)&tmpaddr[4]) = (fecp->fec_addr_high >> 16); iap = &tmpaddr[0]; } @@ -1622,26 +1629,23 @@ static void __inline__ fec_get_mac(struc /* Adjust MAC if using default MAC address */ if (iap == fec_mac_default) - dev->dev_addr[ETH_ALEN-1] = fec_mac_default[ETH_ALEN-1] + fep->index; -} - -static void __inline__ fec_enable_phy_intr(void) -{ + dev->dev_addr[ETH_ALEN - 1] = + fec_mac_default[ETH_ALEN - 1] + fep->index; } -static void __inline__ fec_disable_phy_intr(void) +static void fec_enable_phy_intr(void) { } -static void __inline__ fec_phy_ack_intr(void) +static void fec_disable_phy_intr(void) { } -static void __inline__ fec_localhw_setup(void) +static void fec_localhw_setup(void) { } -static void __inline__ fec_uncache(unsigned long addr) +static void __init fec_uncache(unsigned long addr) { } @@ -1651,7 +1655,7 @@ static void __inline__ fec_uncache(unsig /* * Code specific for M532x */ -static void __inline__ fec_request_intrs(struct net_device *dev) +static void __init fec_request_intrs(struct net_device *dev) { struct fec_enet_private *fep; int b; @@ -1659,20 +1663,16 @@ static void __inline__ fec_request_intrs char *name; unsigned short irq; } *idp, id[] = { - { "fec(TXF)", 36 }, - { "fec(TXB)", 37 }, - { "fec(TXFIFO)", 38 }, - { "fec(TXCR)", 39 }, - { "fec(RXF)", 40 }, - { 
"fec(RXB)", 41 }, - { "fec(MII)", 42 }, - { "fec(LC)", 43 }, - { "fec(HBERR)", 44 }, - { "fec(GRA)", 45 }, - { "fec(EBERR)", 46 }, - { "fec(BABT)", 47 }, - { "fec(BABR)", 48 }, - { NULL }, + /* + * Available but not allocated because not handled: + * fec(TXB) 37, fec(TXFIFO) 38, fec(TXCR) 39, fec(RXB) 41, + * fec(LC) 43, fec(HBERR) 44, fec(GRA) 45, fec(EBERR) 46, + * fec(BABT) 47, fec(BABR) 48 + */ + { "fec(TXF)", 36}, + { "fec(RXF)", 40}, + { "fec(MII)", 42}, + { NULL, 0}, }; fep = netdev_priv(dev); @@ -1680,9 +1680,13 @@ static void __inline__ fec_request_intrs /* Setup interrupt handlers. */ for (idp = id; idp->name; idp++) { - if (request_irq(b+idp->irq,fec_enet_interrupt,0,idp->name,dev)!=0) - printk("FEC: Could not allocate %s IRQ(%d)!\n", - idp->name, b+idp->irq); + int ret; + + ret = request_irq(b + idp->irq, fec_enet_interrupt, IRQF_DISABLED, + idp->name, dev); + if (ret) + printk("FEC: Could not allocate %s IRQ(%d)!\n", + idp->name, b + idp->irq); } /* Unmask interrupts */ @@ -1700,31 +1704,31 @@ static void __inline__ fec_request_intrs MCF_INTC0_ICR47 = 0x2; MCF_INTC0_ICR48 = 0x2; - MCF_INTC0_IMRH &= ~( - MCF_INTC_IMRH_INT_MASK36 | - MCF_INTC_IMRH_INT_MASK37 | - MCF_INTC_IMRH_INT_MASK38 | - MCF_INTC_IMRH_INT_MASK39 | - MCF_INTC_IMRH_INT_MASK40 | - MCF_INTC_IMRH_INT_MASK41 | - MCF_INTC_IMRH_INT_MASK42 | - MCF_INTC_IMRH_INT_MASK43 | - MCF_INTC_IMRH_INT_MASK44 | - MCF_INTC_IMRH_INT_MASK45 | - MCF_INTC_IMRH_INT_MASK46 | - MCF_INTC_IMRH_INT_MASK47 | - MCF_INTC_IMRH_INT_MASK48 ); + MCF_INTC0_IMRH &= ~(MCF_INTC_IMRH_INT_MASK36 | + MCF_INTC_IMRH_INT_MASK37 | + MCF_INTC_IMRH_INT_MASK38 | + MCF_INTC_IMRH_INT_MASK39 | + MCF_INTC_IMRH_INT_MASK40 | + MCF_INTC_IMRH_INT_MASK41 | + MCF_INTC_IMRH_INT_MASK42 | + MCF_INTC_IMRH_INT_MASK43 | + MCF_INTC_IMRH_INT_MASK44 | + MCF_INTC_IMRH_INT_MASK45 | + MCF_INTC_IMRH_INT_MASK46 | + MCF_INTC_IMRH_INT_MASK47 | + MCF_INTC_IMRH_INT_MASK48); /* Set up gpio outputs for MII lines */ MCF_GPIO_PAR_FECI2C |= (0 | - MCF_GPIO_PAR_FECI2C_PAR_MDC_EMDC | - MCF_GPIO_PAR_FECI2C_PAR_MDIO_EMDIO); + MCF_GPIO_PAR_FECI2C_PAR_MDC_EMDC | + MCF_GPIO_PAR_FECI2C_PAR_MDIO_EMDIO); MCF_GPIO_PAR_FEC = (0 | - MCF_GPIO_PAR_FEC_PAR_FEC_7W_FEC | - MCF_GPIO_PAR_FEC_PAR_FEC_MII_FEC); + MCF_GPIO_PAR_FEC_PAR_FEC_7W_FEC | + MCF_GPIO_PAR_FEC_PAR_FEC_MII_FEC); } -static void __inline__ fec_set_mii(struct net_device *dev, struct fec_enet_private *fep) +static void __init fec_set_mii(struct net_device *dev, + struct fec_enet_private *fep) { volatile fec_t *fecp; @@ -1741,7 +1745,7 @@ static void __inline__ fec_set_mii(struc fec_restart(dev, 0); } -static void __inline__ fec_get_mac(struct net_device *dev) +static void __init fec_get_mac(struct net_device *dev) { struct fec_enet_private *fep = netdev_priv(dev); volatile fec_t *fecp; @@ -1762,8 +1766,8 @@ static void __inline__ fec_get_mac(struc (iap[3] == 0xff) && (iap[4] == 0xff) && (iap[5] == 0xff)) iap = fec_mac_default; } else { - *((unsigned long *) &tmpaddr[0]) = fecp->fec_addr_low; - *((unsigned short *) &tmpaddr[4]) = (fecp->fec_addr_high >> 16); + *((unsigned long *)&tmpaddr[0]) = fecp->fec_addr_low; + *((unsigned short *)&tmpaddr[4]) = (fecp->fec_addr_high >> 16); iap = &tmpaddr[0]; } @@ -1771,143 +1775,109 @@ static void __inline__ fec_get_mac(struc /* Adjust MAC if using default MAC address */ if (iap == fec_mac_default) - dev->dev_addr[ETH_ALEN-1] = fec_mac_default[ETH_ALEN-1] + fep->index; -} - -static void __inline__ fec_enable_phy_intr(void) -{ + dev->dev_addr[ETH_ALEN - 1] = + fec_mac_default[ETH_ALEN - 1] + fep->index; } -static 
void __inline__ fec_disable_phy_intr(void) +static void fec_enable_phy_intr(void) { } -static void __inline__ fec_phy_ack_intr(void) +static void fec_disable_phy_intr(void) { } -static void __inline__ fec_localhw_setup(void) +static void fec_localhw_setup(void) { } /* * Do not need to make region uncached on 532x. */ -static void __inline__ fec_uncache(unsigned long addr) +static void __init fec_uncache(unsigned long addr) { } /* ------------------------------------------------------------------------- */ - #else /* * Code specific to the MPC860T setup. */ -static void __inline__ fec_request_intrs(struct net_device *dev) +static void __init fec_request_intrs(struct net_device *dev) { volatile immap_t *immap; - immap = (immap_t *)IMAP_ADDR; /* pointer to internal registers */ + immap = (immap_t *) IMAP_ADDR; /* pointer to internal registers */ - if (request_8xxirq(FEC_INTERRUPT, fec_enet_interrupt, 0, "fec", dev) != 0) + if (request_8xxirq(FEC_INTERRUPT, fec_enet_interrupt, 0, "fec", dev) != + 0) panic("Could not allocate FEC IRQ!"); - -#ifdef CONFIG_RPXCLASSIC - /* Make Port C, bit 15 an input that causes interrupts. - */ - immap->im_ioport.iop_pcpar &= ~0x0001; - immap->im_ioport.iop_pcdir &= ~0x0001; - immap->im_ioport.iop_pcso &= ~0x0001; - immap->im_ioport.iop_pcint |= 0x0001; - cpm_install_handler(CPMVEC_PIO_PC15, mii_link_interrupt, dev); - - /* Make LEDS reflect Link status. - */ - *((uint *) RPX_CSR_ADDR) &= ~BCSR2_FETHLEDMODE; -#endif -#ifdef CONFIG_FADS - if (request_8xxirq(SIU_IRQ2, mii_link_interrupt, 0, "mii", dev) != 0) - panic("Could not allocate MII IRQ!"); -#endif } -static void __inline__ fec_get_mac(struct net_device *dev) +static void __init fec_get_mac(struct net_device *dev) { bd_t *bd; - bd = (bd_t *)__res; + bd = (bd_t *) __res; memcpy(dev->dev_addr, bd->bi_enetaddr, ETH_ALEN); - -#ifdef CONFIG_RPXCLASSIC - /* The Embedded Planet boards have only one MAC address in - * the EEPROM, but can have two Ethernet ports. For the - * FEC port, we create another address by setting one of - * the address bits above something that would have (up to - * now) been allocated. - */ - dev->dev_adrd[3] |= 0x80; -#endif } -static void __inline__ fec_set_mii(struct net_device *dev, struct fec_enet_private *fep) +static void __init fec_set_mii(struct net_device *dev, + struct fec_enet_private *fep) { extern uint _get_IMMR(void); volatile immap_t *immap; volatile fec_t *fecp; fecp = fep->hwp; - immap = (immap_t *)IMAP_ADDR; /* pointer to internal registers */ + immap = (immap_t *) IMAP_ADDR; /* pointer to internal registers */ /* Configure all of port D for MII. - */ + */ immap->im_ioport.iop_pdpar = 0x1fff; /* Bits moved from Rev. D onward. - */ + */ if ((_get_IMMR() & 0xffff) < 0x0501) immap->im_ioport.iop_pddir = 0x1c58; /* Pre rev. D */ else immap->im_ioport.iop_pddir = 0x1fff; /* Rev. 
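
The phy_speed value computed in the fec_set_mii() variants is a clock divider: the MII management clock (MDC) must not exceed 2.5 MHz, so the bus frequency is divided by 2,500,000 and masked to an even 7-bit value as the speed field requires. A sketch of just the arithmetic (bi_busfreq is in MHz on this platform):

    static unsigned int fec_mii_divider(unsigned int busfreq_mhz)
    {
            /* keep MDC at or below 2.5 MHz; field must be even */
            return ((busfreq_mhz * 1000000) / 2500000) & 0x7e;
    }

    /* e.g. a 50 MHz bus gives 50000000 / 2500000 = 20 (0x14) */
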
D and later */ /* Set MII speed to 2.5 MHz - */ + */ fecp->fec_mii_speed = fep->phy_speed = - ((bd->bi_busfreq * 1000000) / 2500000) & 0x7e; + ((bd->bi_busfreq * 1000000) / 2500000) & 0x7e; } -static void __inline__ fec_enable_phy_intr(void) +static void fec_enable_phy_intr(void) { volatile fec_t *fecp; fecp = fep->hwp; /* Enable MII command finished interrupt - */ - fecp->fec_ivec = (FEC_INTERRUPT/2) << 29; -} - -static void __inline__ fec_disable_phy_intr(void) -{ + */ + fecp->fec_ivec = (FEC_INTERRUPT / 2) << 29; } -static void __inline__ fec_phy_ack_intr(void) +static void fec_disable_phy_intr(void) { } -static void __inline__ fec_localhw_setup(void) +static void fec_localhw_setup(void) { volatile fec_t *fecp; fecp = fep->hwp; fecp->fec_r_hash = PKT_MAXBUF_SIZE; /* Enable big endian and don't care about SDMA FC. - */ + */ fecp->fec_fun_code = 0x78000000; } -static void __inline__ fec_uncache(unsigned long addr) +static void __init fec_uncache(unsigned long addr) { pte_t *pte; pte = va_to_pte(mem_addr); @@ -1936,11 +1906,19 @@ static void mii_display_status(struct ne } else { printk("link up"); - switch(*s & PHY_STAT_SPMASK) { - case PHY_STAT_100FDX: printk(", 100MBit Full Duplex"); break; - case PHY_STAT_100HDX: printk(", 100MBit Half Duplex"); break; - case PHY_STAT_10FDX: printk(", 10MBit Full Duplex"); break; - case PHY_STAT_10HDX: printk(", 10MBit Half Duplex"); break; + switch (*s & PHY_STAT_SPMASK) { + case PHY_STAT_100FDX: + printk(", 100MBit Full Duplex"); + break; + case PHY_STAT_100HDX: + printk(", 100MBit Half Duplex"); + break; + case PHY_STAT_10FDX: + printk(", 10MBit Full Duplex"); + break; + case PHY_STAT_10HDX: + printk(", 10MBit Half Duplex"); + break; default: printk(", Unknown speed/duplex"); } @@ -1957,14 +1935,15 @@ static void mii_display_status(struct ne static void mii_display_config(struct work_struct *work) { - struct fec_enet_private *fep = container_of(work, struct fec_enet_private, phy_task); + struct fec_enet_private *fep = + container_of(work, struct fec_enet_private, phy_task); struct net_device *dev = fep->netdev; uint status = fep->phy_status; /* - ** When we get here, phy_task is already removed from - ** the workqueue. It is thus safe to allow to reuse it. - */ + ** When we get here, phy_task is already removed from + ** the workqueue. It is thus safe to allow to reuse it. + */ fep->mii_phy_task_queued = 0; printk("%s: config: auto-negotiation ", dev->name); @@ -1994,14 +1973,15 @@ static void mii_display_config(struct wo static void mii_relink(struct work_struct *work) { - struct fec_enet_private *fep = container_of(work, struct fec_enet_private, phy_task); + struct fec_enet_private *fep = + container_of(work, struct fec_enet_private, phy_task); struct net_device *dev = fep->netdev; int duplex; /* - ** When we get here, phy_task is already removed from - ** the workqueue. It is thus safe to allow to reuse it. - */ + ** When we get here, phy_task is already removed from + ** the workqueue. It is thus safe to allow to reuse it. + */ fep->mii_phy_task_queued = 0; fep->link = (fep->phy_status & PHY_STAT_LINK) ? 
1 : 0; mii_display_status(dev); @@ -2009,8 +1989,7 @@ static void mii_relink(struct work_struc if (fep->link) { duplex = 0; - if (fep->phy_status - & (PHY_STAT_100FDX | PHY_STAT_10FDX)) + if (fep->phy_status & (PHY_STAT_100FDX | PHY_STAT_10FDX)) duplex = 1; fec_restart(dev, duplex); } else @@ -2028,12 +2007,12 @@ static void mii_queue_relink(uint mii_re struct fec_enet_private *fep = netdev_priv(dev); /* - ** We cannot queue phy_task twice in the workqueue. It - ** would cause an endless loop in the workqueue. - ** Fortunately, if the last mii_relink entry has not yet been - ** executed now, it will do the job for the current interrupt, - ** which is just what we want. - */ + ** We cannot queue phy_task twice in the workqueue. It + ** would cause an endless loop in the workqueue. + ** Fortunately, if the last mii_relink entry has not yet been + ** executed now, it will do the job for the current interrupt, + ** which is just what we want. + */ if (fep->mii_phy_task_queued) return; @@ -2056,18 +2035,17 @@ static void mii_queue_config(uint mii_re } phy_cmd_t const phy_cmd_relink[] = { - { mk_mii_read(MII_REG_CR), mii_queue_relink }, - { mk_mii_end, } - }; + {mk_mii_read(MII_REG_CR), mii_queue_relink}, + {mk_mii_end,} +}; phy_cmd_t const phy_cmd_config[] = { - { mk_mii_read(MII_REG_CR), mii_queue_config }, - { mk_mii_end, } - }; + {mk_mii_read(MII_REG_CR), mii_queue_config}, + {mk_mii_end,} +}; /* Read remainder of PHY ID. */ -static void -mii_discover_phy3(uint mii_reg, struct net_device *dev) +static void mii_discover_phy3(uint mii_reg, struct net_device *dev) { struct fec_enet_private *fep; int i; @@ -2076,8 +2054,8 @@ mii_discover_phy3(uint mii_reg, struct n fep->phy_id |= (mii_reg & 0xffff); printk("fec: PHY @ 0x%x, ID 0x%08x", fep->phy_addr, fep->phy_id); - for(i = 0; phy_info[i]; i++) { - if(phy_info[i]->id == (fep->phy_id >> 4)) + for (i = 0; phy_info[i]; i++) { + if (phy_info[i]->id == (fep->phy_id >> 4)) break; } @@ -2093,8 +2071,7 @@ mii_discover_phy3(uint mii_reg, struct n /* Scan all of the MII PHY addresses looking for someone to respond * with a valid ID. This usually happens quickly. */ -static void -mii_discover_phy(uint mii_reg, struct net_device *dev) +static void mii_discover_phy(uint mii_reg, struct net_device *dev) { struct fec_enet_private *fep; volatile fec_t *fecp; @@ -2107,14 +2084,14 @@ mii_discover_phy(uint mii_reg, struct ne if ((phytype = (mii_reg & 0xffff)) != 0xffff && phytype != 0) { /* Got first part of ID, now get remainder. - */ + */ fep->phy_id = phytype << 16; mii_queue(dev, mk_mii_read(MII_REG_PHYIR2), - mii_discover_phy3); + mii_discover_phy3); } else { fep->phy_addr++; mii_queue(dev, mk_mii_read(MII_REG_PHYIR1), - mii_discover_phy); + mii_discover_phy); } } else { printk("FEC: No PHY device found.\n"); @@ -2124,33 +2101,23 @@ mii_discover_phy(uint mii_reg, struct ne } } -/* This interrupt occurs when the PHY detects a link change. -*/ -#ifdef CONFIG_RPXCLASSIC -static void -mii_link_interrupt(void *dev_id) -#else -static irqreturn_t -mii_link_interrupt(int irq, void * dev_id) -#endif +/* Set a MAC change in hardware. 
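
The helper added below packs the six address bytes, most significant byte of the wire format first, into the controller's two station-address registers; only the upper 16 bits of fec_addr_high are used. The same packing pulled out in isolation (function name illustrative):

    static void fec_pack_station_addr(volatile fec_t *fecp,
                                      const unsigned char *a)
    {
            /* a[0] lands in the top byte of fec_addr_low */
            fecp->fec_addr_low = a[3] | (a[2] << 8) |
                                 (a[1] << 16) | (a[0] << 24);
            /* low 16 bits of fec_addr_high are unused */
            fecp->fec_addr_high = (a[5] << 16) | (a[4] << 24);
    }
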
+ */ +static void fec_set_mac_address(struct net_device *dev) { - struct net_device *dev = dev_id; - struct fec_enet_private *fep = netdev_priv(dev); - - fec_phy_ack_intr(); + volatile fec_t *fecp; -#if 0 - disable_irq(fep->mii_irq); /* disable now, enable later */ -#endif + fecp = ((struct fec_enet_private *)netdev_priv(dev))->hwp; - mii_do_cmd(dev, fep->phy->ack_int); - mii_do_cmd(dev, phy_cmd_relink); /* restart and display status */ + /* Set station address. */ + fecp->fec_addr_low = dev->dev_addr[3] | (dev->dev_addr[2] << 8) | + (dev->dev_addr[1] << 16) | (dev->dev_addr[0] << 24); + fecp->fec_addr_high = (dev->dev_addr[5] << 16) | + (dev->dev_addr[4] << 24); - return IRQ_HANDLED; } -static int -fec_enet_open(struct net_device *dev) +static int fec_enet_open(struct net_device *dev) { struct fec_enet_private *fep = netdev_priv(dev); @@ -2165,7 +2132,7 @@ fec_enet_open(struct net_device *dev) if (fep->phy) { mii_do_cmd(dev, fep->phy->ack_int); mii_do_cmd(dev, fep->phy->config); - mii_do_cmd(dev, phy_cmd_config); /* display configuration */ + mii_do_cmd(dev, phy_cmd_config); /* display configuration */ /* Poll until the PHY tells us its configuration * (not link state). @@ -2174,7 +2141,7 @@ fec_enet_open(struct net_device *dev) * This should take about 25 usec per register at 2.5 MHz, * and we read approximately 5 registers. */ - while(!fep->sequence_done) + while (!fep->sequence_done) schedule(); mii_do_cmd(dev, fep->phy->startup); @@ -2185,7 +2152,7 @@ fec_enet_open(struct net_device *dev) */ fep->link = 1; } else { - fep->link = 1; /* lets just try it and see */ + fep->link = 1; /* lets just try it and see */ /* no phy, go full duplex, it's most likely a hub chip */ fec_restart(dev, 1); } @@ -2195,13 +2162,12 @@ fec_enet_open(struct net_device *dev) return 0; /* Success */ } -static int -fec_enet_close(struct net_device *dev) +static int fec_enet_close(struct net_device *dev) { struct fec_enet_private *fep = netdev_priv(dev); /* Don't know what to do yet. - */ + */ fep->opened = 0; netif_stop_queue(dev); fec_stop(dev); @@ -2219,7 +2185,7 @@ fec_enet_close(struct net_device *dev) * this kind of feature?). */ -#define HASH_BITS 6 /* #bits in hash */ +#define HASH_BITS 6 /* #bits in hash */ #define CRC32_POLY 0xEDB88320 static void set_multicast_list(struct net_device *dev) @@ -2233,76 +2199,61 @@ static void set_multicast_list(struct ne fep = netdev_priv(dev); ep = fep->hwp; - if (dev->flags&IFF_PROMISC) { + if (dev->flags & IFF_PROMISC) { ep->fec_r_cntrl |= 0x0008; - } else { + return ; + } - ep->fec_r_cntrl &= ~0x0008; + ep->fec_r_cntrl &= ~0x0008; - if (dev->flags & IFF_ALLMULTI) { - /* Catch all multicast addresses, so set the - * filter to all 1's. - */ - ep->fec_hash_table_high = 0xffffffff; - ep->fec_hash_table_low = 0xffffffff; - } else { - /* Clear filter and add the addresses in hash register. - */ - ep->fec_hash_table_high = 0; - ep->fec_hash_table_low = 0; - - dmi = dev->mc_list; - - for (j = 0; j < dev->mc_count; j++, dmi = dmi->next) - { - /* Only support group multicast for now. - */ - if (!(dmi->dmi_addr[0] & 1)) - continue; - - /* calculate crc32 value of mac address - */ - crc = 0xffffffff; - - for (i = 0; i < dmi->dmi_addrlen; i++) - { - data = dmi->dmi_addr[i]; - for (bit = 0; bit < 8; bit++, data >>= 1) - { - crc = (crc >> 1) ^ - (((crc ^ data) & 1) ? 
CRC32_POLY : 0); - } - } - - /* only upper 6 bits (HASH_BITS) are used - which point to specific bit in he hash registers - */ - hash = (crc >> (32 - HASH_BITS)) & 0x3f; - - if (hash > 31) - ep->fec_hash_table_high |= 1 << (hash - 32); - else - ep->fec_hash_table_low |= 1 << hash; - } - } + if (dev->flags & IFF_ALLMULTI) { + /* Catch all multicast addresses, so set the + * filter to all 1's. + */ + ep->fec_hash_table_high = 0xffffffff; + ep->fec_hash_table_low = 0xffffffff; + return; } -} + /* + * Clear filter and add the addresses in hash register. + */ + ep->fec_hash_table_high = 0; + ep->fec_hash_table_low = 0; -/* Set a MAC change in hardware. - */ -static void -fec_set_mac_address(struct net_device *dev) -{ - volatile fec_t *fecp; + dmi = dev->mc_list; - fecp = ((struct fec_enet_private *)netdev_priv(dev))->hwp; + for (j = 0; j < dev->mc_count; j++, dmi = dmi->next) { + /* Only support group multicast for now. + */ + if (!(dmi->dmi_addr[0] & 1)) + continue; - /* Set station address. */ - fecp->fec_addr_low = dev->dev_addr[3] | (dev->dev_addr[2] << 8) | - (dev->dev_addr[1] << 16) | (dev->dev_addr[0] << 24); - fecp->fec_addr_high = (dev->dev_addr[5] << 16) | - (dev->dev_addr[4] << 24); + /* calculate crc32 value of mac address + */ + crc = 0xffffffff; + + for (i = 0; i < dmi->dmi_addrlen; i++) { + data = dmi->dmi_addr[i]; + for (bit = 0; bit < 8; + bit++, data >>= 1) { + crc = + (crc >> 1) ^ + (((crc ^ data) & 1) ? + CRC32_POLY : 0); + } + } + /* only upper 6 bits (HASH_BITS) are used + which point to specific bit in the hash registers + */ + hash = (crc >> (32 - HASH_BITS)) & 0x3f; + + if (hash > 31) + ep->fec_hash_table_high |= + 1 << (hash - 32); + else + ep->fec_hash_table_low |= 1 << hash; + } } /* Initialize the FEC Ethernet on 860T (or ColdFire 5272). @@ -2310,38 +2261,40 @@ fec_set_mac_address(struct net_device *d /* * XXX: We need to clean up on failure exits here. */ +static int index; int __init fec_enet_init(struct net_device *dev) { struct fec_enet_private *fep = netdev_priv(dev); - unsigned long mem_addr; - volatile cbd_t *bdp; - cbd_t *cbd_base; - volatile fec_t *fecp; - int i, j; - static int index = 0; + unsigned long mem_addr; + volatile cbd_t *bdp; + cbd_t *cbd_base; + volatile fec_t *fecp; + int i, j; /* Only allow us to be probed once. */ if (index >= FEC_MAX_PORTS) return -ENXIO; /* Allocate memory for buffer descriptors. - */ + */ mem_addr = __get_free_page(GFP_KERNEL); if (mem_addr == 0) { printk("FEC: allocate descriptor memory failed?\n"); return -ENOMEM; } + spin_lock_init(&fep->hw_lock); + spin_lock_init(&fep->mii_lock); /* Create an Ethernet device instance. - */ - fecp = (volatile fec_t *) fec_hw[index]; + */ + fecp = (volatile fec_t *)fec_hw[index]; fep->index = index; fep->hwp = fecp; fep->netdev = dev; /* Whack a reset. We should wait for this. - */ + */ fecp->fec_ecntrl = 1; udelay(10); @@ -2353,13 +2306,12 @@ int __init fec_enet_init(struct net_devi */ fec_get_mac(dev); - cbd_base = (cbd_t *)mem_addr; - /* XXX: missing check for allocation failure */ + cbd_base = (cbd_t *) mem_addr; fec_uncache(mem_addr); /* Set receive and transmit descriptor base. - */ + */ fep->rx_bd_base = cbd_base; fep->tx_bd_base = cbd_base + RX_RING_SIZE; @@ -2369,20 +2321,20 @@ int __init fec_enet_init(struct net_devi fep->skb_cur = fep->skb_dirty = 0; /* Initialize the receive buffer descriptors.
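
The reworked multicast filter above computes a bitwise little-endian CRC-32 of each group address and uses the top HASH_BITS (6) bits to select one of the 64 hardware hash-filter bits. Pulled out into a standalone helper for clarity (the helper name is illustrative; the logic mirrors the loop above):

    #define CRC32_POLY 0xEDB88320
    #define HASH_BITS  6

    static unsigned int fec_mcast_hash(const unsigned char *addr)
    {
            unsigned int crc = 0xffffffff;
            int i, bit;

            for (i = 0; i < 6; i++) {
                    unsigned char data = addr[i];

                    for (bit = 0; bit < 8; bit++, data >>= 1)
                            crc = (crc >> 1) ^
                                  (((crc ^ data) & 1) ? CRC32_POLY : 0);
            }
            /* upper HASH_BITS bits index the 64-bit filter */
            return (crc >> (32 - HASH_BITS)) & 0x3f;
    }

Hash values above 31 land in fec_hash_table_high (bit hash - 32), the rest in fec_hash_table_low, exactly as set_multicast_list() does above.
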
- */ + */ bdp = fep->rx_bd_base; - for (i=0; icbd_sc = BD_ENET_RX_EMPTY; bdp->cbd_bufaddr = __pa(mem_addr); mem_addr += FEC_ENET_RX_FRSIZE; @@ -2391,43 +2343,44 @@ int __init fec_enet_init(struct net_devi } /* Set the last buffer to wrap. - */ + */ bdp--; bdp->cbd_sc |= BD_SC_WRAP; /* ...and the same for transmmit. - */ + */ bdp = fep->tx_bd_base; - for (i=0, j=FEC_ENET_TX_FRPPG; i= FEC_ENET_TX_FRPPG) { + /* XXX: missing check for allocation failure */ mem_addr = __get_free_page(GFP_KERNEL); j = 1; } else { mem_addr += FEC_ENET_TX_FRSIZE; j++; } - fep->tx_bounce[i] = (unsigned char *) mem_addr; + fep->tx_bounce[i] = (unsigned char *)mem_addr; /* Initialize the BD for every fragment in the page. - */ + */ bdp->cbd_sc = 0; bdp->cbd_bufaddr = 0; bdp++; } /* Set the last buffer to wrap. - */ + */ bdp--; bdp->cbd_sc |= BD_SC_WRAP; /* Set receive and transmit descriptor base. - */ - fecp->fec_r_des_start = __pa((uint)(fep->rx_bd_base)); - fecp->fec_x_des_start = __pa((uint)(fep->tx_bd_base)); + */ + fecp->fec_r_des_start = __pa((uint) (fep->rx_bd_base)); + fecp->fec_x_des_start = __pa((uint) (fep->tx_bd_base)); /* Install our interrupt handlers. This varies depending on * the architecture. - */ + */ fec_request_intrs(dev); fecp->fec_hash_table_high = 0; @@ -2446,8 +2399,8 @@ int __init fec_enet_init(struct net_devi dev->stop = fec_enet_close; dev->set_multicast_list = set_multicast_list; - for (i=0; ifec_ievent = 0xffc00000; - fecp->fec_imask = (FEC_ENET_TXF | FEC_ENET_TXB | - FEC_ENET_RXF | FEC_ENET_RXB | FEC_ENET_MII); + fecp->fec_imask = (FEC_ENET_TXF | FEC_ENET_RXF | FEC_ENET_MII); /* Queue up command to detect the PHY and initialize the * remainder of the interface. @@ -2473,8 +2425,7 @@ int __init fec_enet_init(struct net_devi * change. This only happens when switching between half and full * duplex. */ -static void -fec_restart(struct net_device *dev, int duplex) +static void fec_restart(struct net_device *dev, int duplex) { struct fec_enet_private *fep; volatile cbd_t *bdp; @@ -2485,42 +2436,42 @@ fec_restart(struct net_device *dev, int fecp = fep->hwp; /* Whack a reset. We should wait for this. - */ + */ fecp->fec_ecntrl = 1; udelay(10); /* Clear any outstanding interrupt. - */ + */ fecp->fec_ievent = 0xffc00000; fec_enable_phy_intr(); /* Set station address. - */ + */ fec_set_mac_address(dev); /* Reset all multicast. - */ + */ fecp->fec_hash_table_high = 0; fecp->fec_hash_table_low = 0; /* Set maximum receive buffer size. - */ + */ fecp->fec_r_buff_size = PKT_MAXBLR_SIZE; fec_localhw_setup(); /* Set receive and transmit descriptor base. - */ - fecp->fec_r_des_start = __pa((uint)(fep->rx_bd_base)); - fecp->fec_x_des_start = __pa((uint)(fep->tx_bd_base)); + */ + fecp->fec_r_des_start = __pa((uint) (fep->rx_bd_base)); + fecp->fec_x_des_start = __pa((uint) (fep->tx_bd_base)); fep->dirty_tx = fep->cur_tx = fep->tx_bd_base; fep->cur_rx = fep->rx_bd_base; /* Reset SKB transmit buffers. - */ + */ fep->skb_cur = fep->skb_dirty = 0; - for (i=0; i<=TX_RING_MOD_MASK; i++) { + for (i = 0; i <= TX_RING_MOD_MASK; i++) { if (fep->tx_skbuff[i] != NULL) { dev_kfree_skb_any(fep->tx_skbuff[i]); fep->tx_skbuff[i] = NULL; @@ -2528,43 +2479,43 @@ fec_restart(struct net_device *dev, int } /* Initialize the receive buffer descriptors. - */ + */ bdp = fep->rx_bd_base; - for (i=0; icbd_sc = BD_ENET_RX_EMPTY; bdp++; } /* Set the last buffer to wrap. - */ + */ bdp--; bdp->cbd_sc |= BD_SC_WRAP; /* ...and the same for transmmit. 
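
Both descriptor rings here follow the FEC convention that a ring is a flat array whose last entry carries BD_SC_WRAP, telling the controller to cycle back to the base address. The same initialization pattern as a standalone sketch (helper name illustrative):

    static void fec_init_bd_ring(volatile cbd_t *base, int nr)
    {
            volatile cbd_t *bdp = base;
            int i;

            for (i = 0; i < nr; i++) {
                    bdp->cbd_sc = 0;
                    bdp->cbd_bufaddr = 0;
                    bdp++;
            }

            /* mark the final descriptor so the controller wraps */
            bdp--;
            bdp->cbd_sc |= BD_SC_WRAP;
    }
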
- */ + */ bdp = fep->tx_bd_base; - for (i=0; icbd_sc = 0; bdp->cbd_bufaddr = 0; bdp++; } /* Set the last buffer to wrap. - */ + */ bdp--; bdp->cbd_sc |= BD_SC_WRAP; /* Enable MII mode. - */ + */ if (duplex) { - fecp->fec_r_cntrl = OPT_FRAME_SIZE | 0x04;/* MII enable */ - fecp->fec_x_cntrl = 0x04; /* FD enable */ + fecp->fec_r_cntrl = OPT_FRAME_SIZE | 0x04; /* MII enable */ + fecp->fec_x_cntrl = 0x04; /* FD enable */ } else { /* MII enable|No Rcv on Xmit */ fecp->fec_r_cntrl = OPT_FRAME_SIZE | 0x06; @@ -2573,22 +2524,20 @@ fec_restart(struct net_device *dev, int fep->full_duplex = duplex; /* Set MII speed. - */ + */ fecp->fec_mii_speed = fep->phy_speed; /* And last, enable the transmit and receive processing. - */ + */ fecp->fec_ecntrl = 2; fecp->fec_r_des_active = 0; /* Enable interrupts we wish to service. - */ - fecp->fec_imask = (FEC_ENET_TXF | FEC_ENET_TXB | - FEC_ENET_RXF | FEC_ENET_RXB | FEC_ENET_MII); + */ + fecp->fec_imask = (FEC_ENET_TXF | FEC_ENET_RXF | FEC_ENET_MII); } -static void -fec_stop(struct net_device *dev) +static void fec_stop(struct net_device *dev) { volatile fec_t *fecp; struct fec_enet_private *fep; @@ -2597,23 +2546,23 @@ fec_stop(struct net_device *dev) fecp = fep->hwp; /* - ** We cannot expect a graceful transmit stop without link !!! - */ - if (fep->link) - { + ** We cannot expect a graceful transmit stop without link !!! + */ + if (fep->link) { fecp->fec_x_cntrl = 0x01; /* Graceful transmit stop */ udelay(10); if (!(fecp->fec_ievent & FEC_ENET_GRA)) - printk("fec_stop : Graceful transmit stop did not complete !\n"); - } + printk + ("fec_stop : Graceful transmit stop did not complete !\n"); + } /* Whack a reset. We should wait for this. - */ + */ fecp->fec_ecntrl = 1; udelay(10); /* Clear outstanding MII command interrupts. 
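
fec_stop() above only attempts a graceful stop when the link is up, because the GRA completion event never arrives without one; the core of the sequence is just:

    /* request a graceful transmit stop, then poll for completion */
    fecp->fec_x_cntrl = 0x01;       /* GTS bit */
    udelay(10);
    if (!(fecp->fec_ievent & FEC_ENET_GRA))
            printk("fec_stop : Graceful transmit stop did not complete !\n");
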
- */ + */ fecp->fec_ievent = FEC_ENET_MII; fec_enable_phy_intr(); @@ -2624,7 +2573,7 @@ fec_stop(struct net_device *dev) static int __init fec_enet_module_init(void) { struct net_device *dev; - int i, j, err; + int i, err; DECLARE_MAC_BUF(mac); printk("FEC ENET Version 0.2\n"); @@ -2651,5 +2600,4 @@ static int __init fec_enet_module_init(v } module_init(fec_enet_module_init); - MODULE_LICENSE("GPL"); Index: linux-2.6.24.7-rt27/drivers/serial/68328serial.c =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/serial/68328serial.c 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/serial/68328serial.c 2009-02-08 00:00:48.000000000 -0500 @@ -1410,7 +1410,7 @@ rs68328_init(void) if (request_irq(uart_irqs[i], rs_interrupt, - IRQ_FLG_STD, + IRQF_DISABLED, "M68328_UART", NULL)) panic("Unable to attach 68328 serial interrupt\n"); } Index: linux-2.6.24.7-rt27/drivers/serial/mcf.c =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/serial/mcf.c 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/serial/mcf.c 2009-02-08 00:00:48.000000000 -0500 @@ -69,7 +69,7 @@ static unsigned int mcf_tx_empty(struct static unsigned int mcf_get_mctrl(struct uart_port *port) { - struct mcf_uart *pp = (struct mcf_uart *) port; + struct mcf_uart *pp = container_of(port, struct mcf_uart, port); unsigned long flags; unsigned int sigs; @@ -87,7 +87,7 @@ static unsigned int mcf_get_mctrl(struct static void mcf_set_mctrl(struct uart_port *port, unsigned int sigs) { - struct mcf_uart *pp = (struct mcf_uart *) port; + struct mcf_uart *pp = container_of(port, struct mcf_uart, port); unsigned long flags; spin_lock_irqsave(&port->lock, flags); @@ -104,7 +104,7 @@ static void mcf_set_mctrl(struct uart_po static void mcf_start_tx(struct uart_port *port) { - struct mcf_uart *pp = (struct mcf_uart *) port; + struct mcf_uart *pp = container_of(port, struct mcf_uart, port); unsigned long flags; spin_lock_irqsave(&port->lock, flags); @@ -117,7 +117,7 @@ static void mcf_start_tx(struct uart_por static void mcf_stop_tx(struct uart_port *port) { - struct mcf_uart *pp = (struct mcf_uart *) port; + struct mcf_uart *pp = container_of(port, struct mcf_uart, port); unsigned long flags; spin_lock_irqsave(&port->lock, flags); @@ -130,7 +130,7 @@ static void mcf_stop_tx(struct uart_port static void mcf_stop_rx(struct uart_port *port) { - struct mcf_uart *pp = (struct mcf_uart *) port; + struct mcf_uart *pp = container_of(port, struct mcf_uart, port); unsigned long flags; spin_lock_irqsave(&port->lock, flags); @@ -163,7 +163,7 @@ static void mcf_enable_ms(struct uart_po static int mcf_startup(struct uart_port *port) { - struct mcf_uart *pp = (struct mcf_uart *) port; + struct mcf_uart *pp = container_of(port, struct mcf_uart, port); unsigned long flags; spin_lock_irqsave(&port->lock, flags); @@ -189,7 +189,7 @@ static int mcf_startup(struct uart_port static void mcf_shutdown(struct uart_port *port) { - struct mcf_uart *pp = (struct mcf_uart *) port; + struct mcf_uart *pp = container_of(port, struct mcf_uart, port); unsigned long flags; spin_lock_irqsave(&port->lock, flags); @@ -273,7 +273,7 @@ static void mcf_set_termios(struct uart_ static void mcf_rx_chars(struct mcf_uart *pp) { - struct uart_port *port = (struct uart_port *) pp; + struct uart_port *port = &pp->port; unsigned char status, ch, flag; while ((status = readb(port->membase + MCFUART_USR)) & MCFUART_USR_RXREADY) { @@ -319,7 +319,7 @@ static void 
mcf_rx_chars(struct mcf_uart static void mcf_tx_chars(struct mcf_uart *pp) { - struct uart_port *port = (struct uart_port *) pp; + struct uart_port *port = &pp->port; struct circ_buf *xmit = &port->info->xmit; if (port->x_char) { @@ -352,7 +352,7 @@ static void mcf_tx_chars(struct mcf_uart static irqreturn_t mcf_interrupt(int irq, void *data) { struct uart_port *port = data; - struct mcf_uart *pp = (struct mcf_uart *) port; + struct mcf_uart *pp = container_of(port, struct mcf_uart, port); unsigned int isr; isr = readb(port->membase + MCFUART_UISR) & pp->imr; @@ -434,7 +434,7 @@ static struct uart_ops mcf_uart_ops = { static struct mcf_uart mcf_ports[3]; -#define MCF_MAXPORTS (sizeof(mcf_ports) / sizeof(struct mcf_uart)) +#define MCF_MAXPORTS ARRAY_SIZE(mcf_ports) /****************************************************************************/ #if defined(CONFIG_SERIAL_MCF_CONSOLE) Index: linux-2.6.24.7-rt27/drivers/serial/mcfserial.c =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/serial/mcfserial.c 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/serial/mcfserial.c 2009-02-08 00:00:48.000000000 -0500 @@ -65,7 +65,8 @@ struct timer_list mcfrs_timer_struct; #define CONSOLE_BAUD_RATE 115200 #define DEFAULT_CBAUD B115200 #elif defined(CONFIG_ARNEWSH) || defined(CONFIG_FREESCALE) || \ - defined(CONFIG_senTec) || defined(CONFIG_SNEHA) || defined(CONFIG_AVNET) + defined(CONFIG_senTec) || defined(CONFIG_SNEHA) || defined(CONFIG_AVNET) || \ + defined(CONFIG_SAVANT) #define CONSOLE_BAUD_RATE 19200 #define DEFAULT_CBAUD B19200 #endif @@ -324,7 +325,7 @@ static void mcfrs_start(struct tty_struc * ----------------------------------------------------------------------- */ -static inline void receive_chars(struct mcf_serial *info) +static noinline void receive_chars(struct mcf_serial *info) { volatile unsigned char *uartp; struct tty_struct *tty = info->tty; @@ -369,7 +370,7 @@ static inline void receive_chars(struct return; } -static inline void transmit_chars(struct mcf_serial *info) +static noinline void transmit_chars(struct mcf_serial *info) { volatile unsigned char *uartp; @@ -1489,14 +1490,28 @@ int mcfrs_open(struct tty_struct *tty, s /* * Based on the line number set up the internal interrupt stuff. */ -static void mcfrs_irqinit(struct mcf_serial *info) +static int mcfrs_irqinit(struct mcf_serial *info) { + volatile unsigned char *uartp; + int ret; + + uartp = info->addr; + /* Clear mask, so no surprise interrupts. 
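
The reordering in mcfrs_irqinit() below is the key fix: the UART's interrupt mask register is cleared before request_irq() so the handler cannot fire on a stale event, and an allocation failure is now propagated to the caller instead of being ignored. The resulting shape, condensed from the patch:

    static int mcfrs_irq_attach(struct mcf_serial *info)
    {
            volatile unsigned char *uartp = info->addr;
            int ret;

            /* silence the UART before the handler can run */
            uartp[MCFUART_UIMR] = 0;

            ret = request_irq(info->irq, mcfrs_interrupt, IRQF_DISABLED,
                              "ColdFire UART", NULL);
            if (ret)
                    printk("MCFRS: Unable to attach ColdFire UART %d "
                           "interrupt vector=%d, error: %d\n",
                           info->line, info->irq, ret);
            return ret;
    }
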
*/ + uartp[MCFUART_UIMR] = 0; + + ret = request_irq(info->irq, mcfrs_interrupt, IRQF_DISABLED, + "ColdFire UART", NULL); + if (ret) { + printk("MCFRS: Unable to attach ColdFire UART %d interrupt " + "vector=%d, error: %d\n", info->line, + info->irq, ret); + return ret; + } + #if defined(CONFIG_M5272) volatile unsigned long *icrp; volatile unsigned long *portp; - volatile unsigned char *uartp; - uartp = info->addr; icrp = (volatile unsigned long *) (MCF_MBAR + MCFSIM_ICR2); switch (info->line) { @@ -1518,11 +1533,10 @@ static void mcfrs_irqinit(struct mcf_ser portp = (volatile unsigned long *) (MCF_MBAR + MCFSIM_PDCNT); *portp = (*portp & ~0x000003fc) | 0x000002a8; #elif defined(CONFIG_M523x) || defined(CONFIG_M527x) || defined(CONFIG_M528x) - volatile unsigned char *icrp, *uartp; +#if !defined(CONFIG_M523x) + volatile unsigned char *icrp; volatile unsigned long *imrp; - uartp = info->addr; - icrp = (volatile unsigned char *) (MCF_MBAR + MCFICM_INTC0 + MCFINTC_ICR0 + MCFINT_UART0 + info->line); *icrp = 0x30 + info->line; /* level 6, line based priority */ @@ -1530,6 +1544,14 @@ static void mcfrs_irqinit(struct mcf_ser imrp = (volatile unsigned long *) (MCF_MBAR + MCFICM_INTC0 + MCFINTC_IMRL); *imrp &= ~((1 << (info->irq - MCFINT_VECBASE)) | 1); +#endif +#if defined(CONFIG_M523x) + { + volatile unsigned short *par_uartp; + par_uartp = (volatile unsigned short *) (MCF_MBAR + MCF523x_GPIO_PAR_UART); + *par_uartp = 0x3FFF; /* setup GPIO for UART0, UART1 & UART2 */ + } +#endif #if defined(CONFIG_M527x) { /* @@ -1554,37 +1576,38 @@ static void mcfrs_irqinit(struct mcf_ser } #endif #elif defined(CONFIG_M520x) - volatile unsigned char *icrp, *uartp; - volatile unsigned long *imrp; - - uartp = info->addr; - - icrp = (volatile unsigned char *) (MCF_MBAR + MCFICM_INTC0 + - MCFINTC_ICR0 + MCFINT_UART0 + info->line); - *icrp = 0x03; + { + volatile unsigned char *icrp; + volatile unsigned long *imrp; - imrp = (volatile unsigned long *) (MCF_MBAR + MCFICM_INTC0 + - MCFINTC_IMRL); - *imrp &= ~((1 << (info->irq - MCFINT_VECBASE)) | 1); - if (info->line < 2) { - unsigned short *uart_par; - uart_par = (unsigned short *)(MCF_IPSBAR + MCF_GPIO_PAR_UART); - if (info->line == 0) - *uart_par |= MCF_GPIO_PAR_UART_PAR_UTXD0 - | MCF_GPIO_PAR_UART_PAR_URXD0; - else if (info->line == 1) - *uart_par |= MCF_GPIO_PAR_UART_PAR_UTXD1 - | MCF_GPIO_PAR_UART_PAR_URXD1; + icrp = (volatile unsigned char *) (MCF_MBAR + MCFICM_INTC0 + + MCFINTC_ICR0 + MCFINT_UART0 + info->line); + *icrp = 0x03; + + imrp = (volatile unsigned long *) (MCF_MBAR + MCFICM_INTC0 + + MCFINTC_IMRL); + *imrp &= ~((1 << (info->irq - MCFINT_VECBASE)) | 1); + if (info->line < 2) { + unsigned short *uart_par; + uart_par = (unsigned short *)(MCF_IPSBAR + + MCF_GPIO_PAR_UART); + if (info->line == 0) + *uart_par |= MCF_GPIO_PAR_UART_PAR_UTXD0 + | MCF_GPIO_PAR_UART_PAR_URXD0; + else if (info->line == 1) + *uart_par |= MCF_GPIO_PAR_UART_PAR_UTXD1 + | MCF_GPIO_PAR_UART_PAR_URXD1; } else if (info->line == 2) { unsigned char *feci2c_par; - feci2c_par = (unsigned char *)(MCF_IPSBAR + MCF_GPIO_PAR_FECI2C); + feci2c_par = (unsigned char *)(MCF_IPSBAR + + MCF_GPIO_PAR_FECI2C); *feci2c_par &= ~0x0F; *feci2c_par |= MCF_GPIO_PAR_FECI2C_PAR_SCL_UTXD2 - | MCF_GPIO_PAR_FECI2C_PAR_SDA_URXD2; + | MCF_GPIO_PAR_FECI2C_PAR_SDA_URXD2; } + } #elif defined(CONFIG_M532x) - volatile unsigned char *uartp; - uartp = info->addr; + switch (info->line) { case 0: MCF_INTC0_ICR26 = 0x3; @@ -1605,7 +1628,6 @@ static void mcfrs_irqinit(struct mcf_ser break; } #else - volatile unsigned char *icrp, 
*uartp; switch (info->line) { case 0: @@ -1623,23 +1645,12 @@ static void mcfrs_irqinit(struct mcf_ser default: printk("MCFRS: don't know how to handle UART %d interrupt?\n", info->line); - return; + return -ENODEV; } - uartp = info->addr; uartp[MCFUART_UIVR] = info->irq; #endif - - /* Clear mask, so no surprise interrupts. */ - uartp[MCFUART_UIMR] = 0; - - if (request_irq(info->irq, mcfrs_interrupt, IRQF_DISABLED, - "ColdFire UART", NULL)) { - printk("MCFRS: Unable to attach ColdFire UART %d interrupt " - "vector=%d\n", info->line, info->irq); - } - - return; + return 0; } @@ -1729,7 +1740,6 @@ static int __init mcfrs_init(void) { struct mcf_serial *info; - unsigned long flags; int i; /* Setup base handler, and timer table. */ @@ -1769,12 +1779,12 @@ mcfrs_init(void) return(-EBUSY); } - local_irq_save(flags); - /* * Configure all the attached serial ports. */ for (i = 0, info = mcfrs_table; (i < NR_PORTS); i++, info++) { + int ret; + info->magic = SERIAL_MAGIC; info->line = i; info->tty = 0; @@ -1792,14 +1802,11 @@ mcfrs_init(void) info->imr = 0; mcfrs_setsignals(info, 0, 0); - mcfrs_irqinit(info); - - printk("ttyS%d at 0x%04x (irq = %d)", info->line, - (unsigned int) info->addr, info->irq); - printk(" is a builtin ColdFire UART\n"); + ret = mcfrs_irqinit(info); + if (!ret) + printk("ttyS%d at 0x%p (irq = %d) is a builtin " + "ColdFire UART\n", info->line, info->addr, info->irq); } - - local_irq_restore(flags); return 0; } Index: linux-2.6.24.7-rt27/fs/nfs/file.c =================================================================== --- linux-2.6.24.7-rt27.orig/fs/nfs/file.c 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/fs/nfs/file.c 2009-02-08 00:00:48.000000000 -0500 @@ -64,7 +64,11 @@ const struct file_operations nfs_file_op .write = do_sync_write, .aio_read = nfs_file_read, .aio_write = nfs_file_write, +#ifdef CONFIG_MMU .mmap = nfs_file_mmap, +#else + .mmap = generic_file_mmap, +#endif .open = nfs_file_open, .flush = nfs_file_flush, .release = nfs_file_release, Index: linux-2.6.24.7-rt27/include/asm-generic/vmlinux.lds.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-generic/vmlinux.lds.h 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-generic/vmlinux.lds.h 2009-02-08 00:05:15.000000000 -0500 @@ -6,9 +6,21 @@ #define VMLINUX_SYMBOL(_sym_) _sym_ #endif +#ifndef OUTPUT_DATA_SECTION +#define OUTPUT_DATA_SECTION +#endif + /* Align . to a 8 byte boundary equals to maximum function alignment. */ #define ALIGN_FUNCTION() . 
= ALIGN(8) +#ifdef CONFIG_FTRACE_MCOUNT_RECORD +#define MCOUNT_REC() VMLINUX_SYMBOL(__start_mcount_loc) = .; \ + *(__mcount_loc) \ + VMLINUX_SYMBOL(__stop_mcount_loc) = .; +#else +#define MCOUNT_REC() +#endif + /* .data section */ #define DATA_DATA \ *(.data) \ @@ -25,11 +37,11 @@ *(.rodata) *(.rodata.*) \ *(__vermagic) /* Kernel version magic */ \ *(__markers_strings) /* Markers: strings */ \ - } \ + } OUTPUT_DATA_SECTION \ \ .rodata1 : AT(ADDR(.rodata1) - LOAD_OFFSET) { \ *(.rodata1) \ - } \ + } OUTPUT_DATA_SECTION \ \ /* PCI quirks */ \ .pci_fixup : AT(ADDR(.pci_fixup) - LOAD_OFFSET) { \ @@ -48,89 +60,89 @@ VMLINUX_SYMBOL(__start_pci_fixups_resume) = .; \ *(.pci_fixup_resume) \ VMLINUX_SYMBOL(__end_pci_fixups_resume) = .; \ - } \ + } OUTPUT_DATA_SECTION \ \ /* RapidIO route ops */ \ .rio_route : AT(ADDR(.rio_route) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_rio_route_ops) = .; \ *(.rio_route_ops) \ VMLINUX_SYMBOL(__end_rio_route_ops) = .; \ - } \ + } OUTPUT_DATA_SECTION \ \ /* Kernel symbol table: Normal symbols */ \ __ksymtab : AT(ADDR(__ksymtab) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab) = .; \ *(__ksymtab) \ VMLINUX_SYMBOL(__stop___ksymtab) = .; \ - } \ + } OUTPUT_DATA_SECTION \ \ /* Kernel symbol table: GPL-only symbols */ \ __ksymtab_gpl : AT(ADDR(__ksymtab_gpl) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab_gpl) = .; \ *(__ksymtab_gpl) \ VMLINUX_SYMBOL(__stop___ksymtab_gpl) = .; \ - } \ + } OUTPUT_DATA_SECTION \ \ /* Kernel symbol table: Normal unused symbols */ \ __ksymtab_unused : AT(ADDR(__ksymtab_unused) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab_unused) = .; \ *(__ksymtab_unused) \ VMLINUX_SYMBOL(__stop___ksymtab_unused) = .; \ - } \ + } OUTPUT_DATA_SECTION \ \ /* Kernel symbol table: GPL-only unused symbols */ \ __ksymtab_unused_gpl : AT(ADDR(__ksymtab_unused_gpl) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab_unused_gpl) = .; \ *(__ksymtab_unused_gpl) \ VMLINUX_SYMBOL(__stop___ksymtab_unused_gpl) = .; \ - } \ + } OUTPUT_DATA_SECTION \ \ /* Kernel symbol table: GPL-future-only symbols */ \ __ksymtab_gpl_future : AT(ADDR(__ksymtab_gpl_future) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab_gpl_future) = .; \ *(__ksymtab_gpl_future) \ VMLINUX_SYMBOL(__stop___ksymtab_gpl_future) = .; \ - } \ + } OUTPUT_DATA_SECTION \ \ /* Kernel symbol table: Normal symbols */ \ __kcrctab : AT(ADDR(__kcrctab) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab) = .; \ *(__kcrctab) \ VMLINUX_SYMBOL(__stop___kcrctab) = .; \ - } \ + } OUTPUT_DATA_SECTION \ \ /* Kernel symbol table: GPL-only symbols */ \ __kcrctab_gpl : AT(ADDR(__kcrctab_gpl) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab_gpl) = .; \ *(__kcrctab_gpl) \ VMLINUX_SYMBOL(__stop___kcrctab_gpl) = .; \ - } \ + } OUTPUT_DATA_SECTION \ \ /* Kernel symbol table: Normal unused symbols */ \ __kcrctab_unused : AT(ADDR(__kcrctab_unused) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab_unused) = .; \ *(__kcrctab_unused) \ VMLINUX_SYMBOL(__stop___kcrctab_unused) = .; \ - } \ + } OUTPUT_DATA_SECTION \ \ /* Kernel symbol table: GPL-only unused symbols */ \ __kcrctab_unused_gpl : AT(ADDR(__kcrctab_unused_gpl) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab_unused_gpl) = .; \ *(__kcrctab_unused_gpl) \ VMLINUX_SYMBOL(__stop___kcrctab_unused_gpl) = .; \ - } \ + } OUTPUT_DATA_SECTION \ \ /* Kernel symbol table: GPL-future-only symbols */ \ __kcrctab_gpl_future : AT(ADDR(__kcrctab_gpl_future) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab_gpl_future) = .; \ *(__kcrctab_gpl_future) \ 
VMLINUX_SYMBOL(__stop___kcrctab_gpl_future) = .; \ - } \ + } OUTPUT_DATA_SECTION \ \ /* Kernel symbol table: strings */ \ __ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) { \ *(__ksymtab_strings) \ - } \ + } OUTPUT_DATA_SECTION \ \ /* Built-in module parameters. */ \ __param : AT(ADDR(__param) - LOAD_OFFSET) { \ @@ -138,7 +150,7 @@ *(__param) \ VMLINUX_SYMBOL(__stop___param) = .; \ VMLINUX_SYMBOL(__end_rodata) = .; \ - } \ + } OUTPUT_DATA_SECTION \ \ . = ALIGN((align)); @@ -227,7 +239,7 @@ __start___bug_table = .; \ *(__bug_table) \ __stop___bug_table = .; \ - } + } OUTPUT_DATA_SECTION #define NOTES \ .notes : AT(ADDR(.notes) - LOAD_OFFSET) { \ @@ -261,5 +273,5 @@ .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \ *(.data.percpu) \ *(.data.percpu.shared_aligned) \ - } \ + } OUTPUT_DATA_SECTION \ __per_cpu_end = .; Index: linux-2.6.24.7-rt27/include/asm-m68knommu/bitops.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-m68knommu/bitops.h 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-m68knommu/bitops.h 2009-02-08 00:00:48.000000000 -0500 @@ -14,8 +14,38 @@ #error only can be included directly #endif +#if defined (__mcfisaaplus__) || defined (__mcfisac__) +static inline int ffs(unsigned int val) +{ + if (!val) + return 0; + + asm volatile( + "bitrev %0\n\t" + "ff1 %0\n\t" + : "=d" (val) + : "0" (val) + ); + val++; + return val; +} + +static inline int __ffs(unsigned int val) +{ + asm volatile( + "bitrev %0\n\t" + "ff1 %0\n\t" + : "=d" (val) + : "0" (val) + ); + return val; +} + +#else #include #include +#endif + #include #include Index: linux-2.6.24.7-rt27/include/asm-m68knommu/byteorder.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-m68knommu/byteorder.h 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-m68knommu/byteorder.h 2009-02-08 00:00:48.000000000 -0500 @@ -1,13 +1,27 @@ #ifndef _M68KNOMMU_BYTEORDER_H #define _M68KNOMMU_BYTEORDER_H -#include +#include #if defined(__GNUC__) && !defined(__STRICT_ANSI__) || defined(__KERNEL__) # define __BYTEORDER_HAS_U64__ # define __SWAB_64_THRU_32__ #endif +#if defined (__mcfisaaplus__) || defined (__mcfisac__) +static inline __attribute_const__ __u32 ___arch__swab32(__u32 val) +{ + asm( + "byterev %0" + : "=d" (val) + : "0" (val) + ); + return val; +} + +#define __arch__swab32(x) ___arch__swab32(x) +#endif + #include #endif /* _M68KNOMMU_BYTEORDER_H */ Index: linux-2.6.24.7-rt27/include/asm-m68knommu/cacheflush.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-m68knommu/cacheflush.h 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-m68knommu/cacheflush.h 2009-02-08 00:00:48.000000000 -0500 @@ -53,7 +53,7 @@ static inline void __flush_cache_all(voi #endif /* CONFIG_M5407 */ #if defined(CONFIG_M527x) || defined(CONFIG_M528x) __asm__ __volatile__ ( - "movel #0x81400100, %%d0\n\t" + "movel #0x81000200, %%d0\n\t" "movec %%d0, %%CACR\n\t" "nop\n\t" : : : "d0" ); Index: linux-2.6.24.7-rt27/include/asm-m68knommu/commproc.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-m68knommu/commproc.h 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-m68knommu/commproc.h 2009-02-08 00:00:48.000000000 -0500 @@ -519,25 +519,6 @@ typedef struct scc_enet { #define SICR_ENET_CLKRT ((uint)0x00002c00) #endif -#ifdef 
CONFIG_RPXCLASSIC -/* Bits in parallel I/O port registers that have to be set/cleared - * to configure the pins for SCC1 use. - */ -#define PA_ENET_RXD ((ushort)0x0001) -#define PA_ENET_TXD ((ushort)0x0002) -#define PA_ENET_TCLK ((ushort)0x0200) -#define PA_ENET_RCLK ((ushort)0x0800) -#define PB_ENET_TENA ((uint)0x00001000) -#define PC_ENET_CLSN ((ushort)0x0010) -#define PC_ENET_RENA ((ushort)0x0020) - -/* Control bits in the SICR to route TCLK (CLK2) and RCLK (CLK4) to - * SCC1. Also, make sure GR1 (bit 24) and SC1 (bit 25) are zero. - */ -#define SICR_ENET_MASK ((uint)0x000000ff) -#define SICR_ENET_CLKRT ((uint)0x0000003d) -#endif - /* SCC Event register as used by Ethernet. */ #define SCCE_ENET_GRA ((ushort)0x0080) /* Graceful stop complete */ Index: linux-2.6.24.7-rt27/include/asm-m68knommu/dma.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-m68knommu/dma.h 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-m68knommu/dma.h 2009-02-08 00:00:48.000000000 -0500 @@ -35,7 +35,8 @@ /* * Set number of channels of DMA on ColdFire for different implementations. */ -#if defined(CONFIG_M5249) || defined(CONFIG_M5307) || defined(CONFIG_M5407) +#if defined(CONFIG_M5249) || defined(CONFIG_M5307) || defined(CONFIG_M5407) || \ + defined(CONFIG_M523x) || defined(CONFIG_M527x) || defined(CONFIG_M528x) #define MAX_M68K_DMA_CHANNELS 4 #elif defined(CONFIG_M5272) #define MAX_M68K_DMA_CHANNELS 1 Index: linux-2.6.24.7-rt27/include/asm-m68knommu/m523xsim.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-m68knommu/m523xsim.h 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-m68knommu/m523xsim.h 2009-02-08 00:00:48.000000000 -0500 @@ -11,7 +11,6 @@ #define m523xsim_h /****************************************************************************/ - /* * Define the 523x SIM register set addresses. */ @@ -27,10 +26,35 @@ #define MCFINTC_IACKL 0x19 /* */ #define MCFINTC_ICR0 0x40 /* Base ICR register */ +/* INTC0 - interrupt numbers */ #define MCFINT_VECBASE 64 /* Vector base number */ -#define MCFINT_UART0 13 /* Interrupt number for UART0 */ -#define MCFINT_PIT1 36 /* Interrupt number for PIT1 */ -#define MCFINT_QSPI 18 /* Interrupt number for QSPI */ +#define MCFINT_EPF4 4 /* EPORT4 */ +#define MCFINT_EPF5 5 /* EPORT5 */ +#define MCFINT_EPF6 6 /* EPORT6 */ +#define MCFINT_EPF7 7 /* EPORT7 */ +#define MCFINT_UART0 13 /* UART0 */ +#define MCFINT_QSPI 18 /* QSPI */ +#define MCFINT_PIT1 36 /* PIT1 */ +#define MCFINT_PER_INTC 64 + +/* INTC1 - interrupt numbers */ +#define MCFINT_INTC1_VECBASE (MCFINT_VECBASE + MCFINT_PER_INTC) +#define MCFINT_TC0F 27 /* eTPU Channel 0 */ +#define MCFINT_TC1F 28 /* eTPU Channel 1 */ +#define MCFINT_TC2F 29 /* eTPU Channel 2 */ +#define MCFINT_TC3F 30 /* eTPU Channel 3 */ +#define MCFINT_TC4F 31 /* eTPU Channel 4 */ +#define MCFINT_TC5F 32 /* eTPU Channel 5 */ +#define MCFINT_TC6F 33 /* eTPU Channel 6 */ +#define MCFINT_TC7F 34 /* eTPU Channel 7 */ +#define MCFINT_TC8F 35 /* eTPU Channel 8 */ +#define MCFINT_TC9F 36 /* eTPU Channel 9 */ +#define MCFINT_TC10F 37 /* eTPU Channel 10 */ +#define MCFINT_TC11F 38 /* eTPU Channel 11 */ +#define MCFINT_TC12F 39 /* eTPU Channel 12 */ +#define MCFINT_TC13F 40 /* eTPU Channel 13 */ +#define MCFINT_TC14F 41 /* eTPU Channel 14 */ +#define MCFINT_TC15F 42 /* eTPU Channel 15 */ /* * SDRAM configuration registers. 
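An aside on the two-controller numbering in the hunk above, since it is easy to misread: INTC0 sources are vectored at MCFINT_VECBASE + source, and INTC1 sources land one full controller (MCFINT_PER_INTC slots) higher, which is what MCFINT_INTC1_VECBASE expresses. A standalone sketch, not part of the patch (the helper name is hypothetical; the constants are copied from the hunk):

#include <stdio.h>

#define MCFINT_VECBASE   64	/* vector base number */
#define MCFINT_PER_INTC  64	/* sources per interrupt controller */
#define MCFINT_UART0     13	/* INTC0 source */
#define MCFINT_TC0F      27	/* INTC1 source (eTPU channel 0) */

/* hypothetical helper: map (controller, source) to a CPU vector */
static int m523x_vector(int intc, int src)
{
	return MCFINT_VECBASE + intc * MCFINT_PER_INTC + src;
}

int main(void)
{
	printf("UART0 -> vector %d\n", m523x_vector(0, MCFINT_UART0));	/* 77 */
	printf("eTPU0 -> vector %d\n", m523x_vector(1, MCFINT_TC0F));	/* 155 */
	return 0;
}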
@@ -41,5 +65,120 @@ #define MCFSIM_DACR1 0x50 /* SDRAM base address 1 */ #define MCFSIM_DMR1 0x54 /* SDRAM address mask 1 */ +/* + * GPIO Registers and Pin Assignments + */ +#define MCF_GPIO_PAR_FECI2C 0x100047 /* FEC Pin Assignment reg */ +#define MCF523x_GPIO_PAR_UART 0x100048 /* UART Pin Assignment reg */ +#define MCF523x_GPIO_PAR_QSPI 0x10004a /* QSPI Pin Assignment reg */ +#define MCF523x_GPIO_PAR_TIMER 0x10004c /* TIMER Pin Assignment reg */ +#define MCF523x_GPIO_PDDR_QSPI 0x10001a /* QSPI Pin Direction reg */ +#define MCF523x_GPIO_PDDR_TIMER 0x10001b /* TIMER Pin Direction reg */ +#define MCF523x_GPIO_PPDSDR_QSPI 0x10002a /* QSPI Pin Data reg */ +#define MCF523x_GPIO_PPDSDR_TIMER 0x10002b /* TIMER Pin Data reg */ + +#define MCF_GPIO_PAR_FECI2C_PAR_SDA(x) (((x) & 0x03) << 0) +#define MCF_GPIO_PAR_FECI2C_PAR_SCL(x) (((x) & 0x03) << 2) + +/* + * eTPU Registers + */ +#define MCF523x_ETPU 0x1d0000 /* eTPU Base */ +#define MCF523x_ETPU_CIOSR 0x00220 /* eTPU Intr Overflow Status */ +#define MCF523x_ETPU_CIER 0x00240 /* eTPU Intr Enable */ +#define MCF523x_ETPU_CR(c) (0x00400 + ((c) * 0x10)) /* eTPU c Config */ +#define MCF523x_ETPU_SCR(c) (0x00404 + ((c) * 0x10)) /* eTPU c Status & Ctrl */ +#define MCF523x_ETPU_SDM 0x08000 /* eTPU Shared Data Memory */ + +/* + * WDOG registers + */ +#define MCF523x_WCR ((volatile uint16_t *) (MCF_IPSBAR + 0x140000)) /* control register 16 bits */ +#define MCF523x_WMR ((volatile uint16_t *) (MCF_IPSBAR + 0x140002)) /* modulus status 16 bits */ +#define MCF523x_MCNTR ((volatile uint16_t *) (MCF_IPSBAR + 0x140004)) /* count register 16 bits */ +#define MCF523x_WSR ((volatile uint16_t *) (MCF_IPSBAR + 0x140006)) /* service register 16 bits */ + +/* + * Reset registers + */ +#define MCF523x_RSR ((volatile uint8_t *) (MCF_IPSBAR + 0x110001)) /* reset reason codes */ + +/* + * WDOG bit level definitions and macros. 
+ */ +#define MCF523x_WCR_ENABLE_BIT 0x0001 + +#define MCF523x_WCR_ENABLE 0x0001 +#define MCF523x_WCR_DISABLE 0x0000 +#define MCF523x_WCR_HALTEDSTOP 0x0002 +#define MCF523x_WCR_HALTEDRUN 0x0000 +#define MCF523x_WCR_DOZESTOP 0x0004 +#define MCF523x_WCR_DOZERUN 0x0000 +#define MCF523x_WCR_WAITSTOP 0x0008 +#define MCF523x_WCR_WAITRUN 0x0000 + +#define MCF523x_WMR_DEFAULT_VALUE 0xffff + +/* + * Inter-IC (I2C) Module + * Read/Write access macros for general use + */ +#define MCF_I2C_I2ADR ((volatile u8 *) (MCF_IPSBAR + 0x0300)) /* Address */ +#define MCF_I2C_I2FDR ((volatile u8 *) (MCF_IPSBAR + 0x0304)) /* Freq Divider */ +#define MCF_I2C_I2CR ((volatile u8 *) (MCF_IPSBAR + 0x0308)) /* Control */ +#define MCF_I2C_I2SR ((volatile u8 *) (MCF_IPSBAR + 0x030C)) /* Status */ +#define MCF_I2C_I2DR ((volatile u8 *) (MCF_IPSBAR + 0x0310)) /* Data I/O */ + +/* + * Bit level definitions and macros + */ +#define MCF_I2C_I2ADR_ADDR(x) (((x) & 0x7F) << 0x01) +#define MCF_I2C_I2FDR_IC(x) ((x) & 0x3F) + +#define MCF_I2C_I2CR_IEN 0x80 /* I2C enable */ +#define MCF_I2C_I2CR_IIEN 0x40 /* interrupt enable */ +#define MCF_I2C_I2CR_MSTA 0x20 /* master/slave mode */ +#define MCF_I2C_I2CR_MTX 0x10 /* transmit/receive mode */ +#define MCF_I2C_I2CR_TXAK 0x08 /* transmit acknowledge enable */ +#define MCF_I2C_I2CR_RSTA 0x04 /* repeat start */ + +#define MCF_I2C_I2SR_ICF 0x80 /* data transfer bit */ +#define MCF_I2C_I2SR_IAAS 0x40 /* I2C addressed as a slave */ +#define MCF_I2C_I2SR_IBB 0x20 /* I2C bus busy */ +#define MCF_I2C_I2SR_IAL 0x10 /* arbitration lost */ +#define MCF_I2C_I2SR_SRW 0x04 /* slave read/write */ +#define MCF_I2C_I2SR_IIF 0x02 /* I2C interrupt */ +#define MCF_I2C_I2SR_RXAK 0x01 /* received acknowledge */ + +/* + * Edge Port (EPORT) Module + */ +#define MCF523x_EPPAR 0x130000 +#define MCF523x_EPDDR 0x130002 +#define MCF523x_EPIER 0x130003 +#define MCF523x_EPDR 0x130004 +#define MCF523x_EPPDR 0x130005 +#define MCF523x_EPFR 0x130006 + +/* + * Chip Select (CS) Module + */ +#define MCF523x_CSAR0 0x80 +#define MCF523x_CSAR3 0xA4 +#define MCF523x_CSMR3 0xA8 + +/* + * System Access Control Unit (SACU) + */ +#define MCF523x_PACR1 0x25 +#define MCF523x_PACR2 0x26 +#define MCF523x_PACR3 0x27 +#define MCF523x_PACR4 0x28 +#define MCF523x_PACR5 0x2A +#define MCF523x_PACR6 0x2B +#define MCF523x_PACR7 0x2C +#define MCF523x_PACR8 0x2E +#define MCF523x_GPACR 0x30 + /****************************************************************************/ #endif /* m523xsim_h */ Index: linux-2.6.24.7-rt27/include/asm-m68knommu/m528xsim.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-m68knommu/m528xsim.h 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-m68knommu/m528xsim.h 2009-02-08 00:00:48.000000000 -0500 @@ -30,6 +30,9 @@ #define MCFINT_VECBASE 64 /* Vector base number */ #define MCFINT_UART0 13 /* Interrupt number for UART0 */ #define MCFINT_PIT1 55 /* Interrupt number for PIT1 */ +#define MCFINT_QSPI 18 /* Interrupt number for QSPI */ + +#define MCF5282_INTC0 (MCF_IPSBAR + MCFICM_INTC0) /* * SDRAM configuration registers. 
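The I2ADR macro above shifts the 7-bit slave address into bits 7:1 because that is how I2C address bytes are laid out: bit 0 carries the read/write flag on the wire, and is reserved in the module's own address register. A standalone illustration, not part of the patch (the 0x50 slave address is just an example):

#include <stdio.h>
#include <stdint.h>

#define MCF_I2C_I2ADR_ADDR(x) (((x) & 0x7F) << 0x01)	/* copied from the hunk */

int main(void)
{
	uint8_t slave = 0x50;	/* example 7-bit address (a typical EEPROM) */

	/* first byte of a master transfer: address in bits 7:1, R/W in bit 0 */
	printf("write byte 0x%02x\n", MCF_I2C_I2ADR_ADDR(slave) | 0);	/* 0xa0 */
	printf("read byte  0x%02x\n", MCF_I2C_I2ADR_ADDR(slave) | 1);	/* 0xa1 */
	return 0;
}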
@@ -50,44 +53,53 @@ /* Port UA Pin Assignment Register (8 Bit) */ #define MCF5282_GPIO_PUAPAR 0x10005C +#define MCF5282_GPIO_PORTQS (*(volatile u8 *) (MCF_IPSBAR + 0x0010000D)) +#define MCF5282_GPIO_DDRQS (*(volatile u8 *) (MCF_IPSBAR + 0x00100021)) +#define MCF5282_GPIO_PORTQSP (*(volatile u8 *) (MCF_IPSBAR + 0x00100035)) +#define MCF5282_GPIO_PQSPAR (*(volatile u8 *) (MCF_IPSBAR + 0x00100059)) + +#define MCF5282_GPIO_PEPAR (*(volatile u16 *) (MCF_IPSBAR + 0x00100052)) + +#define MCF5282_GPIO_PORTE (*(volatile u8 *) (MCF_IPSBAR + 0x00100004)) +#define MCF5282_GPIO_DDRE (*(volatile u8 *) (MCF_IPSBAR + 0x00100018)) +#define MCF5282_GPIO_PORTEP (*(volatile u8 *) (MCF_IPSBAR + 0x0010002C)) + /* Interrupt Mask Register Register Low */ #define MCF5282_INTC0_IMRL (volatile u32 *) (MCF_IPSBAR + 0x0C0C) /* Interrupt Control Register 7 */ #define MCF5282_INTC0_ICR17 (volatile u8 *) (MCF_IPSBAR + 0x0C51) - - /********************************************************************* * * Inter-IC (I2C) Module * *********************************************************************/ /* Read/Write access macros for general use */ -#define MCF5282_I2C_I2ADR (volatile u8 *) (MCF_IPSBAR + 0x0300) // Address -#define MCF5282_I2C_I2FDR (volatile u8 *) (MCF_IPSBAR + 0x0304) // Freq Divider -#define MCF5282_I2C_I2CR (volatile u8 *) (MCF_IPSBAR + 0x0308) // Control -#define MCF5282_I2C_I2SR (volatile u8 *) (MCF_IPSBAR + 0x030C) // Status -#define MCF5282_I2C_I2DR (volatile u8 *) (MCF_IPSBAR + 0x0310) // Data I/O +#define MCF_I2C_I2ADR (volatile u8 *) (MCF_IPSBAR + 0x0300) // Address +#define MCF_I2C_I2FDR (volatile u8 *) (MCF_IPSBAR + 0x0304) // Freq Divider +#define MCF_I2C_I2CR (volatile u8 *) (MCF_IPSBAR + 0x0308) // Control +#define MCF_I2C_I2SR (volatile u8 *) (MCF_IPSBAR + 0x030C) // Status +#define MCF_I2C_I2DR (volatile u8 *) (MCF_IPSBAR + 0x0310) // Data I/O /* Bit level definitions and macros */ -#define MCF5282_I2C_I2ADR_ADDR(x) (((x)&0x7F)<<0x01) +#define MCF_I2C_I2ADR_ADDR(x) (((x)&0x7F)<<0x01) -#define MCF5282_I2C_I2FDR_IC(x) (((x)&0x3F)) +#define MCF_I2C_I2FDR_IC(x) (((x)&0x3F)) -#define MCF5282_I2C_I2CR_IEN (0x80) // I2C enable -#define MCF5282_I2C_I2CR_IIEN (0x40) // interrupt enable -#define MCF5282_I2C_I2CR_MSTA (0x20) // master/slave mode -#define MCF5282_I2C_I2CR_MTX (0x10) // transmit/receive mode -#define MCF5282_I2C_I2CR_TXAK (0x08) // transmit acknowledge enable -#define MCF5282_I2C_I2CR_RSTA (0x04) // repeat start - -#define MCF5282_I2C_I2SR_ICF (0x80) // data transfer bit -#define MCF5282_I2C_I2SR_IAAS (0x40) // I2C addressed as a slave -#define MCF5282_I2C_I2SR_IBB (0x20) // I2C bus busy -#define MCF5282_I2C_I2SR_IAL (0x10) // aribitration lost -#define MCF5282_I2C_I2SR_SRW (0x04) // slave read/write -#define MCF5282_I2C_I2SR_IIF (0x02) // I2C interrupt -#define MCF5282_I2C_I2SR_RXAK (0x01) // received acknowledge +#define MCF_I2C_I2CR_IEN (0x80) // I2C enable +#define MCF_I2C_I2CR_IIEN (0x40) // interrupt enable +#define MCF_I2C_I2CR_MSTA (0x20) // master/slave mode +#define MCF_I2C_I2CR_MTX (0x10) // transmit/receive mode +#define MCF_I2C_I2CR_TXAK (0x08) // transmit acknowledge enable +#define MCF_I2C_I2CR_RSTA (0x04) // repeat start + +#define MCF_I2C_I2SR_ICF (0x80) // data transfer bit +#define MCF_I2C_I2SR_IAAS (0x40) // I2C addressed as a slave +#define MCF_I2C_I2SR_IBB (0x20) // I2C bus busy +#define MCF_I2C_I2SR_IAL (0x10) // arbitration lost +#define MCF_I2C_I2SR_SRW (0x04) // slave read/write +#define MCF_I2C_I2SR_IIF (0x02) // I2C interrupt +#define MCF_I2C_I2SR_RXAK (0x01) 
// received acknowledge @@ -107,6 +119,11 @@ #define MCF5282_QSPI_QDR MCF_IPSBAR + 0x0354 #define MCF5282_QSPI_QCR MCF_IPSBAR + 0x0354 +#define MCF5282_QSPI_PAR (MCF_IPSBAR + 0x00100059) + +#define MCF5282_QSPI_IRQ_SOURCE 18 +#define MCF5282_QSPI_IRQ_VECTOR (64 + MCF5282_QSPI_IRQ_SOURCE) + /* Bit level definitions and macros */ #define MCF5282_QSPI_QMR_MSTR (0x8000) #define MCF5282_QSPI_QMR_DOHIE (0x4000) Index: linux-2.6.24.7-rt27/include/asm-m68knommu/m532xsim.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-m68knommu/m532xsim.h 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-m68knommu/m532xsim.h 2009-02-08 00:00:48.000000000 -0500 @@ -16,6 +16,7 @@ #define MCFINT_VECBASE 64 #define MCFINT_UART0 26 /* Interrupt number for UART0 */ #define MCFINT_UART1 27 /* Interrupt number for UART1 */ +#define MCFINT_UART2 28 /* Interrupt number for UART2 */ #define MCF_WTM_WCR MCF_REG16(0xFC098000) @@ -72,9 +73,21 @@ #define mcf_getimr() \ *((volatile unsigned long *) (MCF_MBAR + MCFSIM_IMR)) +#define mcf_getimrh() \ + *((volatile unsigned long *) (MCF_MBAR + MCFSIM_IMRH)) + +#define mcf_getimrl() \ + *((volatile unsigned long *) (MCF_MBAR + MCFSIM_IMRL)) + #define mcf_setimr(imr) \ *((volatile unsigned long *) (MCF_MBAR + MCFSIM_IMR)) = (imr); +#define mcf_setimrh(imr) \ + *((volatile unsigned long *) (MCF_MBAR + MCFSIM_IMRH)) = (imr); + +#define mcf_setimrl(imr) \ + *((volatile unsigned long *) (MCF_MBAR + MCFSIM_IMRL)) = (imr); + #define mcf_getipr() \ *((volatile unsigned long *) (MCF_MBAR + MCFSIM_IPR)) @@ -131,31 +144,31 @@ *********************************************************************/ /* Read/Write access macros for general use */ -#define MCF532x_I2C_I2ADR (volatile u8 *) (0xFC058000) // Address -#define MCF532x_I2C_I2FDR (volatile u8 *) (0xFC058004) // Freq Divider -#define MCF532x_I2C_I2CR (volatile u8 *) (0xFC058008) // Control -#define MCF532x_I2C_I2SR (volatile u8 *) (0xFC05800C) // Status -#define MCF532x_I2C_I2DR (volatile u8 *) (0xFC058010) // Data I/O +#define MCF_I2C_I2ADR (volatile u8 *) (0xFC058000) /* Address */ +#define MCF_I2C_I2FDR (volatile u8 *) (0xFC058004) /* Freq Divider */ +#define MCF_I2C_I2CR (volatile u8 *) (0xFC058008) /* Control */ +#define MCF_I2C_I2SR (volatile u8 *) (0xFC05800C) /* Status */ +#define MCF_I2C_I2DR (volatile u8 *) (0xFC058010) /* Data I/O */ /* Bit level definitions and macros */ -#define MCF532x_I2C_I2ADR_ADDR(x) (((x)&0x7F)<<0x01) +#define MCF_I2C_I2ADR_ADDR(x) (((x)&0x7F)<<0x01) -#define MCF532x_I2C_I2FDR_IC(x) (((x)&0x3F)) +#define MCF_I2C_I2FDR_IC(x) (((x)&0x3F)) -#define MCF532x_I2C_I2CR_IEN (0x80) // I2C enable -#define MCF532x_I2C_I2CR_IIEN (0x40) // interrupt enable -#define MCF532x_I2C_I2CR_MSTA (0x20) // master/slave mode -#define MCF532x_I2C_I2CR_MTX (0x10) // transmit/receive mode -#define MCF532x_I2C_I2CR_TXAK (0x08) // transmit acknowledge enable -#define MCF532x_I2C_I2CR_RSTA (0x04) // repeat start - -#define MCF532x_I2C_I2SR_ICF (0x80) // data transfer bit -#define MCF532x_I2C_I2SR_IAAS (0x40) // I2C addressed as a slave -#define MCF532x_I2C_I2SR_IBB (0x20) // I2C bus busy -#define MCF532x_I2C_I2SR_IAL (0x10) // aribitration lost -#define MCF532x_I2C_I2SR_SRW (0x04) // slave read/write -#define MCF532x_I2C_I2SR_IIF (0x02) // I2C interrupt -#define MCF532x_I2C_I2SR_RXAK (0x01) // received acknowledge +#define MCF_I2C_I2CR_IEN (0x80) /* I2C enable */ +#define MCF_I2C_I2CR_IIEN (0x40) /* interrupt enable */ +#define MCF_I2C_I2CR_MSTA (0x20) /* 
master/slave mode */ +#define MCF_I2C_I2CR_MTX (0x10) /* transmit/receive mode */ +#define MCF_I2C_I2CR_TXAK (0x08) /* transmit acknowledge enable */ +#define MCF_I2C_I2CR_RSTA (0x04) /* repeat start */ + +#define MCF_I2C_I2SR_ICF (0x80) /* data transfer bit */ +#define MCF_I2C_I2SR_IAAS (0x40) /* I2C addressed as a slave */ +#define MCF_I2C_I2SR_IBB (0x20) /* I2C bus busy */ +#define MCF_I2C_I2SR_IAL (0x10) /* arbitration lost */ +#define MCF_I2C_I2SR_SRW (0x04) /* slave read/write */ +#define MCF_I2C_I2SR_IIF (0x02) /* I2C interrupt */ +#define MCF_I2C_I2SR_RXAK (0x01) /* received acknowledge */ #define MCF532x_PAR_FECI2C (volatile u8 *) (0xFC0A4053) @@ -2234,5 +2247,36 @@ #define MCF_EPORT_EPFR_EPF6 (0x40) #define MCF_EPORT_EPFR_EPF7 (0x80) +/********************************************************************* + * + * Cross-Bar Switch (XBS) + * + *********************************************************************/ +#define MCF_XBS_PRS1 MCF_REG32(0xFC004100) +#define MCF_XBS_CRS1 MCF_REG32(0xFC004110) +#define MCF_XBS_PRS4 MCF_REG32(0xFC004400) +#define MCF_XBS_CRS4 MCF_REG32(0xFC004410) +#define MCF_XBS_PRS6 MCF_REG32(0xFC004600) +#define MCF_XBS_CRS6 MCF_REG32(0xFC004610) +#define MCF_XBS_PRS7 MCF_REG32(0xFC004700) +#define MCF_XBS_CRS7 MCF_REG32(0xFC004710) + +#define MCF_XBS_PRIO_FACTTEST(x) (((x)&0x7) << 28) +#define MCF_XBS_PRIO_USBOTG(x) (((x)&0x7) << 24) +#define MCF_XBS_PRIO_USBHOST(x) (((x)&0x7) << 20) +#define MCF_XBS_PRIO_LCD(x) (((x)&0x7) << 16) +#define MCF_XBS_PRIO_FEC(x) (((x)&0x7) << 8) +#define MCF_XBS_PRIO_EDMA(x) (((x)&0x7) << 4) +#define MCF_XBS_PRIO_CORE(x) (((x)&0x7) << 0) + +#define MCF_PRIO_LVL_1 (0) +#define MCF_PRIO_LVL_2 (1) +#define MCF_PRIO_LVL_3 (2) +#define MCF_PRIO_LVL_4 (3) +#define MCF_PRIO_LVL_5 (4) +#define MCF_PRIO_LVL_6 (5) +#define MCF_PRIO_LVL_7 (6) + + /********************************************************************/ #endif /* m532xsim_h */ Index: linux-2.6.24.7-rt27/include/asm-m68knommu/mcfcache.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-m68knommu/mcfcache.h 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-m68knommu/mcfcache.h 2009-02-08 00:00:48.000000000 -0500 @@ -60,7 +60,7 @@ nop movel #0x0000c020, %d0 /* Set SDRAM cached only */ movec %d0, %ACR0 - movel #0xff00c000, %d0 /* Cache Flash also */ + movel #0x00000000, %d0 /* No other regions cached */ movec %d0, %ACR1 movel #0x80000200, %d0 /* Setup cache mask */ movec %d0, %CACR /* Enable cache */ Index: linux-2.6.24.7-rt27/include/asm-m68knommu/mcfuart.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-m68knommu/mcfuart.h 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-m68knommu/mcfuart.h 2009-02-08 00:00:48.000000000 -0500 @@ -12,7 +12,6 @@ #define mcfuart_h /****************************************************************************/ - /* * Define the base address of the UARTS within the MBAR address * space. 
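The Cross-Bar Switch macros added a few hunks up compose one 32-bit arbitration word per slave port, three bits of priority per master. A standalone sketch, not part of the patch (the priority policy here is an arbitrary example; in the kernel the result would be written to an MMIO register such as MCF_XBS_PRS1):

#include <stdio.h>
#include <stdint.h>

/* copied from the m532xsim.h hunk above */
#define MCF_XBS_PRIO_LCD(x)	(((x)&0x7) << 16)
#define MCF_XBS_PRIO_FEC(x)	(((x)&0x7) << 8)
#define MCF_XBS_PRIO_CORE(x)	(((x)&0x7) << 0)
#define MCF_PRIO_LVL_1	(0)
#define MCF_PRIO_LVL_2	(1)
#define MCF_PRIO_LVL_3	(2)

int main(void)
{
	/* assumed policy, for illustration: LCD first, FEC next, core last */
	uint32_t prs = MCF_XBS_PRIO_LCD(MCF_PRIO_LVL_1) |
		       MCF_XBS_PRIO_FEC(MCF_PRIO_LVL_2) |
		       MCF_XBS_PRIO_CORE(MCF_PRIO_LVL_3);

	printf("PRS1 = 0x%08x\n", prs);	/* 0x00000102 */
	return 0;
}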
@@ -33,7 +32,7 @@ #define MCFUART_BASE2 0x240 /* Base address of UART2 */ #define MCFUART_BASE3 0x280 /* Base address of UART3 */ #elif defined(CONFIG_M5249) || defined(CONFIG_M5307) || defined(CONFIG_M5407) -#if defined(CONFIG_NETtel) || defined(CONFIG_DISKtel) || defined(CONFIG_SECUREEDGEMP3) +#if defined(CONFIG_NETtel) || defined(CONFIG_SECUREEDGEMP3) #define MCFUART_BASE1 0x200 /* Base address of UART1 */ #define MCFUART_BASE2 0x1c0 /* Base address of UART2 */ #else Index: linux-2.6.24.7-rt27/mm/nommu.c =================================================================== --- linux-2.6.24.7-rt27.orig/mm/nommu.c 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/mm/nommu.c 2009-02-08 00:00:48.000000000 -0500 @@ -952,6 +952,16 @@ unsigned long do_mmap_pgoff(struct file if (ret < 0) goto error; + /* + * If the driver implements its own mmap(), the + * base addr could have changed. Therefore + * vm_end must be updated too. + * + * See the comment of DaveM in mm/mmap.c as reference + */ + if (addr != vma->vm_start) + vma->vm_end = vma->vm_start + len; + /* okay... we have a mapping; now we have to register it */ result = (void *) vma->vm_start; Index: linux-2.6.24.7-rt27/mm/page_alloc.c =================================================================== --- linux-2.6.24.7-rt27.orig/mm/page_alloc.c 2009-02-08 00:00:34.000000000 -0500 +++ linux-2.6.24.7-rt27/mm/page_alloc.c 2009-02-08 00:04:54.000000000 -0500 @@ -159,6 +159,65 @@ static unsigned long __meminitdata dma_r EXPORT_SYMBOL(movable_zone); #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#ifdef CONFIG_PREEMPT_RT +static DEFINE_PER_CPU_LOCKED(int, pcp_locks); +#endif + +static inline void __lock_cpu_pcp(unsigned long *flags, int cpu) +{ +#ifdef CONFIG_PREEMPT_RT + spin_lock(&__get_cpu_lock(pcp_locks, cpu)); + flags = 0; +#else + local_irq_save(*flags); +#endif +} + +static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu) +{ +#ifdef CONFIG_PREEMPT_RT + spinlock_t *lock; + int cpu; + +again: + cpu = raw_smp_processor_id(); + lock = &__get_cpu_lock(pcp_locks, cpu); + + spin_lock(lock); + if (unlikely(!cpu_online(cpu))) { + spin_unlock(lock); + goto again; + } + *this_cpu = cpu; + flags = 0; +#else + local_irq_save(*flags); + *this_cpu = smp_processor_id(); +#endif +} + +static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu) +{ +#ifdef CONFIG_PREEMPT_RT + put_cpu_var_locked(pcp_locks, this_cpu); +#else + local_irq_restore(flags); +#endif +} + +static struct per_cpu_pageset * +get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu) +{ + lock_cpu_pcp(flags, this_cpu); + return zone_pcp(zone, *this_cpu); +} + +static void +put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu) +{ + unlock_cpu_pcp(flags, this_cpu); +} + #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; EXPORT_SYMBOL(nr_node_ids); @@ -410,8 +469,8 @@ static inline int page_is_buddy(struct p * -- wli */ -static inline void __free_one_page(struct page *page, - struct zone *zone, unsigned int order) +static inline void +__free_one_page(struct page *page, struct zone *zone, unsigned int order) { unsigned long page_idx; int order_size = 1 << order; @@ -515,8 +574,9 @@ static void free_one_page(struct zone *z static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; - int i; int reserved = 0; + int this_cpu; + int i; for (i = 0 ; i < (1 << order) ; ++i) reserved += free_pages_check(page + i); @@ -528,10 +588,10 @@ static void __free_pages_ok(struct page arch_free_page(page, order); 
kernel_map_pages(page, 1 << order, 0); - local_irq_save(flags); - __count_vm_events(PGFREE, 1 << order); + lock_cpu_pcp(&flags, &this_cpu); + count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, order); - local_irq_restore(flags); + unlock_cpu_pcp(flags, this_cpu); } /* @@ -876,23 +936,19 @@ static int rmqueue_bulk(struct zone *zon */ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { - unsigned long flags; int to_drain; - local_irq_save(flags); if (pcp->count >= pcp->batch) to_drain = pcp->batch; else to_drain = pcp->count; free_pages_bulk(zone, to_drain, &pcp->list, 0); pcp->count -= to_drain; - local_irq_restore(flags); } #endif static void __drain_pages(unsigned int cpu) { - unsigned long flags; struct zone *zone; int i; @@ -903,14 +959,16 @@ static void __drain_pages(unsigned int c continue; pset = zone_pcp(zone, cpu); + if (!pset) { + WARN_ON(1); + continue; + } for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; pcp = &pset->pcp[i]; - local_irq_save(flags); free_pages_bulk(zone, pcp->count, &pcp->list, 0); pcp->count = 0; - local_irq_restore(flags); } } } @@ -957,10 +1015,11 @@ void mark_free_pages(struct zone *zone) void drain_local_pages(void) { unsigned long flags; + int this_cpu; - local_irq_save(flags); - __drain_pages(smp_processor_id()); - local_irq_restore(flags); + lock_cpu_pcp(&flags, &this_cpu); + __drain_pages(this_cpu); + unlock_cpu_pcp(flags, this_cpu); } void smp_drain_local_pages(void *arg) @@ -973,6 +1032,38 @@ void smp_drain_local_pages(void *arg) */ void drain_all_local_pages(void) { +#ifdef CONFIG_PREEMPT_RT + /* + * HACK!!!!! + * For RT we can't use IPIs to run drain_local_pages, since + * that code will call spin_locks that will now sleep. + * But, schedule_on_each_cpu will call kzalloc, which will + * call page_alloc which was what calls this. + * + * Luckily, there's a condition to get here, and that is if + * the order passed in to alloc_pages is greater than 0 + * (alloced more than a page size). The slabs only allocate + * what is needed, and the allocation made by schedule_on_each_cpu + * does an alloc of "sizeof(void *)*nr_cpu_ids". + * + * So we can safely call schedule_on_each_cpu if that number + * is less than a page. Otherwise don't bother. At least warn of + * this issue. + * + * And yes, this is one big hack. 
Please fix ;-) + */ + if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE) + schedule_on_each_cpu(smp_drain_local_pages, NULL, 0, 1); + else { + static int once; + if (!once) { + printk(KERN_ERR "Can't drain all CPUS due to possible recursion\n"); + once = 1; + } + drain_local_pages(); + } + +#else unsigned long flags; local_irq_save(flags); @@ -980,6 +1071,7 @@ void drain_all_local_pages(void) local_irq_restore(flags); smp_call_function(smp_drain_local_pages, NULL, 0, 1); +#endif } /* @@ -988,8 +1080,10 @@ void drain_all_local_pages(void) static void fastcall free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); + struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; unsigned long flags; + int this_cpu; if (PageAnon(page)) page->mapping = NULL; @@ -1001,9 +1095,11 @@ static void fastcall free_hot_cold_page( arch_free_page(page, 0); kernel_map_pages(page, 1, 0); - pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; - local_irq_save(flags); - __count_vm_event(PGFREE); + pset = get_zone_pcp(zone, &flags, &this_cpu); + pcp = &pset->pcp[cold]; + + count_vm_event(PGFREE); + list_add(&page->lru, &pcp->list); set_page_private(page, get_pageblock_migratetype(page)); pcp->count++; @@ -1011,8 +1107,7 @@ static void fastcall free_hot_cold_page( free_pages_bulk(zone, pcp->batch, &pcp->list, 0); pcp->count -= pcp->batch; } - local_irq_restore(flags); - put_cpu(); + put_zone_pcp(zone, flags, this_cpu); } void fastcall free_hot_page(struct page *page) @@ -1054,16 +1149,15 @@ static struct page *buffered_rmqueue(str unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); - int cpu; + struct per_cpu_pageset *pset; int migratetype = allocflags_to_migratetype(gfp_flags); + int this_cpu; again: - cpu = get_cpu(); + pset = get_zone_pcp(zone, &flags, &this_cpu); if (likely(order == 0)) { - struct per_cpu_pages *pcp; + struct per_cpu_pages *pcp = &pset->pcp[cold]; - pcp = &zone_pcp(zone, cpu)->pcp[cold]; - local_irq_save(flags); if (!pcp->count) { pcp->count = rmqueue_bulk(zone, 0, pcp->batch, &pcp->list, migratetype); @@ -1086,7 +1180,7 @@ again: list_del(&page->lru); pcp->count--; } else { - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); page = __rmqueue(zone, order, migratetype); spin_unlock(&zone->lock); if (!page) @@ -1095,8 +1189,7 @@ again: __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(zonelist, zone); - local_irq_restore(flags); - put_cpu(); + put_zone_pcp(zone, flags, this_cpu); VM_BUG_ON(bad_range(zone, page)); if (prep_new_page(page, order, gfp_flags)) @@ -1104,8 +1197,7 @@ again: return page; failed: - local_irq_restore(flags); - put_cpu(); + put_zone_pcp(zone, flags, this_cpu); return NULL; } @@ -2701,12 +2793,17 @@ static inline void free_zone_pagesets(in struct zone *zone; for_each_zone(zone) { - struct per_cpu_pageset *pset = zone_pcp(zone, cpu); + struct per_cpu_pageset *pset; + unsigned long flags; + + __lock_cpu_pcp(&flags, cpu); + pset = zone_pcp(zone, cpu); + zone_pcp(zone, cpu) = NULL; + unlock_cpu_pcp(flags, cpu); /* Free per_cpu_pageset if it is slab allocated */ if (pset != &boot_pageset[cpu]) kfree(pset); - zone_pcp(zone, cpu) = NULL; } } @@ -2732,6 +2829,7 @@ static int __cpuinit pageset_cpuup_callb default: break; } + return ret; } @@ -3978,10 +4076,11 @@ static int page_alloc_cpu_notify(struct int cpu = (unsigned long)hcpu; if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - local_irq_disable(); + unsigned long flags; + __lock_cpu_pcp(&flags, cpu); __drain_pages(cpu); vm_events_fold_cpu(cpu); - 
local_irq_enable(); + unlock_cpu_pcp(flags, cpu); refresh_cpu_vm_stats(cpu); } return NOTIFY_OK; @@ -4317,6 +4416,14 @@ void *__init alloc_large_system_hash(con if (numentries > max) numentries = max; + /* + * we will allocate at least a page (even on low memory systems) + * so do a fixup here to ensure we utilise the space that will be + * allocated, this also prevents us reporting -ve orders + */ + if (bucketsize * numentries < PAGE_SIZE) + numentries = (PAGE_SIZE + bucketsize - 1) / bucketsize; + log2qty = ilog2(numentries); do { Index: linux-2.6.24.7-rt27/kernel/sched.c =================================================================== --- linux-2.6.24.7-rt27.orig/kernel/sched.c 2009-02-08 00:00:32.000000000 -0500 +++ linux-2.6.24.7-rt27/kernel/sched.c 2009-02-08 00:05:23.000000000 -0500 @@ -4,6 +4,7 @@ * Kernel scheduler and related syscalls * * Copyright (C) 1991-2002 Linus Torvalds + * Copyright (C) 2004 Red Hat, Inc., Ingo Molnar * * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and * make semaphores SMP safe @@ -16,12 +17,15 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin + * 2004-10-13 Real-Time Preemption support by Ingo Molnar * 2007-04-15 Work begun on replacing all interactivity tuning with a * fair scheduling design by Con Kolivas. * 2007-05-05 Load balancing (smp-nice) and other improvements * by Peter Williams * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri + * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, + * Thomas Gleixner, Mike Kravetz */ #include @@ -57,16 +61,20 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include +#include "sched_cpupri.h" + /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. @@ -86,6 +94,11 @@ unsigned long long __attribute__((weak)) #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) +#define __PRIO(prio) \ + ((prio) <= 99 ? 
199 - (prio) : (prio) - 120) + +#define PRIO(p) __PRIO((p)->prio) + /* * 'User priority' is the nice value converted to something we * can work with better when scaling various scheduler parameters, @@ -104,6 +117,20 @@ unsigned long long __attribute__((weak)) #define NICE_0_LOAD SCHED_LOAD_SCALE #define NICE_0_SHIFT SCHED_LOAD_SHIFT +#if (BITS_PER_LONG < 64) +#define JIFFIES_TO_NS64(TIME) \ + ((unsigned long long)(TIME) * ((unsigned long) (1000000000 / HZ))) + +#define NS64_TO_JIFFIES(TIME) \ + ((((unsigned long long)((TIME)) >> BITS_PER_LONG) * \ + (1 + NS_TO_JIFFIES(~0UL))) + NS_TO_JIFFIES((unsigned long)(TIME))) +#else /* BITS_PER_LONG < 64 */ + +#define NS64_TO_JIFFIES(TIME) NS_TO_JIFFIES(TIME) +#define JIFFIES_TO_NS64(TIME) JIFFIES_TO_NS(TIME) + +#endif /* BITS_PER_LONG < 64 */ + /* * These are the 'tuning knobs' of the scheduler: * @@ -133,6 +160,32 @@ static inline void sg_inc_cpu_power(stru } #endif +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +/* + * Tweaks for current + */ + +#ifdef CURRENT_PTR +struct task_struct * const ___current = &init_task; +struct task_struct ** const current_ptr = (struct task_struct ** const)&___current; +struct thread_info * const current_ti = &init_thread_union.thread_info; +struct thread_info ** const current_ti_ptr = (struct thread_info ** const)&current_ti; + +EXPORT_SYMBOL(___current); +EXPORT_SYMBOL(current_ti); + +/* + * The scheduler itself doesn't want 'current' to be cached + * during context-switches: + */ +# undef current +# define current __current() +# undef current_thread_info +# define current_thread_info() __current_thread_info() +#endif + static inline int rt_policy(int policy) { if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) @@ -186,12 +239,12 @@ static struct cfs_rq *init_cfs_rq_p[NR_C * Every task in system belong to this group at bootup. */ struct task_group init_task_group = { - .se = init_sched_entity_p, + .se = init_sched_entity_p, .cfs_rq = init_cfs_rq_p, }; #ifdef CONFIG_FAIR_USER_SCHED -# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD +# define INIT_TASK_GRP_LOAD (2*NICE_0_LOAD) #else # define INIT_TASK_GRP_LOAD NICE_0_LOAD #endif @@ -266,9 +319,49 @@ struct rt_rq { struct rt_prio_array active; int rt_load_balance_idx; struct list_head *rt_load_balance_head, *rt_load_balance_curr; + unsigned long rt_nr_running; + unsigned long rt_nr_migratory; + unsigned long rt_nr_uninterruptible; + /* highest queued rt task prio */ + int highest_prio; + int overloaded; +}; + +#ifdef CONFIG_SMP + +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member cpus from any other cpuset. Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. + * + */ +struct root_domain { + atomic_t refcount; + cpumask_t span; + cpumask_t online; + + /* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ + cpumask_t rto_mask; + atomic_t rto_count; +#ifdef CONFIG_SMP + struct cpupri cpupri; +#endif }; /* + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). + */ +static struct root_domain def_root_domain; + +#endif + +/* * This is the main, per-CPU runqueue data structure. 
* * Locking rule: those places that want to lock multiple runqueues @@ -277,7 +370,7 @@ struct rt_rq { */ struct rq { /* runqueue lock: */ - spinlock_t lock; + raw_spinlock_t lock; /* * nr_running and cpu_load should be in the same cacheline because @@ -310,6 +403,8 @@ struct rq { */ unsigned long nr_uninterruptible; + unsigned long switch_timestamp; + unsigned long slice_avg; struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; @@ -325,6 +420,7 @@ struct rq { atomic_t nr_iowait; #ifdef CONFIG_SMP + struct root_domain *rd; struct sched_domain *sd; /* For active balancing */ @@ -332,6 +428,7 @@ struct rq { int push_cpu; /* cpu of this runqueue: */ int cpu; + int online; struct task_struct *migration_thread; struct list_head migration_queue; @@ -358,6 +455,15 @@ struct rq { /* BKL stats */ unsigned int bkl_count; + + /* RT-overload stats: */ + unsigned long rto_schedule; + unsigned long rto_schedule_tail; + unsigned long rto_wakeup; + unsigned long rto_pulled; + unsigned long rto_pushed; + + unsigned long lb_breaks; #endif struct lock_class_key rq_lock_key; }; @@ -379,6 +485,8 @@ static inline int cpu_of(struct rq *rq) #endif } +#include "sched_trace.h" + /* * Update the per-runqueue clock, as finegrained as the platform can give * us, but without assuming monotonicity, etc.: @@ -450,6 +558,31 @@ static void update_rq_clock(struct rq *r # define const_debug static const #endif +/** + * runqueue_is_locked + * + * Returns true if the current cpu runqueue is locked. + * This interface allows printk to be called with the runqueue lock + * held and know whether or not it is OK to wake up the klogd. + */ +int runqueue_is_locked(void) +{ + int cpu = get_cpu(); + struct rq *rq = cpu_rq(cpu); + int ret; + + ret = spin_is_locked(&rq->lock); + put_cpu(); + return ret; +} + +#ifndef CONFIG_SMP +int task_is_current(struct task_struct *task) +{ + return task_rq(task)->curr == task; +} +#endif + /* * Debugging: various feature bits */ @@ -459,6 +592,7 @@ enum { SCHED_FEAT_START_DEBIT = 4, SCHED_FEAT_TREE_AVG = 8, SCHED_FEAT_APPROX_AVG = 16, + SCHED_FEAT_LB_BREAK = 32, }; const_debug unsigned int sysctl_sched_features = @@ -466,7 +600,8 @@ const_debug unsigned int sysctl_sched_fe SCHED_FEAT_WAKEUP_PREEMPT * 1 | SCHED_FEAT_START_DEBIT * 1 | SCHED_FEAT_TREE_AVG * 0 | - SCHED_FEAT_APPROX_AVG * 0; + SCHED_FEAT_APPROX_AVG * 0 | + SCHED_FEAT_LB_BREAK * 1; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) @@ -474,13 +609,17 @@ const_debug unsigned int sysctl_sched_fe * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ +#ifdef CONFIG_PREEMPT_RT +const_debug unsigned int sysctl_sched_nr_migrate = 8; +#else const_debug unsigned int sysctl_sched_nr_migrate = 32; +#endif /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): */ -unsigned long long cpu_clock(int cpu) +unsigned long long notrace cpu_clock(int cpu) { unsigned long long now; unsigned long flags; @@ -501,11 +640,23 @@ unsigned long long cpu_clock(int cpu) } EXPORT_SYMBOL_GPL(cpu_clock); +/* + * We really dont want to do anything complex within switch_to() + * on PREEMPT_RT - this check enforces this. 
+ */ +#ifdef prepare_arch_switch +# ifdef CONFIG_PREEMPT_RT +# error FIXME +# else +# define _finish_arch_switch finish_arch_switch +# endif +#endif + #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif #ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) +# define _finish_arch_switch(prev) do { } while (0) #endif static inline int task_current(struct rq *rq, struct task_struct *p) @@ -513,18 +664,39 @@ static inline int task_current(struct rq return rq->curr == p; } -#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline int task_running(struct rq *rq, struct task_struct *p) { +#ifdef CONFIG_SMP + return p->oncpu; +#else return task_current(rq, p); +#endif } +#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->oncpu = 1; +#endif } static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->oncpu = 0; +#endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq->lock.owner = current; @@ -536,18 +708,10 @@ static inline void finish_lock_switch(st */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - spin_unlock_irq(&rq->lock); + spin_unlock(&rq->lock); } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline int task_running(struct rq *rq, struct task_struct *p) -{ -#ifdef CONFIG_SMP - return p->oncpu; -#else - return task_current(rq, p); -#endif -} static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { @@ -577,8 +741,8 @@ static inline void finish_lock_switch(st smp_wmb(); prev->oncpu = 0; #endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_disable(); #endif } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ @@ -743,7 +907,7 @@ void wake_up_idle_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); - if (cpu == smp_processor_id()) + if (cpu == raw_smp_processor_id()) return; /* @@ -882,7 +1046,7 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); +static void activate_task(struct rq *rq, struct task_struct *p, int flags); /* * runqueue iterator, to support SMP load-balancing between different @@ -899,7 +1063,7 @@ struct rq_iterator { static unsigned long balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, + enum cpu_idle_type idle, int *lb_flags, int *this_best_prio, struct rq_iterator *iterator); static int @@ -914,6 +1078,13 @@ static void cpuacct_charge(struct task_s static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #endif +#ifdef CONFIG_SMP +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static unsigned long cpu_avg_load_per_task(int cpu); +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); +#endif /* CONFIG_SMP */ + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ 
-923,6 +1094,8 @@ static inline void cpuacct_charge(struct #endif #define sched_class_highest (&rt_sched_class) +#define for_each_class(class) \ + for (class = sched_class_highest; class; class = class->next) /* * Update delta_exec, delta_fair fields for rq. @@ -982,16 +1155,16 @@ static void set_load_weight(struct task_ p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; } -static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup); + p->sched_class->enqueue_task(rq, p, flags); p->se.on_rq = 1; } -static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) +static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { - p->sched_class->dequeue_task(rq, p, sleep); + p->sched_class->dequeue_task(rq, p, flags); p->se.on_rq = 0; } @@ -1018,6 +1191,8 @@ static inline int normal_prio(struct tas prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); + +// trace_special_pid(p->pid, PRIO(p), __PRIO(prio)); return prio; } @@ -1044,24 +1219,26 @@ static int effective_prio(struct task_st /* * activate_task - move a task to the runqueue. */ -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) +static void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - enqueue_task(rq, p, wakeup); + ftrace_event_task_activate(p, cpu_of(rq)); + enqueue_task(rq, p, flags); inc_nr_running(p, rq); } /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) +static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; - dequeue_task(rq, p, sleep); + ftrace_event_task_deactivate(p, cpu_of(rq)); + dequeue_task(rq, p, flags); dec_nr_running(p, rq); } @@ -1094,12 +1271,24 @@ static inline void __set_task_cpu(struct #endif } +static inline void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio, int running) +{ + if (prev_class != p->sched_class) { + if (prev_class->switched_from) + prev_class->switched_from(rq, p, running); + p->sched_class->switched_to(rq, p, running); + } else + p->sched_class->prio_changed(rq, p, oldprio, running); +} + #ifdef CONFIG_SMP /* * Is this task likely cache-hot: */ -static inline int +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) { s64 delta; @@ -1226,6 +1415,7 @@ void wait_task_inactive(struct task_stru * just go back and repeat. */ rq = task_rq_lock(p, &flags); + trace_kernel_sched_wait(p); running = task_running(rq, p); on_rq = p->se.on_rq; task_rq_unlock(rq, &flags); @@ -1324,7 +1514,7 @@ static unsigned long target_load(int cpu /* * Return the average load per task on the cpu's run queue */ -static inline unsigned long cpu_avg_load_per_task(int cpu) +static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); @@ -1481,58 +1671,6 @@ static int sched_balance_self(int cpu, i #endif /* CONFIG_SMP */ -/* - * wake_idle() will wake a task on an idle cpu if task->cpu is - * not idle and an idle cpu is available. The span of cpus to - * search starts with cpus closest then further out as needed, - * so we always favor a closer, idle cpu. 
- * - * Returns the CPU we should wake onto. - */ -#if defined(ARCH_HAS_SCHED_WAKE_IDLE) -static int wake_idle(int cpu, struct task_struct *p) -{ - cpumask_t tmp; - struct sched_domain *sd; - int i; - - /* - * If it is idle, then it is the best cpu to run this task. - * - * This cpu is also the best, if it has more than one task already. - * Siblings must be also busy(in most cases) as they didn't already - * pickup the extra load from this cpu and hence we need not check - * sibling runqueue info. This will avoid the checks and cache miss - * penalities associated with that. - */ - if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) - return cpu; - - for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_IDLE) { - cpus_and(tmp, sd->span, p->cpus_allowed); - for_each_cpu_mask(i, tmp) { - if (idle_cpu(i)) { - if (i != task_cpu(p)) { - schedstat_inc(p, - se.nr_wakeups_idle); - } - return i; - } - } - } else { - break; - } - } - return cpu; -} -#else -static inline int wake_idle(int cpu, struct task_struct *p) -{ - return cpu; -} -#endif - /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -1547,18 +1685,21 @@ static inline int wake_idle(int cpu, str * * returns failure only if the task is already active. */ -static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int sync, int mutex) { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; long old_state; struct rq *rq; -#ifdef CONFIG_SMP - struct sched_domain *sd, *this_sd = NULL; - unsigned long load, this_load; - int new_cpu; -#endif +#ifdef CONFIG_PREEMPT_RT + /* + * sync wakeups can increase wakeup latencies: + */ + if (rt_task(p)) + sync = 0; +#endif rq = task_rq_lock(p, &flags); old_state = p->state; if (!(old_state & state)) @@ -1575,92 +1716,9 @@ static int try_to_wake_up(struct task_st if (unlikely(task_running(rq, p))) goto out_activate; - new_cpu = cpu; - - schedstat_inc(rq, ttwu_count); - if (cpu == this_cpu) { - schedstat_inc(rq, ttwu_local); - goto out_set_cpu; - } - - for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_wake_remote); - this_sd = sd; - break; - } - } - - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) - goto out_set_cpu; - - /* - * Check for affine wakeup and passive balancing possibilities. - */ - if (this_sd) { - int idx = this_sd->wake_idx; - unsigned int imbalance; - - imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - - load = source_load(cpu, idx); - this_load = target_load(this_cpu, idx); - - new_cpu = this_cpu; /* Wake to this CPU if we can */ - - if (this_sd->flags & SD_WAKE_AFFINE) { - unsigned long tl = this_load; - unsigned long tl_per_task; - - /* - * Attract cache-cold tasks on sync wakeups: - */ - if (sync && !task_hot(p, rq->clock, this_sd)) - goto out_set_cpu; - - schedstat_inc(p, se.nr_wakeups_affine_attempts); - tl_per_task = cpu_avg_load_per_task(this_cpu); - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) - tl -= current->se.load.weight; - - if ((tl <= load && - tl + target_load(cpu, idx) <= tl_per_task) || - 100*(tl + p->se.load.weight) <= imbalance*load) { - /* - * This domain has SD_WAKE_AFFINE and - * p is cache cold in this domain, and - * there is no bad imbalance. 
- */ - schedstat_inc(this_sd, ttwu_move_affine); - schedstat_inc(p, se.nr_wakeups_affine); - goto out_set_cpu; - } - } - - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - if (this_sd->flags & SD_WAKE_BALANCE) { - if (imbalance*this_load <= 100*load) { - schedstat_inc(this_sd, ttwu_move_balance); - schedstat_inc(p, se.nr_wakeups_passive); - goto out_set_cpu; - } - } - } - - new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ -out_set_cpu: - new_cpu = wake_idle(new_cpu, p); - if (new_cpu != cpu) { - set_task_cpu(p, new_cpu); + cpu = p->sched_class->select_task_rq(p, sync); + if (cpu != orig_cpu) { + set_task_cpu(p, cpu); task_rq_unlock(rq, &flags); /* might preempt at this point */ rq = task_rq_lock(p, &flags); @@ -1674,6 +1732,21 @@ out_set_cpu: cpu = task_cpu(p); } +#ifdef CONFIG_SCHEDSTATS + schedstat_inc(rq, ttwu_count); + if (cpu == this_cpu) + schedstat_inc(rq, ttwu_local); + else { + struct sched_domain *sd; + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_wake_remote); + break; + } + } + } +#endif + out_activate: #endif /* CONFIG_SMP */ schedstat_inc(p, se.nr_wakeups); @@ -1686,12 +1759,30 @@ out_activate: else schedstat_inc(p, se.nr_wakeups_remote); update_rq_clock(rq); - activate_task(rq, p, 1); + activate_task(rq, p, ENQUEUE_WAKEUP); check_preempt_curr(rq, p); success = 1; out_running: - p->state = TASK_RUNNING; + trace_kernel_sched_wakeup(rq, p); + + /* + * For a mutex wakeup we or TASK_RUNNING_MUTEX to the task + * state to preserve the original state, so a real wakeup + * still can see the (UN)INTERRUPTIBLE bits in the state check + * above. We dont have to worry about the | TASK_RUNNING_MUTEX + * here. The waiter is serialized by the mutex lock and nobody + * else can fiddle with p->state as we hold rq lock. 
+ */ + if (mutex) + p->state |= TASK_RUNNING_MUTEX; + else + p->state = TASK_RUNNING; + +#ifdef CONFIG_SMP + if (p->sched_class->task_wake_up) + p->sched_class->task_wake_up(rq, p); +#endif out: task_rq_unlock(rq, &flags); @@ -1701,13 +1792,34 @@ out: int fastcall wake_up_process(struct task_struct *p) { return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | - TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); + TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0, 0); } EXPORT_SYMBOL(wake_up_process); +int fastcall wake_up_process_sync(struct task_struct * p) +{ + return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 1, 0); +} +EXPORT_SYMBOL(wake_up_process_sync); + +int fastcall wake_up_process_mutex(struct task_struct * p) +{ + return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0, 1); +} +EXPORT_SYMBOL(wake_up_process_mutex); + +int fastcall wake_up_process_mutex_sync(struct task_struct * p) +{ + return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 1, 1); +} +EXPORT_SYMBOL(wake_up_process_mutex_sync); + int fastcall wake_up_state(struct task_struct *p, unsigned int state) { - return try_to_wake_up(p, state, 0); + return try_to_wake_up(p, state, 0, 0); } /* @@ -1775,7 +1887,7 @@ void sched_fork(struct task_struct *p, i if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) +#if defined(CONFIG_SMP) p->oncpu = 0; #endif #ifdef CONFIG_PREEMPT @@ -1813,7 +1925,12 @@ void fastcall wake_up_new_task(struct ta p->sched_class->task_new(rq, p); inc_nr_running(p, rq); } + trace_kernel_sched_wakeup_new(rq, p); check_preempt_curr(rq, p); +#ifdef CONFIG_SMP + if (p->sched_class->task_wake_up) + p->sched_class->task_wake_up(rq, p); +#endif task_rq_unlock(rq, &flags); } @@ -1846,8 +1963,17 @@ static void fire_sched_in_preempt_notifi struct preempt_notifier *notifier; struct hlist_node *node; + if (hlist_empty(&curr->preempt_notifiers)) + return; + + /* + * The KVM sched in notifier expects to be called with + * interrupts enabled. 
+ */ + local_irq_enable(); hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) notifier->ops->sched_in(notifier, raw_smp_processor_id()); + local_irq_disable(); } static void @@ -1875,6 +2001,26 @@ fire_sched_out_preempt_notifiers(struct #endif +#ifdef CONFIG_DEBUG_PREEMPT +void notrace preempt_enable_no_resched(void) +{ + static int once = 1; + + barrier(); + dec_preempt_count(); + + if (once && !preempt_count()) { + once = 0; + printk(KERN_ERR "BUG: %s:%d task might have lost a preemption check!\n", + current->comm, current->pid); + dump_stack(); + } +} + +EXPORT_SYMBOL(preempt_enable_no_resched); +#endif + + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -1932,11 +2078,20 @@ static void finish_task_switch(struct rq * Manfred Spraul */ prev_state = prev->state; - finish_arch_switch(prev); + _finish_arch_switch(prev); finish_lock_switch(rq, prev); +#ifdef CONFIG_SMP + if (current->sched_class->post_schedule) + current->sched_class->post_schedule(rq); +#endif + fire_sched_in_preempt_notifiers(current); + /* + * Delay the final freeing of the mm or task, so that we dont have + * to do complex work from within the scheduler: + */ if (mm) - mmdrop(mm); + mmdrop_delayed(mm); if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this @@ -1954,12 +2109,15 @@ static void finish_task_switch(struct rq asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq = this_rq(); - - finish_task_switch(rq, prev); + preempt_disable(); // TODO: move this to fork setup + finish_task_switch(this_rq(), prev); + __preempt_enable_no_resched(); + local_irq_enable(); #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); +#else + preempt_check_resched(); #endif if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); @@ -1976,6 +2134,8 @@ context_switch(struct rq *rq, struct tas struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); + + trace_kernel_sched_switch(rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* @@ -2006,6 +2166,11 @@ context_switch(struct rq *rq, struct tas spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #endif +#ifdef CURRENT_PTR + barrier(); + *current_ptr = next; + *current_ti_ptr = next->thread_info; +#endif /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -2052,6 +2217,11 @@ unsigned long nr_uninterruptible(void) return sum; } +unsigned long nr_uninterruptible_cpu(int cpu) +{ + return cpu_rq(cpu)->nr_uninterruptible; +} + unsigned long long nr_context_switches(void) { int i; @@ -2070,6 +2240,13 @@ unsigned long nr_iowait(void) for_each_possible_cpu(i) sum += atomic_read(&cpu_rq(i)->nr_iowait); + /* + * Since we read the counters lockless, it might be slightly + * inaccurate. Do not allow it to go below zero though: + */ + if (unlikely((long)sum < 0)) + sum = 0; + return sum; } @@ -2120,6 +2297,10 @@ static void update_cpu_load(struct rq *t #ifdef CONFIG_SMP +#define LB_ALL_PINNED 0x01 +#define LB_COMPLETE 0x02 +#define LB_START 0x03 + /* * double_rq_lock - safely lock two runqueues * @@ -2167,11 +2348,13 @@ static void double_rq_unlock(struct rq * /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
*/ -static void double_lock_balance(struct rq *this_rq, struct rq *busiest) +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) __releases(this_rq->lock) __acquires(busiest->lock) __acquires(this_rq->lock) { + int ret = 0; + if (unlikely(!irqs_disabled())) { /* printk() doesn't work good under rq->lock */ spin_unlock(&this_rq->lock); @@ -2182,9 +2365,11 @@ static void double_lock_balance(struct r spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); spin_lock(&this_rq->lock); + ret = 1; } else spin_lock(&busiest->lock); } + return ret; } /* @@ -2204,6 +2389,7 @@ static void sched_migrate_task(struct ta || unlikely(cpu_is_offline(dest_cpu))) goto out; + trace_kernel_sched_migrate_task(p, cpu_of(rq), dest_cpu); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ @@ -2257,7 +2443,7 @@ static void pull_task(struct rq *src_rq, static int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) + int *lb_flags) { /* * We do not migrate tasks that are: @@ -2269,7 +2455,7 @@ int can_migrate_task(struct task_struct schedstat_inc(p, se.nr_failed_migrations_affine); return 0; } - *all_pinned = 0; + *lb_flags &= ~LB_ALL_PINNED; if (task_running(rq, p)) { schedstat_inc(p, se.nr_failed_migrations_running); @@ -2303,7 +2489,7 @@ int can_migrate_task(struct task_struct static unsigned long balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, + enum cpu_idle_type idle, int *lb_flags, int *this_best_prio, struct rq_iterator *iterator) { int loops = 0, pulled = 0, pinned = 0, skip_for_load; @@ -2313,15 +2499,24 @@ balance_tasks(struct rq *this_rq, int th if (max_load_move == 0) goto out; - pinned = 1; - /* * Start the load-balancing iterator: */ - p = iterator->start(iterator->arg); + if (*lb_flags & LB_START) + p = iterator->start(iterator->arg); + else + p = iterator->next(iterator->arg); + + if (p) + pinned = 1; next: - if (!p || loops++ > sysctl_sched_nr_migrate) + if (!p) + goto out; + + if (loops++ > sysctl_sched_nr_migrate) { + *lb_flags &= ~LB_COMPLETE; goto out; + } /* * To help distribute high priority tasks across CPUs we don't * skip a task if it will be the highest priority task (i.e. smallest @@ -2356,12 +2551,27 @@ out: */ schedstat_add(sd, lb_gained[idle], pulled); - if (all_pinned) - *all_pinned = pinned; + if (pinned) + *lb_flags |= LB_ALL_PINNED; return max_load_move - rem_load_move; } +static int is_runnable(struct rq *this_rq, const struct sched_class *target_class) +{ + const struct sched_class *class = sched_class_highest; + + for (; class; class = class->next) { + if (class->is_runnable(this_rq)) + return 1; + + if (class == target_class) + break; + } + + return 0; +} + /* * move_tasks tries to move up to max_load_move weighted load from busiest to * this_rq, as part of a balancing operation within domain "sd". 
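The LB_COMPLETE/LB_START handshake in the surrounding hunks is the core of the LB_BREAK feature: balance_tasks() clears LB_COMPLETE when it hits the migration budget, and move_tasks() then drops both runqueue locks, cond_resched()s, and resumes the same class via iterator->next() instead of iterator->start(). A standalone model of that control flow, not the scheduler code (all locking and sched_class plumbing is stubbed out; BUDGET stands in for sysctl_sched_nr_migrate):

#include <stdio.h>

#define LB_ALL_PINNED	0x01
#define LB_COMPLETE	0x02
#define LB_START	0x03	/* value copied from the hunk above */

#define BUDGET 8	/* stands in for sysctl_sched_nr_migrate */

/* stub for one class's load_balance(): pull at most BUDGET tasks per call */
static int class_balance(int *remaining, int *flags)
{
	int pulled = 0;

	while (*remaining > 0) {
		if (pulled >= BUDGET) {
			*flags &= ~LB_COMPLETE;	/* budget hit: ask to be resumed */
			return pulled;
		}
		(*remaining)--;
		pulled++;
	}
	return pulled;	/* ran to completion, LB_COMPLETE stays set */
}

int main(void)
{
	int remaining = 20, flags = LB_START, rounds = 0;

	do {
		flags |= LB_COMPLETE;
		class_balance(&remaining, &flags);
		rounds++;
		if (flags & LB_COMPLETE)
			break;		/* class fully scanned, move on */
		flags &= ~LB_START;	/* real code resumes with ->next() */
		/* the real code drops both rq locks and cond_resched()s here */
	} while (1);

	printf("drained in %d rounds, %d tasks left\n", rounds, remaining);
	return 0;
}

Note that LB_START as defined overlaps LB_ALL_PINNED|LB_COMPLETE; the model above only ever clears that bit and never tests it, which sidesteps the aliasing.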
@@ -2372,18 +2582,41 @@ out: static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) + int *lb_flags) { const struct sched_class *class = sched_class_highest; unsigned long total_load_moved = 0; int this_best_prio = this_rq->curr->prio; + *lb_flags |= LB_START; + do { - total_load_moved += - class->load_balance(this_rq, this_cpu, busiest, - max_load_move - total_load_moved, - sd, idle, all_pinned, &this_best_prio); - class = class->next; + *lb_flags |= LB_COMPLETE; + + total_load_moved += class->load_balance(this_rq, this_cpu, + busiest, max_load_move - total_load_moved, + sd, idle, lb_flags, &this_best_prio); + + if (idle == CPU_NEWLY_IDLE && + is_runnable(this_rq, class)) + return 1; + + if (*lb_flags & LB_COMPLETE) { + class = class->next; + *lb_flags |= LB_START; + } else if (sched_feat(LB_BREAK)) { + *lb_flags &= ~LB_START; + schedstat_inc(this_rq, lb_breaks); + + double_rq_unlock(this_rq, busiest); + local_irq_enable(); + + if (!in_atomic()) + cond_resched(); + + local_irq_disable(); + double_rq_lock(this_rq, busiest); + } } while (class && max_load_move > total_load_moved); return total_load_moved > 0; @@ -2784,7 +3017,7 @@ static int load_balance(int this_cpu, st struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; + int ld_moved, lb_flags = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; @@ -2827,6 +3060,9 @@ redo: ld_moved = 0; if (busiest->nr_running > 1) { + + WARN_ON(irqs_disabled()); + /* * Attempt to move tasks. If find_busiest_group has found * an imbalance but busiest->nr_running <= 1, the group is @@ -2836,7 +3072,7 @@ redo: local_irq_save(flags); double_rq_lock(this_rq, busiest); ld_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, &all_pinned); + imbalance, sd, idle, &lb_flags); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -2847,7 +3083,7 @@ redo: resched_cpu(this_cpu); /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(all_pinned)) { + if (unlikely(lb_flags & LB_ALL_PINNED)) { cpu_clear(cpu_of(busiest), cpus); if (!cpus_empty(cpus)) goto redo; @@ -2868,7 +3104,7 @@ redo: */ if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { spin_unlock_irqrestore(&busiest->lock, flags); - all_pinned = 1; + lb_flags |= LB_ALL_PINNED; goto out_one_pinned; } @@ -2916,7 +3152,8 @@ out_balanced: out_one_pinned: /* tune up the balancing interval */ - if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || + if (((lb_flags & LB_ALL_PINNED) && + sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; @@ -2941,7 +3178,7 @@ load_balance_newidle(int this_cpu, struc unsigned long imbalance; int ld_moved = 0; int sd_idle = 0; - int all_pinned = 0; + int lb_flags = 0; cpumask_t cpus = CPU_MASK_ALL; /* @@ -2982,10 +3219,10 @@ redo: update_rq_clock(busiest); ld_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, CPU_NEWLY_IDLE, - &all_pinned); + &lb_flags); spin_unlock(&busiest->lock); - if (unlikely(all_pinned)) { + if (unlikely(lb_flags & LB_ALL_PINNED)) { cpu_clear(cpu_of(busiest), cpus); if (!cpus_empty(cpus)) goto redo; @@ -3256,7 +3493,7 @@ out: */ static void run_rebalance_domains(struct softirq_action *h) { - int this_cpu = smp_processor_id(); + int this_cpu = raw_smp_processor_id(); struct rq 
*this_rq = cpu_rq(this_cpu); enum cpu_idle_type idle = this_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE; @@ -3408,7 +3645,9 @@ void account_user_time(struct task_struc /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) + if (rt_task(p)) + cpustat->user_rt = cputime64_add(cpustat->user_rt, tmp); + else if (TASK_NICE(p) > 0) cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); @@ -3463,10 +3702,12 @@ void account_system_time(struct task_str /* Add system time to cpustat. */ tmp = cputime_to_cputime64(cputime); - if (hardirq_count() - hardirq_offset) + if (hardirq_count() - hardirq_offset || (p->flags & PF_HARDIRQ)) cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) + else if (softirq_count() || (p->flags & PF_SOFTIRQ)) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + else if (rt_task(p)) + cpustat->system_rt = cputime64_add(cpustat->system_rt, tmp); else if (p != rq->idle) cpustat->system = cputime64_add(cpustat->system, tmp); else if (atomic_read(&rq->nr_iowait) > 0) @@ -3523,6 +3764,8 @@ void scheduler_tick(void) struct task_struct *curr = rq->curr; u64 next_tick = rq->tick_timestamp + TICK_NSEC; + BUG_ON(!irqs_disabled()); + spin_lock(&rq->lock); __update_rq_clock(rq); /* @@ -3532,7 +3775,7 @@ void scheduler_tick(void) rq->clock = next_tick; rq->tick_timestamp = rq->clock; update_cpu_load(rq); - if (curr != rq->idle) /* FIXME: needed? */ + if (curr != rq->idle && curr->se.on_rq) curr->sched_class->task_tick(rq, curr); spin_unlock(&rq->lock); @@ -3542,26 +3785,56 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + +static inline unsigned long get_parent_ip(unsigned long addr) +{ + if (in_lock_functions(addr)) { + addr = CALLER_ADDR2; + if (in_lock_functions(addr)) + addr = CALLER_ADDR3; + } + return addr; +} void fastcall add_preempt_count(int val) { + unsigned long eip = CALLER_ADDR0; + unsigned long parent_eip = get_parent_ip(CALLER_ADDR1); + +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; +#endif preempt_count() += val; +#ifdef CONFIG_PREEMPT_TRACE + if (val <= 10) { + unsigned int idx = preempt_count() & PREEMPT_MASK; + if (idx < MAX_PREEMPT_TRACE) { + current->preempt_trace_eip[idx] = eip; + current->preempt_trace_parent_eip[idx] = parent_eip; + } + } +#endif +#ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? */ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); +#endif + if (preempt_count() == val) + trace_preempt_off(eip, parent_eip); } EXPORT_SYMBOL(add_preempt_count); void fastcall sub_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? 
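 * (i.e. preempt_count() going negative: more sub_preempt_count()
 * calls than matching add_preempt_count() calls)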
*/ @@ -3573,7 +3846,10 @@ void fastcall sub_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK))) return; +#endif + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); preempt_count() -= val; } EXPORT_SYMBOL(sub_preempt_count); @@ -3587,8 +3863,8 @@ static noinline void __schedule_bug(stru { struct pt_regs *regs = get_irq_regs(); - printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); + printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d, CPU#%d\n", + prev->comm, preempt_count(), prev->pid, smp_processor_id()); debug_show_held_locks(prev); if (irqs_disabled()) @@ -3605,6 +3881,8 @@ static noinline void __schedule_bug(stru */ static inline void schedule_debug(struct task_struct *prev) { + WARN_ON(system_state == SYSTEM_BOOTING); + /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. @@ -3659,14 +3937,15 @@ pick_next_task(struct rq *rq, struct tas /* * schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +asmlinkage void __sched __schedule(void) { struct task_struct *prev, *next; long *switch_count; struct rq *rq; int cpu; -need_resched: + rcu_preempt_boost(); + preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -3675,7 +3954,6 @@ need_resched: switch_count = &prev->nivcsw; release_kernel_lock(prev); -need_resched_nonpreemptible: schedule_debug(prev); @@ -3685,18 +3963,30 @@ need_resched_nonpreemptible: local_irq_disable(); __update_rq_clock(rq); spin_lock(&rq->lock); + cpu = smp_processor_id(); clear_tsk_need_resched(prev); + clear_tsk_need_resched_delayed(prev); - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + if (!(prev->state & TASK_RUNNING_MUTEX) && prev->state && + !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - unlikely(signal_pending(prev)))) { + unlikely(signal_pending(prev)))) { prev->state = TASK_RUNNING; } else { - deactivate_task(rq, prev, 1); + touch_softlockup_watchdog(); + deactivate_task(rq, prev, DEQUEUE_SLEEP); } switch_count = &prev->nvcsw; } + if (preempt_count() & PREEMPT_ACTIVE) + sub_preempt_count(PREEMPT_ACTIVE); + +#ifdef CONFIG_SMP + if (prev->sched_class->pre_schedule) + prev->sched_class->pre_schedule(rq, prev); +#endif + if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); @@ -3711,21 +4001,90 @@ need_resched_nonpreemptible: ++*switch_count; context_switch(rq, prev, next); /* unlocks the rq */ - } else - spin_unlock_irq(&rq->lock); + __preempt_enable_no_resched(); + } else { + __preempt_enable_no_resched(); + spin_unlock(&rq->lock); + } - if (unlikely(reacquire_kernel_lock(current) < 0)) { - cpu = smp_processor_id(); - rq = cpu_rq(cpu); - goto need_resched_nonpreemptible; + reacquire_kernel_lock(current); + if (!irqs_disabled()) { + static int once = 1; + if (once) { + once = 0; + print_irqtrace_events(current); + WARN_ON(1); + } + } +} + +/* + * schedule() is the main scheduler function. + */ +asmlinkage void __sched schedule(void) +{ + WARN_ON(system_state == SYSTEM_BOOTING); + /* + * Test if we have interrupts disabled. 
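+	 * schedule() is only valid with interrupts enabled; lowlevel
+	 * arch code that already runs irqs-off is expected to call
+	 * __schedule() directly, bracketed the way the loop below
+	 * does it:
+	 *
+	 *	local_irq_disable();
+	 *	__schedule();
+	 *	local_irq_enable();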
+ */ + if (unlikely(irqs_disabled())) { + printk(KERN_ERR "BUG: scheduling with irqs disabled: " + "%s/0x%08x/%d\n", current->comm, preempt_count(), + current->pid); + print_symbol("caller is %s\n", + (long)__builtin_return_address(0)); + dump_stack(); + } + + if (unlikely(current->flags & PF_NOSCHED)) { + current->flags &= ~PF_NOSCHED; + printk(KERN_ERR "%s:%d userspace BUG: scheduling in " + "user-atomic context!\n", current->comm, current->pid); + dump_stack(); + send_sig(SIGUSR2, current, 1); } - preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; + + local_irq_disable(); + + do { + __schedule(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED) || + test_thread_flag(TIF_NEED_RESCHED_DELAYED))); + + local_irq_enable(); } EXPORT_SYMBOL(schedule); #ifdef CONFIG_PREEMPT + +/* + * Global flag to turn preemption off on a CONFIG_PREEMPT kernel: + */ +int kernel_preemption = 1; + +static int __init preempt_setup(char *str) +{ + if (!strncmp(str, "off", 3)) { + if (kernel_preemption) { + printk(KERN_INFO "turning off kernel preemption!\n"); + kernel_preemption = 0; + } + return 1; + } + if (!strncmp(str, "on", 2)) { + if (!kernel_preemption) { + printk(KERN_INFO "turning on kernel preemption!\n"); + kernel_preemption = 1; + } + return 1; + } + get_option(&str, &kernel_preemption); + + return 1; +} + +__setup("preempt=", preempt_setup); + /* * this is the entry point to schedule() from in-kernel preemption * off of preempt_enable. Kernel preemptions off return from interrupt @@ -3738,6 +4097,8 @@ asmlinkage void __sched preempt_schedule struct task_struct *task = current; int saved_lock_depth; #endif + if (!kernel_preemption) + return; /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. */ @@ -3746,6 +4107,7 @@ asmlinkage void __sched preempt_schedule return; do { + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); /* @@ -3757,11 +4119,11 @@ asmlinkage void __sched preempt_schedule saved_lock_depth = task->lock_depth; task->lock_depth = -1; #endif - schedule(); + __schedule(); #ifdef CONFIG_PREEMPT_BKL task->lock_depth = saved_lock_depth; #endif - sub_preempt_count(PREEMPT_ACTIVE); + local_irq_enable(); /* * Check again in case we missed a preemption opportunity @@ -3773,10 +4135,10 @@ asmlinkage void __sched preempt_schedule EXPORT_SYMBOL(preempt_schedule); /* - * this is the entry point to schedule() from kernel preemption - * off of irq context. - * Note, that this is called and return with irqs disabled. This will - * protect us against recursive calling from irq. + * this is the entry point for the IRQ return path. Called with + * interrupts disabled. To avoid infinite irq-entry recursion problems + * with fast-paced IRQ sources we do all of this carefully to never + * enable interrupts again. */ asmlinkage void __sched preempt_schedule_irq(void) { @@ -3785,10 +4147,18 @@ asmlinkage void __sched preempt_schedule struct task_struct *task = current; int saved_lock_depth; #endif - /* Catch callers which need to be fixed */ - BUG_ON(ti->preempt_count || !irqs_disabled()); + + if (!kernel_preemption) + return; + /* + * If there is a non-zero preempt_count then just return.
+ * (interrupts are disabled) + */ + if (unlikely(ti->preempt_count)) + return; do { + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); /* @@ -3800,13 +4170,12 @@ asmlinkage void __sched preempt_schedule saved_lock_depth = task->lock_depth; task->lock_depth = -1; #endif - local_irq_enable(); - schedule(); + __schedule(); + local_irq_disable(); #ifdef CONFIG_PREEMPT_BKL task->lock_depth = saved_lock_depth; #endif - sub_preempt_count(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -3821,7 +4190,7 @@ asmlinkage void __sched preempt_schedule int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) { - return try_to_wake_up(curr->private, mode, sync); + return try_to_wake_up(curr->private, mode, sync, 0); } EXPORT_SYMBOL(default_wake_function); @@ -3861,8 +4230,9 @@ void fastcall __wake_up(wait_queue_head_ unsigned long flags; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); + __wake_up_common(q, mode, nr_exclusive, 1, key); spin_unlock_irqrestore(&q->lock, flags); + preempt_check_resched_delayed(); } EXPORT_SYMBOL(__wake_up); @@ -3912,8 +4282,9 @@ void complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done++; __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 1, 0, NULL); + 1, 1, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); + preempt_check_resched_delayed(); } EXPORT_SYMBOL(complete); @@ -3924,11 +4295,18 @@ void complete_all(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 0, 0, NULL); + 0, 1, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); + preempt_check_resched_delayed(); } EXPORT_SYMBOL(complete_all); +unsigned int fastcall completion_done(struct completion *x) +{ + return x->done; +} +EXPORT_SYMBOL(completion_done); + static inline long __sched do_wait_for_common(struct completion *x, long timeout, int state) { @@ -4045,10 +4423,8 @@ long __sched sleep_on_timeout(wait_queue } EXPORT_SYMBOL(sleep_on_timeout); -#ifdef CONFIG_RT_MUTEXES - /* - * rt_mutex_setprio - set the current priority of a task + * task_setprio - set the current priority of a task * @p: task * @prio: prio value (kernel-internal form) * @@ -4057,15 +4433,35 @@ EXPORT_SYMBOL(sleep_on_timeout); * * Used by the rt_mutex code to implement priority inheritance logic. */ -void rt_mutex_setprio(struct task_struct *p, int prio) +void task_setprio(struct task_struct *p, int prio) { unsigned long flags; - int oldprio, on_rq, running; + int oldprio, prev_resched, on_rq, running, down; struct rq *rq; + const struct sched_class *prev_class = p->sched_class; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); + + /* + * Idle task boosting is a nono in general. There is one + * exception, when NOHZ is active: + * + * The idle task calls get_next_timer_interrupt() and holds + * the timer wheel base->lock on the CPU and another CPU wants + * to access the timer (probably to cancel it). We can safely + * ignore the boosting request, as the idle CPU runs this code + * with interrupts disabled and will complete the lock + * protected section without being interrupted. So there is no + * real need to boost. 
+ */ + if (unlikely(p == rq->idle)) { + WARN_ON(p != rq->curr); + WARN_ON(p->pi_blocked_on); + goto out_unlock; + } + update_rq_clock(rq); oldprio = p->prio; @@ -4081,29 +4477,24 @@ void rt_mutex_setprio(struct task_struct else p->sched_class = &fair_sched_class; + down = (prio > p->prio) ? ENQUEUE_HEAD : 0; p->prio = prio; +// trace_special_pid(p->pid, __PRIO(oldprio), PRIO(p)); + prev_resched = _need_resched(); + if (running) p->sched_class->set_curr_task(rq); if (on_rq) { - enqueue_task(rq, p, 0); - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's - */ - if (running) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else { - check_preempt_curr(rq, p); - } + enqueue_task(rq, p, down); + check_class_changed(rq, p, prev_class, oldprio, running); } +// trace_special(prev_resched, _need_resched(), 0); + +out_unlock: task_rq_unlock(rq, &flags); } -#endif - void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, on_rq; @@ -4300,6 +4691,7 @@ int sched_setscheduler(struct task_struc { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; + const struct sched_class *prev_class = p->sched_class; struct rq *rq; /* may grab non-irq protected spin_locks */ @@ -4393,17 +4785,7 @@ recheck: p->sched_class->set_curr_task(rq); if (on_rq) { activate_task(rq, p, 0); - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's - */ - if (running) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else { - check_preempt_curr(rq, p); - } + check_class_changed(rq, p, prev_class, oldprio, running); } __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); @@ -4697,19 +5079,19 @@ asmlinkage long sys_sched_yield(void) * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - __release(rq->lock); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - _raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); + spin_unlock_no_resched(&rq->lock); - schedule(); + __schedule(); + + local_irq_enable(); + preempt_check_resched(); return 0; } static void __cond_resched(void) { -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) __might_sleep(__FILE__, __LINE__); #endif /* @@ -4718,10 +5100,11 @@ static void __cond_resched(void) * cond_resched() call. */ do { + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + __schedule(); } while (need_resched()); + local_irq_enable(); } int __sched cond_resched(void) @@ -4743,32 +5126,53 @@ EXPORT_SYMBOL(cond_resched); * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). 
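 *
 * A typical caller walks a long lock-protected list and revalidates
 * its cursor whenever the lock was dropped, e.g. (hypothetical
 * helpers, not from this patch):
 *
 *	spin_lock(&lock);
 *	for (e = first_entry(); e; e = next_entry(e)) {
 *		if (cond_resched_lock(&lock))
 *			e = revalidate_cursor();
 *	}
 *	spin_unlock(&lock);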
*/ -int cond_resched_lock(spinlock_t *lock) +int __cond_resched_raw_spinlock(raw_spinlock_t *lock) { int ret = 0; - if (need_lockbreak(lock)) { + if (need_lockbreak_raw(lock)) { spin_unlock(lock); cpu_relax(); ret = 1; spin_lock(lock); } if (need_resched() && system_state == SYSTEM_RUNNING) { - spin_release(&lock->dep_map, 1, _THIS_IP_); - _raw_spin_unlock(lock); - preempt_enable_no_resched(); + spin_unlock_no_resched(lock); __cond_resched(); ret = 1; spin_lock(lock); } return ret; } -EXPORT_SYMBOL(cond_resched_lock); +EXPORT_SYMBOL(__cond_resched_raw_spinlock); -int __sched cond_resched_softirq(void) +#ifdef CONFIG_PREEMPT_RT + +int __cond_resched_spinlock(spinlock_t *lock) { - BUG_ON(!in_softirq()); +#if (defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)) || defined(CONFIG_PREEMPT_RT) + if (lock->break_lock) { + lock->break_lock = 0; + spin_unlock_no_resched(lock); + __cond_resched(); + spin_lock(lock); + return 1; + } +#endif + return 0; +} +EXPORT_SYMBOL(__cond_resched_spinlock); + +#endif +/* + * Voluntarily preempt a process context that has softirqs disabled: + */ +int __sched cond_resched_softirq(void) +{ +#ifndef CONFIG_PREEMPT_SOFTIRQS + WARN_ON_ONCE(!in_softirq()); +#endif if (need_resched() && system_state == SYSTEM_RUNNING) { local_bh_enable(); __cond_resched(); @@ -4779,17 +5183,102 @@ int __sched cond_resched_softirq(void) } EXPORT_SYMBOL(cond_resched_softirq); +/* + * Voluntarily preempt a softirq context (possible with softirq threading): + */ +int __sched cond_resched_softirq_context(void) +{ + WARN_ON_ONCE(!in_softirq()); + + if (softirq_need_resched() && system_state == SYSTEM_RUNNING) { + raw_local_irq_disable(); + _local_bh_enable(); + raw_local_irq_enable(); + __cond_resched(); + local_bh_disable(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(cond_resched_softirq_context); + +/* + * Preempt a hardirq context if necessary (possible with hardirq threading): + */ +int cond_resched_hardirq_context(void) +{ + WARN_ON_ONCE(!in_irq()); + WARN_ON_ONCE(!irqs_disabled()); + + if (hardirq_need_resched()) { +#ifndef CONFIG_PREEMPT_RT + irq_exit(); +#endif + local_irq_enable(); + __cond_resched(); +#ifndef CONFIG_PREEMPT_RT + local_irq_disable(); + __irq_enter(); +#endif + + return 1; + } + return 0; +} +EXPORT_SYMBOL(cond_resched_hardirq_context); + +#ifdef CONFIG_PREEMPT_VOLUNTARY + +int voluntary_preemption = 1; + +EXPORT_SYMBOL(voluntary_preemption); + +static int __init voluntary_preempt_setup(char *str) +{ + if (!strncmp(str, "off", 3)) + voluntary_preemption = 0; + else + get_option(&str, &voluntary_preemption); + if (!voluntary_preemption) + printk("turning off voluntary preemption!\n"); + + return 1; +} + +__setup("voluntary-preempt=", voluntary_preempt_setup); + +#endif + /** * yield - yield the current processor to other threads. * * This is a shortcut for kernel-space yielding - it marks the * thread runnable and calls sys_sched_yield(). */ -void __sched yield(void) +void __sched __yield(void) { set_current_state(TASK_RUNNING); sys_sched_yield(); } + +void __sched yield(void) +{ + static int once = 1; + + /* + * it's a bug to rely on yield() with RT priorities. We print + * the first occurrence after bootup ...
this will still give + * us an idea about the scope of the problem, without spamming + * the syslog: + */ + if (once && rt_task(current)) { + once = 0; + printk(KERN_ERR "BUG: %s:%d RT task yield()-ing!\n", + current->comm, current->pid); + dump_stack(); + } + __yield(); +} EXPORT_SYMBOL(yield); /* @@ -4909,7 +5398,7 @@ long sys_sched_rr_get_interval(pid_t pid time_slice = 0; if (p->policy == SCHED_RR) { time_slice = DEF_TIMESLICE; - } else { + } else if (p->policy != SCHED_FIFO) { struct sched_entity *se = &p->se; unsigned long flags; struct rq *rq; @@ -4929,7 +5418,7 @@ out_unlock: return retval; } -static const char stat_nam[] = "RSDTtZX"; +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; static void show_task(struct task_struct *p) { @@ -4937,19 +5426,24 @@ static void show_task(struct task_struct unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; - printk(KERN_INFO "%-13.13s %c", p->comm, - state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); + printk("%-13.13s %c (%03lx) [%p]", p->comm, + state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?', + (unsigned long) p->state, p); #if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) + if (0 && (state == TASK_RUNNING)) printk(KERN_CONT " running "); else printk(KERN_CONT " %08lx ", thread_saved_pc(p)); #else - if (state == TASK_RUNNING) + if (0 && (state == TASK_RUNNING)) printk(KERN_CONT " running task "); else printk(KERN_CONT " %016lx ", thread_saved_pc(p)); #endif + if (task_curr(p)) + printk("[curr] "); + else if (p->se.on_rq) + printk("[on rq #%d] ", task_cpu(p)); #ifdef CONFIG_DEBUG_STACK_USAGE { unsigned long *n = end_of_stack(p); @@ -4968,6 +5462,7 @@ static void show_task(struct task_struct void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; + int do_unlock = 1; #if BITS_PER_LONG == 32 printk(KERN_INFO @@ -4976,7 +5471,16 @@ void show_state_filter(unsigned long sta printk(KERN_INFO " task PC stack pid father\n"); #endif +#ifdef CONFIG_PREEMPT_RT + if (!read_trylock(&tasklist_lock)) { + printk("hm, tasklist_lock write-locked.\n"); + printk("ignoring ...\n"); + do_unlock = 0; + } +#else read_lock(&tasklist_lock); +#endif + do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow @@ -4992,7 +5496,8 @@ void show_state_filter(unsigned long sta #ifdef CONFIG_SCHED_DEBUG sysrq_sched_debug_show(); #endif - read_unlock(&tasklist_lock); + if (do_unlock) + read_unlock(&tasklist_lock); /* * Only show locks if all tasks are dumped: */ @@ -5027,13 +5532,15 @@ void __cpuinit init_idle(struct task_str spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) +#if defined(CONFIG_SMP) idle->oncpu = 1; #endif spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ -#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) +#if defined(CONFIG_PREEMPT) && \ + !defined(CONFIG_PREEMPT_BKL) && \ + !defined(CONFIG_PREEMPT_RT) task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); #else task_thread_info(idle)->preempt_count = 0; @@ -5118,7 +5625,13 @@ int set_cpus_allowed(struct task_struct goto out; } - p->cpus_allowed = new_mask; + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, &new_mask); + else { + p->cpus_allowed = new_mask; + p->nr_cpus_allowed = cpus_weight(new_mask); + } + /* Can the task run on the task's current CPU? 
If so, we're done */ if (cpu_isset(task_cpu(p), new_mask)) goto out; @@ -5152,11 +5665,18 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { struct rq *rq_dest, *rq_src; + unsigned long flags; int ret = 0, on_rq; if (unlikely(cpu_is_offline(dest_cpu))) return ret; + /* + * PREEMPT_RT: this relies on write_lock_irq(&tasklist_lock) + * disabling interrupts - which it does not do on PREEMPT_RT: + */ + local_irq_save(flags); + rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); @@ -5180,6 +5700,8 @@ static int __migrate_task(struct task_st ret = 1; out: double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); + return ret; } @@ -5385,7 +5907,11 @@ void idle_task_exit(void) if (mm != &init_mm) switch_mm(mm, &init_mm, current); +#ifdef CONFIG_PREEMPT_RT + mmdrop_rcu(mm); +#else mmdrop(mm); +#endif } /* called under rq->lock with disabled interrupts */ @@ -5597,6 +6123,36 @@ static void unregister_sched_domain_sysc } #endif +static void set_rq_online(struct rq *rq) +{ + if (!rq->online) { + const struct sched_class *class; + + cpu_set(rq->cpu, rq->rd->online); + rq->online = 1; + + for_each_class(class) { + if (class->rq_online) + class->rq_online(rq); + } + } +} + +static void set_rq_offline(struct rq *rq) +{ + if (rq->online) { + const struct sched_class *class; + + for_each_class(class) { + if (class->rq_offline) + class->rq_offline(rq); + } + + cpu_clear(rq->cpu, rq->rd->online); + rq->online = 0; + } +} + /* * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. @@ -5631,6 +6187,16 @@ migration_call(struct notifier_block *nf case CPU_ONLINE_FROZEN: /* Strictly unnecessary, as first user will wake it.
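 * The CPU_ONLINE path below now also marks the runqueue online in
 * its root domain, under rq->lock (slightly simplified):
 *
 *	spin_lock_irqsave(&rq->lock, flags);
 *	if (rq->rd)
 *		set_rq_online(rq);
 *	spin_unlock_irqrestore(&rq->lock, flags);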
*/ wake_up_process(cpu_rq(cpu)->migration_thread); + + /* Update our root-domain */ + rq = cpu_rq(cpu); + spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpu_isset(cpu, rq->rd->span)); + + set_rq_online(rq); + } + spin_unlock_irqrestore(&rq->lock, flags); break; #ifdef CONFIG_HOTPLUG_CPU @@ -5681,6 +6247,17 @@ migration_call(struct notifier_block *nf } spin_unlock_irq(&rq->lock); break; + + case CPU_DYING: + /* Update our root-domain */ + rq = cpu_rq(cpu); + spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpu_isset(cpu, rq->rd->span)); + set_rq_offline(rq); + } + spin_unlock_irqrestore(&rq->lock, flags); + break; #endif case CPU_LOCK_RELEASE: mutex_unlock(&sched_hotcpu_mutex); @@ -5872,11 +6449,75 @@ sd_parent_degenerate(struct sched_domain return 1; } +static void rq_attach_root(struct rq *rq, struct root_domain *rd) +{ + unsigned long flags; + struct root_domain *reap = NULL; + + spin_lock_irqsave(&rq->lock, flags); + + if (rq->rd) { + struct root_domain *old_rd = rq->rd; + + if (cpu_isset(rq->cpu, old_rd->online)) + set_rq_offline(rq); + + cpu_clear(rq->cpu, old_rd->span); + + if (atomic_dec_and_test(&old_rd->refcount)) + reap = old_rd; + } + + atomic_inc(&rd->refcount); + rq->rd = rd; + + cpu_set(rq->cpu, rd->span); + if (cpu_isset(rq->cpu, cpu_online_map)) + set_rq_online(rq); + + spin_unlock_irqrestore(&rq->lock, flags); + + /* Don't try to free the memory while in-atomic() */ + if (unlikely(reap)) + kfree(reap); +} + +static void init_rootdomain(struct root_domain *rd) +{ + memset(rd, 0, sizeof(*rd)); + + cpus_clear(rd->span); + cpus_clear(rd->online); + + cpupri_init(&rd->cpupri); + +} + +static void init_defrootdomain(void) +{ + init_rootdomain(&def_root_domain); + atomic_set(&def_root_domain.refcount, 1); +} + +static struct root_domain *alloc_rootdomain(void) +{ + struct root_domain *rd; + + rd = kmalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return NULL; + + init_rootdomain(rd); + + return rd; +} + /* - * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. */ -static void cpu_attach_domain(struct sched_domain *sd, int cpu) +static void +cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) { struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; @@ -5901,6 +6542,7 @@ static void cpu_attach_domain(struct sch sched_domain_debug(sd, cpu); + rq_attach_root(rq, rd); rcu_assign_pointer(rq->sd, sd); } @@ -6269,6 +6911,7 @@ static void init_sched_groups_power(int static int build_sched_domains(const cpumask_t *cpu_map) { int i; + struct root_domain *rd; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; @@ -6285,6 +6928,12 @@ static int build_sched_domains(const cpu sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; #endif + rd = alloc_rootdomain(); + if (!rd) { + printk(KERN_WARNING "Cannot alloc root domain\n"); + return -ENOMEM; + } + /* * Set up domains for cpus specified by the cpu_map. 
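 * Every runqueue in the map is then hung off the freshly allocated
 * root domain, roughly:
 *
 *	rd = alloc_rootdomain();
 *	for_each_cpu_mask(i, *cpu_map)
 *		cpu_attach_domain(sd, rd, i);	which does rq_attach_root()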
*/ @@ -6501,7 +7150,7 @@ static int build_sched_domains(const cpu #else sd = &per_cpu(phys_domains, i); #endif - cpu_attach_domain(sd, i); + cpu_attach_domain(sd, rd, i); } return 0; @@ -6559,8 +7208,7 @@ static void detach_destroy_domains(const unregister_sched_domain_sysctl(); for_each_cpu_mask(i, *cpu_map) - cpu_attach_domain(NULL, i); - synchronize_sched(); + cpu_attach_domain(NULL, &def_root_domain, i); arch_destroy_sched_domains(cpu_map); } @@ -6792,6 +7440,10 @@ void __init sched_init(void) int highest_cpu = 0; int i, j; +#ifdef CONFIG_SMP + init_defrootdomain(); +#endif + for_each_possible_cpu(i) { struct rt_prio_array *array; struct rq *rq; @@ -6831,12 +7483,17 @@ void __init sched_init(void) rq->cpu_load[j] = 0; #ifdef CONFIG_SMP rq->sd = NULL; + rq->rd = NULL; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; + rq->online = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); + rq->rt.highest_prio = MAX_RT_PRIO; + rq->rt.overloaded = 0; + rq_attach_root(rq, &def_root_domain); #endif atomic_set(&rq->nr_iowait, 0); @@ -6871,6 +7528,9 @@ void __init sched_init(void) atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current); +#ifdef CONFIG_PREEMPT_RT + printk("Real-Time Preemption Support (C) 2004-2007 Ingo Molnar\n"); +#endif /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, @@ -6884,7 +7544,7 @@ void __init sched_init(void) current->sched_class = &fair_sched_class; } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) void __might_sleep(char *file, int line) { #ifdef in_atomic @@ -6892,13 +7552,16 @@ void __might_sleep(char *file, int line) if ((in_atomic() || irqs_disabled()) && system_state == SYSTEM_RUNNING && !oops_in_progress) { + if (debug_direct_keyboard && hardirq_count()) + return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; printk(KERN_ERR "BUG: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); + " context %s(%d) at %s:%d\n", + current->comm, current->pid, file, line); + printk("in_atomic():%d [%08x], irqs_disabled():%d\n", + in_atomic(), preempt_count(), irqs_disabled()); debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); Index: linux-2.6.24.7-rt27/kernel/sched_rt.c =================================================================== --- linux-2.6.24.7-rt27.orig/kernel/sched_rt.c 2009-02-08 00:00:32.000000000 -0500 +++ linux-2.6.24.7-rt27/kernel/sched_rt.c 2009-02-08 00:05:00.000000000 -0500 @@ -3,6 +3,57 @@ * policies) */ +#ifdef CONFIG_SMP + +static inline int rt_overloaded(struct rq *rq) +{ + return atomic_read(&rq->rd->rto_count); +} + +static inline void rt_set_overload(struct rq *rq) +{ + if (!rq->online) + return; + + cpu_set(rq->cpu, rq->rd->rto_mask); + /* + * Make sure the mask is visible before we set + * the overload count. That is checked to determine + * if we should look at the mask. It would be a shame + * if we looked at the mask, but the mask was not + * updated yet. 
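+	 *
+	 * A matching read side (a sketch only, not verbatim from the
+	 * pull path below) checks the count before scanning the mask:
+	 *
+	 *	if (atomic_read(&rq->rd->rto_count)) {
+	 *		smp_rmb();
+	 *		for_each_cpu_mask(cpu, rq->rd->rto_mask)
+	 *			...;
+	 *	}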
+ */ + wmb(); + atomic_inc(&rq->rd->rto_count); +} + +static inline void rt_clear_overload(struct rq *rq) +{ + if (!rq->online) + return; + + /* the order here really doesn't matter */ + atomic_dec(&rq->rd->rto_count); + cpu_clear(rq->cpu, rq->rd->rto_mask); +} + +static void update_rt_migration(struct rq *rq) +{ + if (unlikely(num_online_cpus() == 1)) + return; + + if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) { + if (!rq->rt.overloaded) { + rt_set_overload(rq); + rq->rt.overloaded = 1; + } + } else if (rq->rt.overloaded) { + rt_clear_overload(rq); + rq->rt.overloaded = 0; + } +} +#endif /* CONFIG_SMP */ + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -26,26 +77,143 @@ static void update_curr_rt(struct rq *rq cpuacct_charge(curr, delta_exec); } -static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) +static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq) +{ + WARN_ON(!rt_task(p)); + rq->rt.rt_nr_running++; +#ifdef CONFIG_SMP + if (p->prio < rq->rt.highest_prio) { + rq->rt.highest_prio = p->prio; + + if (rq->online) + cpupri_set(&rq->rd->cpupri, rq->cpu, + p->prio); + } + if (p->nr_cpus_allowed > 1) + rq->rt.rt_nr_migratory++; + + update_rt_migration(rq); +#endif /* CONFIG_SMP */ +} + +static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq) +{ +#ifdef CONFIG_SMP + int highest_prio = rq->rt.highest_prio; +#endif + WARN_ON(!rt_task(p)); + WARN_ON(!rq->rt.rt_nr_running); + rq->rt.rt_nr_running--; +#ifdef CONFIG_SMP + if (rq->rt.rt_nr_running) { + struct rt_prio_array *array; + + WARN_ON(p->prio < rq->rt.highest_prio); + if (p->prio == rq->rt.highest_prio) { + /* recalculate */ + array = &rq->rt.active; + rq->rt.highest_prio = + sched_find_first_bit(array->bitmap); + } /* otherwise leave rq->rt.highest_prio alone */ + } else + rq->rt.highest_prio = MAX_RT_PRIO; + if (p->nr_cpus_allowed > 1) { + BUG_ON(!rq->rt.rt_nr_migratory); + rq->rt.rt_nr_migratory--; + } + + if (rq->rt.highest_prio != highest_prio) { + if (rq->online) + cpupri_set(&rq->rd->cpupri, rq->cpu, + rq->rt.highest_prio); + } + + update_rt_migration(rq); +#endif /* CONFIG_SMP */ +} + +static inline void incr_rt_nr_uninterruptible(struct task_struct *p, + struct rq *rq) +{ + rq->rt.rt_nr_uninterruptible++; +} + +static inline void decr_rt_nr_uninterruptible(struct task_struct *p, + struct rq *rq) +{ + rq->rt.rt_nr_uninterruptible--; +} + +unsigned long rt_nr_running(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->rt.rt_nr_running; + + return sum; +} + +unsigned long rt_nr_running_cpu(int cpu) +{ + return cpu_rq(cpu)->rt.rt_nr_running; +} + +unsigned long rt_nr_uninterruptible(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->rt.rt_nr_uninterruptible; + + /* + * Since we read the counters lockless, it might be slightly + * inaccurate.
Do not allow it to go below zero though: + */ + if (unlikely((long)sum < 0)) + sum = 0; + + return sum; +} + +unsigned long rt_nr_uninterruptible_cpu(int cpu) +{ + return cpu_rq(cpu)->rt.rt_nr_uninterruptible; +} + +static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct rt_prio_array *array = &rq->rt.active; - list_add_tail(&p->run_list, array->queue + p->prio); + + if (unlikely(flags & ENQUEUE_HEAD)) + list_add(&p->run_list, array->queue + p->prio); + else + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + inc_rt_tasks(p, rq); + + if (p->state == TASK_UNINTERRUPTIBLE) + decr_rt_nr_uninterruptible(p, rq); } /* * Adding/removing a task to/from a priority array: */ -static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) +static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct rt_prio_array *array = &rq->rt.active; update_curr_rt(rq); + if (p->state == TASK_UNINTERRUPTIBLE) + incr_rt_nr_uninterruptible(p, rq); + list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); + dec_rt_tasks(p, rq); } /* @@ -65,6 +233,45 @@ yield_task_rt(struct rq *rq) requeue_task_rt(rq, rq->curr); } +#ifdef CONFIG_SMP +static int find_lowest_rq(struct task_struct *task); + +static int select_task_rq_rt(struct task_struct *p, int sync) +{ + struct rq *rq = task_rq(p); + + /* + * If the current task is an RT task, then + * try to see if we can wake this RT task up on another + * runqueue. Otherwise simply start this RT task + * on its current runqueue. + * + * We want to avoid overloading runqueues. Even if + * the RT task is of higher priority than the current RT task. + * RT tasks behave differently than other tasks. If + * one gets preempted, we try to push it off to another queue. + * So trying to keep a preempting RT task on the same + * cache hot CPU will force the running RT task to + * a cold CPU. So we waste all the cache for the lower + * RT task in hopes of saving some of a RT task + * that is just being woken and probably will have + * cold cache anyway. + */ + if (unlikely(rt_task(rq->curr)) && + (p->nr_cpus_allowed > 1)) { + int cpu = find_lowest_rq(p); + + return (cpu == -1) ? task_cpu(p) : cpu; + } + + /* + * Otherwise, just let it ride on the affined RQ and the + * post-schedule router will push the preempted task away + */ + return task_cpu(p); +} +#endif /* CONFIG_SMP */ + /* * Preempt the current task with a newly woken task if needed: */ @@ -100,111 +307,583 @@ static void put_prev_task_rt(struct rq * } #ifdef CONFIG_SMP -/* - * Load-balancing iterator. Note: while the runqueue stays locked - * during the whole iteration, the current task might be - * dequeued so the iterator has to be dequeue-safe. 
Here we - * achieve that by always pre-iterating before returning - * the current task: - */ -static struct task_struct *load_balance_start_rt(void *arg) +/* Only try algorithms three times */ +#define RT_MAX_TRIES 3 + +static int double_lock_balance(struct rq *this_rq, struct rq *busiest); +static void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + +static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) +{ + if (!task_running(rq, p) && + (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && + (p->nr_cpus_allowed > 1)) + return 1; + return 0; +} + +/* Return the second highest RT task, NULL otherwise */ +static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) { - struct rq *rq = arg; struct rt_prio_array *array = &rq->rt.active; - struct list_head *head, *curr; - struct task_struct *p; + struct task_struct *next; + struct list_head *queue; int idx; + if (likely(rq->rt.rt_nr_running < 2)) + return NULL; + idx = sched_find_first_bit(array->bitmap); - if (idx >= MAX_RT_PRIO) + if (unlikely(idx >= MAX_RT_PRIO)) { + WARN_ON(1); /* rt_nr_running is bad */ return NULL; + } - head = array->queue + idx; - curr = head->prev; + queue = array->queue + idx; + BUG_ON(list_empty(queue)); - p = list_entry(curr, struct task_struct, run_list); + next = list_entry(queue->next, struct task_struct, run_list); + if (unlikely(pick_rt_task(rq, next, cpu))) + goto out; - curr = curr->prev; + if (queue->next->next != queue) { + /* same prio task */ + next = list_entry(queue->next->next, struct task_struct, + run_list); + if (pick_rt_task(rq, next, cpu)) + goto out; + } + + retry: + /* slower, but more flexible */ + idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); + if (unlikely(idx >= MAX_RT_PRIO)) + return NULL; + + queue = array->queue + idx; + BUG_ON(list_empty(queue)); + + list_for_each_entry(next, queue, run_list) { + if (pick_rt_task(rq, next, cpu)) + goto out; + } - rq->rt.rt_load_balance_idx = idx; - rq->rt.rt_load_balance_head = head; - rq->rt.rt_load_balance_curr = curr; + goto retry; - return p; + out: + return next; } -static struct task_struct *load_balance_next_rt(void *arg) +static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); + +static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) { - struct rq *rq = arg; - struct rt_prio_array *array = &rq->rt.active; - struct list_head *head, *curr; - struct task_struct *p; - int idx; + int count; + + count = cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask); + + /* + * cpupri cannot efficiently tell us how many bits are set, so it only + * returns a boolean. However, the caller of this function will + * special case the value "1", so we want to return a positive integer + * other than one if there are bits to look at + */ + return count ? 
2 : 0; +} + +static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) +{ + int first; + + /* "this_cpu" is cheaper to preempt than a remote processor */ + if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) + return this_cpu; + + first = first_cpu(*mask); + if (first != NR_CPUS) + return first; + + return -1; +} + +static int find_lowest_rq(struct task_struct *task) +{ + struct sched_domain *sd; + cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); + int this_cpu = smp_processor_id(); + int cpu = task_cpu(task); + int count; + + if (task->nr_cpus_allowed == 1) + return -1; /* No other targets possible */ + + count = find_lowest_cpus(task, lowest_mask); + if (!count) + return -1; /* No targets found */ + + /* + * There is no sense in performing an optimal search if only one + * target is found. + */ + if (count == 1) + return first_cpu(*lowest_mask); + + /* + * At this point we have built a mask of cpus representing the + * lowest priority tasks in the system. Now we want to elect + * the best one based on our affinity and topology. + * + * We prioritize the last cpu that the task executed on since + * it is most likely cache-hot in that location. + */ + if (cpu_isset(cpu, *lowest_mask)) + return cpu; + + /* + * Otherwise, we consult the sched_domains span maps to figure + * out which cpu is logically closest to our hot cache data. + */ + if (this_cpu == cpu) + this_cpu = -1; /* Skip this_cpu opt if the same */ - idx = rq->rt.rt_load_balance_idx; - head = rq->rt.rt_load_balance_head; - curr = rq->rt.rt_load_balance_curr; + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + cpumask_t domain_mask; + int best_cpu; + + cpus_and(domain_mask, sd->span, *lowest_mask); + + best_cpu = pick_optimal_cpu(this_cpu, + &domain_mask); + if (best_cpu != -1) + return best_cpu; + } + } /* - * If we arrived back to the head again then - * iterate to the next queue (if any): + * And finally, if there were no matches within the domains + * just give the caller *something* to work with from the compatible + * locations. */ - if (unlikely(head == curr)) { - int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); + return pick_optimal_cpu(this_cpu, lowest_mask); +} + +/* Will lock the rq it finds */ +static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) +{ + struct rq *lowest_rq = NULL; + int tries; + int cpu; + + for (tries = 0; tries < RT_MAX_TRIES; tries++) { + cpu = find_lowest_rq(task); + + if ((cpu == -1) || (cpu == rq->cpu)) + break; + + lowest_rq = cpu_rq(cpu); + + /* if the prio of this runqueue changed, try again */ + if (double_lock_balance(rq, lowest_rq)) { + /* + * We had to unlock the run queue. In + * the mean time, task could have + * migrated already or had its affinity changed. + * Also make sure that it wasn't scheduled on its rq. + */ + if (unlikely(task_rq(task) != rq || + !cpu_isset(lowest_rq->cpu, + task->cpus_allowed) || + task_running(rq, task) || + !task->se.on_rq)) { + + spin_unlock(&lowest_rq->lock); + lowest_rq = NULL; + break; + } + } + + /* If this rq is still suitable use it. */ + if (lowest_rq->rt.highest_prio > task->prio) + break; + + /* try again */ + spin_unlock(&lowest_rq->lock); + lowest_rq = NULL; + } + + return lowest_rq; +} - if (next_idx >= MAX_RT_PRIO) - return NULL; +/* + * If the current CPU has more than one RT task, see if the non + * running task can migrate over to a CPU that is running a task + * of lesser priority. 
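+ *
+ * In outline the code below does:
+ *
+ *	next_task = pick_next_highest_task_rt(rq, -1);
+ *	lowest_rq = find_lock_lowest_rq(next_task, rq);
+ *	deactivate_task(rq, next_task, 0);
+ *	set_task_cpu(next_task, lowest_rq->cpu);
+ *	activate_task(lowest_rq, next_task, 0);
+ *	resched_task(lowest_rq->curr);
+ *
+ * plus the retry and refcounting needed to survive dropping rq->lock.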
+ */ +static int push_rt_task(struct rq *rq) +{ + struct task_struct *next_task; + struct rq *lowest_rq; + int ret = 0; + int paranoid = RT_MAX_TRIES; + + if (!rq->rt.overloaded) + return 0; + + next_task = pick_next_highest_task_rt(rq, -1); + if (!next_task) + return 0; + + retry: + if (unlikely(next_task == rq->curr)) { + WARN_ON(1); + return 0; + } + + /* + * It's possible that the next_task slipped in of + * higher priority than current. If that's the case + * just reschedule current. + */ + if (unlikely(next_task->prio < rq->curr->prio)) { + resched_task(rq->curr); + return 0; + } - /* We might release rq lock */ + get_task_struct(next_task); - /* find_lock_lowest_rq locks the rq if found */ + lowest_rq = find_lock_lowest_rq(next_task, rq); + if (!lowest_rq) { + struct task_struct *task; + /* + * find_lock_lowest_rq releases rq->lock + * so it is possible that next_task has changed. + * If it has, then try again. + */ + task = pick_next_highest_task_rt(rq, -1); + if (unlikely(task != next_task) && task && paranoid--) { + put_task_struct(next_task); + next_task = task; + goto retry; + } + goto out; } + deactivate_task(rq, next_task, 0); + set_task_cpu(next_task, lowest_rq->cpu); + activate_task(lowest_rq, next_task, 0); + + resched_task(lowest_rq->curr); + + schedstat_inc(rq, rto_pushed); + + spin_unlock(&lowest_rq->lock); + + ret = 1; +out: + put_task_struct(next_task); - curr = curr->prev; + return ret; +} - rq->rt.rt_load_balance_curr = curr; +/* + * TODO: Currently we just use the second highest prio task on + * the queue, and stop when it can't migrate (or there are + * no more RT tasks). There may be a case where a lower + * priority RT task has a different affinity than the + * higher RT task. In this case the lower RT task could + * possibly be able to migrate whereas the higher priority + * RT task could not. We currently ignore this issue. + * Enhancements are welcome! + */ +static void push_rt_tasks(struct rq *rq) +{ + /* push_rt_task will return true if it moved an RT */ + while (push_rt_task(rq)) + ; +} + +static int pull_rt_task(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu, ret = 0, cpu; + struct task_struct *p, *next; + struct rq *src_rq; + + if (likely(!rt_overloaded(this_rq))) + return 0; + + next = pick_next_task_rt(this_rq); + + for_each_cpu_mask(cpu, this_rq->rd->rto_mask) { + if (this_cpu == cpu) + continue; + + src_rq = cpu_rq(cpu); + /* + * We can potentially drop this_rq's lock in + * double_lock_balance, and another CPU could + * steal our next task - hence we must cause + * the caller to recalculate the next task + * in that case: + */ + if (double_lock_balance(this_rq, src_rq)) { + struct task_struct *old_next = next; + + next = pick_next_task_rt(this_rq); + if (next != old_next) + ret = 1; + } + + /* + * Are there still pullable RT tasks? + */ + if (src_rq->rt.rt_nr_running <= 1) { + spin_unlock(&src_rq->lock); + continue; + } + + p = pick_next_highest_task_rt(src_rq, this_cpu); + + /* + * Do we have an RT task that preempts + * the to-be-scheduled task? + */ + if (p && (!next || (p->prio < next->prio))) { + WARN_ON(p == src_rq->curr); + WARN_ON(!p->se.on_rq); + + /* + * There's a chance that p is higher in priority + * than what's currently running on its cpu. + * This is just that p is waking up and hasn't + * had a chance to schedule.
We only pull + * p if it is lower in priority than the + * current task on the run queue, or if + * this_rq's next task is lower in prio than + * the current task on that rq. + */ + if (p->prio < src_rq->curr->prio || + (next && next->prio < src_rq->curr->prio)) + goto out; + + ret = 1; + + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + /* + * We continue with the search, just in + * case there's an even higher prio task + * in another runqueue. (low likelihood + * but possible) + * + * Update next so that we won't pick a task + * on another cpu with a priority lower (or equal) + * than the one we just picked. + */ + next = p; + + schedstat_inc(src_rq, rto_pulled); + } + out: + spin_unlock(&src_rq->lock); + } + + return ret; +} -static struct task_struct *load_balance_next_rt(void *arg) +static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) +{ + /* Try to pull RT tasks here if we lower this rq's prio */ + if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) { + pull_rt_task(rq); + schedstat_inc(rq, rto_schedule); + } +} + +static void post_schedule_rt(struct rq *rq) +{ + /* + * If we have more than one rt_task queued, then + * see if we can push the other rt_tasks off to other CPUs. + * Note we may release the rq lock, and since + * the lock was owned by prev, we need to release it + * first via finish_lock_switch and then reacquire it here. + */ + if (unlikely(rq->rt.overloaded)) { + spin_lock(&rq->lock); + push_rt_tasks(rq); + schedstat_inc(rq, rto_schedule_tail); + spin_unlock(&rq->lock); + } +} + + +static void task_wake_up_rt(struct rq *rq, struct task_struct *p) +{ + if (!task_running(rq, p) && + !test_tsk_need_resched(rq->curr) && + rq->rt.overloaded) { + push_rt_tasks(rq); + schedstat_inc(rq, rto_wakeup); + } } static unsigned long load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) + int *lb_flags, int *this_best_prio) { - struct rq_iterator rt_rq_iterator; - - rt_rq_iterator.start = load_balance_start_rt; - rt_rq_iterator.next = load_balance_next_rt; - /* pass 'busiest' rq argument into - * load_balance_[start|next]_rt iterators - */ - rt_rq_iterator.arg = busiest; - - return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd, - idle, all_pinned, this_best_prio, &rt_rq_iterator); + /* don't touch RT tasks */ + return 0; } static int move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle) { - struct rq_iterator rt_rq_iterator; + /* don't touch RT tasks */ + return 0; +} +static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) +{ + int weight = cpus_weight(*new_mask); + + BUG_ON(!rt_task(p)); + + /* + * Update the migration status of the RQ if we have an RT task + * which is running AND changing its weight value.
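+	 * Only the single <-> multiple CPU transitions matter for the
+	 * migratory count:
+	 *
+	 *	nr_cpus_allowed <= 1 and weight > 1:	rt_nr_migratory++
+	 *	nr_cpus_allowed >  1 and weight <= 1:	rt_nr_migratory--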
+ */ + if (p->se.on_rq && (weight != p->nr_cpus_allowed)) { + struct rq *rq = task_rq(p); + + if ((p->nr_cpus_allowed <= 1) && (weight > 1)) + rq->rt.rt_nr_migratory++; + else if ((p->nr_cpus_allowed > 1) && (weight <= 1)) { + BUG_ON(!rq->rt.rt_nr_migratory); + rq->rt.rt_nr_migratory--; + } + + update_rt_migration(rq); + } + + p->cpus_allowed = *new_mask; + p->nr_cpus_allowed = weight; +} +/* Assumes rq->lock is held */ +static void rq_online_rt(struct rq *rq) +{ + if (rq->rt.overloaded) + rt_set_overload(rq); - rt_rq_iterator.start = load_balance_start_rt; - rt_rq_iterator.next = load_balance_next_rt; - rt_rq_iterator.arg = busiest; + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); +} + +/* Assumes rq->lock is held */ +static void rq_offline_rt(struct rq *rq) +{ + if (rq->rt.overloaded) + rt_clear_overload(rq); - return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, - &rt_rq_iterator); + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); } -#endif + +/* + * When switching from the rt queue, we bring ourselves to a position + * that we might want to pull RT tasks from other runqueues. + */ +static void switched_from_rt(struct rq *rq, struct task_struct *p, + int running) +{ + /* + * If there are other RT tasks then we will reschedule + * and the scheduling of the other RT tasks will handle + * the balancing. But if we are the last RT task + * we may need to handle the pulling of RT tasks + * now. + */ + if (!rq->rt.rt_nr_running) + pull_rt_task(rq); +} + +static int is_runnable_rt(struct rq *rq) +{ + return !!rq->rt.rt_nr_running; +} +#endif /* CONFIG_SMP */ + +/* + * When switching a task to RT, we may overload the runqueue + * with RT tasks. In this case we try to push them off to + * other runqueues. + */ +static void switched_to_rt(struct rq *rq, struct task_struct *p, + int running) +{ + int check_resched = 1; + + /* + * If we are already running, then there's nothing + * that needs to be done. But if we are not running + * we may need to preempt the current running task. + * If that current running task is also an RT task + * then see if we can move to another run queue. + */ + if (!running) { +#ifdef CONFIG_SMP + if (rq->rt.overloaded && push_rt_task(rq) && + /* Don't resched if we changed runqueues */ + rq != task_rq(p)) + check_resched = 0; +#endif /* CONFIG_SMP */ + if (check_resched && p->prio < rq->curr->prio) + resched_task(rq->curr); + } +} + +/* + * Priority of the task has changed. This may cause + * us to initiate a push or pull. + */ +static void prio_changed_rt(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ + if (running) { +#ifdef CONFIG_SMP + /* + * If our priority decreases while running, we + * may need to pull tasks to this runqueue. + */ + if (oldprio < p->prio) + pull_rt_task(rq); + /* + * If there's a higher priority task waiting to run + * then reschedule. Note, the above pull_rt_task + * can release the rq lock and p could migrate. + * Only reschedule if p is still on the same runqueue. + */ + if (p->prio > rq->rt.highest_prio && task_rq(p) == rq) + resched_task(p); +#else + /* For UP simply resched on drop of prio */ + if (oldprio < p->prio) + resched_task(p); +#endif /* CONFIG_SMP */ + } else { + /* + * This task is not running, but if its priority is + * higher than the current running task's + * then reschedule.
+ */ + if (p->prio < rq->curr->prio) + resched_task(rq->curr); + } +} + static void task_tick_rt(struct rq *rq, struct task_struct *p) { @@ -244,6 +923,9 @@ const struct sched_class rt_sched_class .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_rt, +#endif /* CONFIG_SMP */ .check_preempt_curr = check_preempt_curr_rt, @@ -253,8 +935,19 @@ const struct sched_class rt_sched_class #ifdef CONFIG_SMP .load_balance = load_balance_rt, .move_one_task = move_one_task_rt, + .set_cpus_allowed = set_cpus_allowed_rt, + .rq_online = rq_online_rt, + .rq_offline = rq_offline_rt, + .pre_schedule = pre_schedule_rt, + .post_schedule = post_schedule_rt, + .task_wake_up = task_wake_up_rt, + .switched_from = switched_from_rt, + .is_runnable = is_runnable_rt, #endif .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, + + .prio_changed = prio_changed_rt, + .switched_to = switched_to_rt, }; Index: linux-2.6.24.7-rt27/include/linux/init_task.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/linux/init_task.h 2009-02-08 00:00:31.000000000 -0500 +++ linux-2.6.24.7-rt27/include/linux/init_task.h 2009-02-08 00:04:25.000000000 -0500 @@ -10,6 +10,7 @@ #include #include #include +#include #define INIT_FDTABLE \ { \ @@ -87,6 +88,24 @@ extern struct nsproxy init_nsproxy; .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(sighand.signalfd_wqh), \ } +#ifdef CONFIG_PREEMPT_RCU_BOOST +#define INIT_RCU_BOOST_PRIO .rcu_prio = MAX_PRIO, +#define INIT_PREEMPT_RCU_BOOST(tsk) \ + .rcub_rbdp = NULL, \ + .rcub_state = RCU_BOOST_IDLE, \ + .rcub_entry = LIST_HEAD_INIT(tsk.rcub_entry), +#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */ +#define INIT_RCU_BOOST_PRIO +#define INIT_PREEMPT_RCU_BOOST(tsk) +#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */ + +#ifdef CONFIG_PREEMPT_RT +# define INIT_RW_OWNERS(tsk) .owned_read_locks = { \ + [0 ... 
(MAX_RWLOCK_DEPTH - 1) ] = { .task = &tsk } }, +#else +# define INIT_RW_OWNERS(tsk) +#endif + extern struct group_info init_groups; #define INIT_STRUCT_PID { \ @@ -129,7 +148,9 @@ extern struct group_info init_groups; .static_prio = MAX_PRIO-20, \ .normal_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ + INIT_RCU_BOOST_PRIO \ .cpus_allowed = CPU_MASK_ALL, \ + .nr_cpus_allowed = NR_CPUS, \ .mm = NULL, \ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ @@ -164,7 +185,8 @@ extern struct group_info init_groups; .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ - .pi_lock = __SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ + .posix_timer_list = NULL, \ + .pi_lock = RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .pids = { \ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ @@ -173,6 +195,8 @@ extern struct group_info init_groups; .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ + INIT_PREEMPT_RCU_BOOST(tsk) \ + INIT_RW_OWNERS(tsk) \ } Index: linux-2.6.24.7-rt27/include/linux/sched.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/linux/sched.h 2009-02-08 00:00:31.000000000 -0500 +++ linux-2.6.24.7-rt27/include/linux/sched.h 2009-02-08 00:05:21.000000000 -0500 @@ -91,6 +91,27 @@ struct sched_param { #include +#ifdef CONFIG_PREEMPT +extern int kernel_preemption; +#else +# define kernel_preemption 0 +#endif +#ifdef CONFIG_PREEMPT_VOLUNTARY +extern int voluntary_preemption; +#else +# define voluntary_preemption 0 +#endif +#ifdef CONFIG_PREEMPT_SOFTIRQS +extern int softirq_preemption; +#else +# define softirq_preemption 0 +#endif +#ifdef CONFIG_PREEMPT_HARDIRQS +extern int hardirq_preemption; +#else +# define hardirq_preemption 0 +#endif + struct exec_domain; struct futex_pi_state; struct bio; @@ -157,6 +178,10 @@ print_cfs_rq(struct seq_file *m, int cpu } #endif +#ifdef CONFIG_PREEMPT_BKL +extern struct semaphore kernel_sem; +#endif + /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). @@ -168,21 +193,44 @@ print_cfs_rq(struct seq_file *m, int cpu * mistake. 
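The INIT_RCU_BOOST_PRIO, INIT_PREEMPT_RCU_BOOST and INIT_RW_OWNERS macros above all follow one pattern: when the option is configured in, they contribute designated initializers to the INIT_TASK aggregate, and otherwise they expand to nothing, so the initializer compiles either way. A compilable model of the pattern, with made-up option and field names:

    #include <stdio.h>

    #define CONFIG_FEATURE 1   /* flip to 0 to model the configured-out case */

    #if CONFIG_FEATURE
    # define INIT_FEATURE .feature_prio = 140,
    #else
    # define INIT_FEATURE
    #endif

    struct task_model { int prio; int feature_prio; };

    /* The conditional macro slots extra initializers into the aggregate: */
    #define INIT_TASK_MODEL { .prio = 120, INIT_FEATURE }

    int main(void)
    {
        struct task_model t = INIT_TASK_MODEL;
        printf("prio=%d feature_prio=%d\n", t.prio, t.feature_prio);
        return 0;
    }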
*/ #define TASK_RUNNING 0 -#define TASK_INTERRUPTIBLE 1 -#define TASK_UNINTERRUPTIBLE 2 -#define TASK_STOPPED 4 -#define TASK_TRACED 8 +#define TASK_RUNNING_MUTEX 1 +#define TASK_INTERRUPTIBLE 2 +#define TASK_UNINTERRUPTIBLE 4 +#define TASK_STOPPED 8 +#define TASK_TRACED 16 /* in tsk->exit_state */ -#define EXIT_ZOMBIE 16 -#define EXIT_DEAD 32 +#define EXIT_ZOMBIE 32 +#define EXIT_DEAD 64 /* in tsk->state again */ -#define TASK_DEAD 64 +#define TASK_DEAD 128 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) #define set_task_state(tsk, state_value) \ set_mb((tsk)->state, (state_value)) +// #define PREEMPT_DIRECT + +#ifdef CONFIG_X86_LOCAL_APIC +extern void nmi_show_all_regs(void); +#else +# define nmi_show_all_regs() do { } while (0) +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct exec_domain; + /* * set_current_state() includes a barrier so that the write of current->state * is correctly serialised wrt the caller's subsequent test of whether to @@ -220,6 +268,8 @@ extern void sched_init_smp(void); extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); +extern int runqueue_is_locked(void); + extern cpumask_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); @@ -241,6 +291,7 @@ static inline void show_state(void) } extern void show_regs(struct pt_regs *); +extern int irq_show_regs_callback(int cpu, struct pt_regs *regs); /* * TASK is a pointer to the task whose backtrace we want to see (or NULL for current @@ -258,6 +309,12 @@ extern void account_process_tick(struct extern void update_process_times(int user); extern void scheduler_tick(void); +#ifdef CONFIG_GENERIC_HARDIRQS +extern int debug_direct_keyboard; +#else +# define debug_direct_keyboard 0 +#endif + #ifdef CONFIG_DETECT_SOFTLOCKUP extern void softlockup_tick(void); extern void spawn_softlockup_task(void); @@ -294,6 +351,11 @@ extern signed long FASTCALL(schedule_tim extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); +/* + * This one can be called with interrupts disabled, only + * to be used by lowlevel arch code! + */ +asmlinkage void __sched __schedule(void); struct nsproxy; struct user_namespace; @@ -522,6 +584,19 @@ struct signal_struct { #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ +#ifdef CONFIG_PREEMPT_RCU_BOOST +#define set_rcu_prio(p, prio) /* cpp to avoid #include hell */ \ + do { \ + (p)->rcu_prio = (prio); \ + } while (0) +#define get_rcu_prio(p) (p)->rcu_prio /* cpp to avoid #include hell */ +#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */ +static inline void set_rcu_prio(struct task_struct *p, int prio) +{ +} +#define get_rcu_prio(p) (MAX_PRIO) /* cpp to use MAX_PRIO before it's defined */ +#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */ + /* * Some day this will be a full-fledged user tracking system.. 
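The renumbered state values above stay powers of two, so each state occupies one bit, and the per-state character string TASK_STATE_TO_CHAR_STR ("RMSDTtZX", added later in this same patch) is indexed by bit position, with the new TASK_RUNNING_MUTEX taking the 'M' slot. A small sketch using exactly the values from this hunk:

    #include <stdio.h>

    #define TASK_RUNNING         0
    #define TASK_RUNNING_MUTEX   1
    #define TASK_INTERRUPTIBLE   2
    #define TASK_UNINTERRUPTIBLE 4
    #define TASK_STOPPED         8
    #define TASK_TRACED          16
    #define EXIT_ZOMBIE          32
    #define EXIT_DEAD            64

    static char state_char(unsigned long state)
    {
        int i = 0;
        while (state) {        /* i becomes the index of the set bit, plus one */
            state >>= 1;
            i++;
        }
        return "RMSDTtZX"[i];  /* TASK_STATE_TO_CHAR_STR from this patch */
    }

    int main(void)
    {
        printf("%c %c %c\n", state_char(TASK_RUNNING),       /* R */
               state_char(TASK_INTERRUPTIBLE),               /* S */
               state_char(EXIT_DEAD));                       /* X */
        return 0;
    }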
*/ @@ -821,12 +896,18 @@ struct uts_namespace; struct rq; struct sched_domain; +#define ENQUEUE_WAKEUP 0x01 +#define ENQUEUE_HEAD 0x02 + +#define DEQUEUE_SLEEP 0x01 + struct sched_class { const struct sched_class *next; - void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); - void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); + int (*select_task_rq)(struct task_struct *p, int sync); void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); @@ -842,11 +923,27 @@ struct sched_class { int (*move_one_task) (struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle); + void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); + void (*post_schedule) (struct rq *this_rq); + void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); + + int (*is_runnable) (struct rq *this_rq); #endif void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p); void (*task_new) (struct rq *rq, struct task_struct *p); + void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask); + + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); + + void (*switched_from) (struct rq *this_rq, struct task_struct *task, + int running); + void (*switched_to) (struct rq *this_rq, struct task_struct *task, + int running); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + int oldprio, int running); }; struct load_weight { @@ -914,6 +1011,16 @@ struct sched_entity { #endif }; +#ifdef CONFIG_PREEMPT_RT +struct rw_mutex; +struct reader_lock_struct { + struct rw_mutex *lock; + struct list_head list; + struct task_struct *task; + int count; +}; + +#endif struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ void *stack; @@ -924,12 +1031,13 @@ struct task_struct { int lock_depth; /* BKL lock depth */ #ifdef CONFIG_SMP -#ifdef __ARCH_WANT_UNLOCKED_CTXSW int oncpu; #endif -#endif int prio, static_prio, normal_prio; +#ifdef CONFIG_PREEMPT_RCU_BOOST + int rcu_prio; +#endif struct list_head run_list; const struct sched_class *sched_class; struct sched_entity se; @@ -956,11 +1064,23 @@ struct task_struct { unsigned int policy; cpumask_t cpus_allowed; + int nr_cpus_allowed; unsigned int time_slice; +#ifdef CONFIG_PREEMPT_RCU + int rcu_read_lock_nesting; + int rcu_flipctr_idx; +#endif /* #ifdef CONFIG_PREEMPT_RCU */ + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; #endif +#ifdef CONFIG_PREEMPT_RCU_BOOST + struct rcu_boost_dat *rcub_rbdp; + enum rcu_boost_state rcub_state; + struct list_head rcub_entry; + unsigned long rcu_preempt_counter; +#endif struct list_head tasks; /* @@ -1024,6 +1144,8 @@ struct task_struct { unsigned long long it_sched_expires; struct list_head cpu_timers[3]; + struct task_struct* posix_timer_list; + /* process credentials */ uid_t uid,euid,suid,fsuid; gid_t gid,egid,sgid,fsgid; @@ -1080,7 +1202,7 @@ struct task_struct { spinlock_t alloc_lock; /* Protection of the PI data structures: */ - spinlock_t pi_lock; + raw_spinlock_t pi_lock; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task */ @@ -1093,6 +1215,7 @@ struct task_struct { /* mutex deadlock detection */ struct mutex_waiter *blocked_on; #endif + int pagefault_disabled; #ifdef 
CONFIG_TRACE_IRQFLAGS unsigned int irq_events; int hardirqs_enabled; @@ -1116,6 +1239,38 @@ struct task_struct { unsigned int lockdep_recursion; #endif +#define MAX_PREEMPT_TRACE 25 +#define MAX_RWLOCK_DEPTH 5 + +#ifdef CONFIG_PREEMPT_RT + int reader_lock_count; + struct reader_lock_struct owned_read_locks[MAX_RWLOCK_DEPTH]; +#endif + +#ifdef CONFIG_PREEMPT_TRACE + unsigned long preempt_trace_eip[MAX_PREEMPT_TRACE]; + unsigned long preempt_trace_parent_eip[MAX_PREEMPT_TRACE]; +#endif + +#define MAX_LOCK_STACK MAX_PREEMPT_TRACE +#ifdef CONFIG_DEBUG_PREEMPT + atomic_t lock_count; +# ifdef CONFIG_PREEMPT_RT + struct rt_mutex *owned_lock[MAX_LOCK_STACK]; +# endif +#endif +#ifdef CONFIG_DETECT_SOFTLOCKUP + unsigned long softlockup_count; /* Count to keep track how long the + * thread is in the kernel without + * sleeping. + */ +#endif + /* realtime bits */ + +#ifdef CONFIG_DEBUG_RT_MUTEXES + void *last_kernel_lock; +#endif + /* journalling filesystem info */ void *journal_info; @@ -1178,26 +1333,22 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; +#ifdef CONFIG_PREEMPT_RT + /* + * Temporary hack, until we find a solution to + * handle printk in atomic operations. + */ + int in_printk; +#endif }; -/* - * Priority of a process goes from 0..MAX_PRIO-1, valid RT - * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH - * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority - * values are inverted: lower p->prio value means higher priority. - * - * The MAX_USER_RT_PRIO value allows the actual maximum - * RT priority to be separate from the value exported to - * user-space. This allows kernel threads to set their - * priority to a value higher than any user task. Note: - * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. 
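The pagefault_disabled field added above is a plain per-task nesting counter rather than a boolean: fault handlers consult it to decide whether sleeping fault handling is permitted. A sketch of the intended discipline, with stub types; the real enable/disable helpers live elsewhere in the -rt series:

    #include <assert.h>

    struct task_model { int pagefault_disabled; };

    static void pagefault_disable(struct task_model *t)
    {
        t->pagefault_disabled++;
    }

    static void pagefault_enable(struct task_model *t)
    {
        assert(t->pagefault_disabled > 0);
        t->pagefault_disabled--;
    }

    static int faults_allowed(const struct task_model *t)
    {
        return t->pagefault_disabled == 0;  /* only at nesting depth zero */
    }

    int main(void)
    {
        struct task_model t = { 0 };
        pagefault_disable(&t);
        pagefault_disable(&t);          /* nesting is fine */
        assert(!faults_allowed(&t));
        pagefault_enable(&t);
        pagefault_enable(&t);
        assert(faults_allowed(&t));
        return 0;
    }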
- */ - -#define MAX_USER_RT_PRIO 100 -#define MAX_RT_PRIO MAX_USER_RT_PRIO +#ifdef CONFIG_PREEMPT_RT +# define set_printk_might_sleep(x) do { current->in_printk = x; } while(0) +#else +# define set_printk_might_sleep(x) do { } while(0) +#endif -#define MAX_PRIO (MAX_RT_PRIO + 40) -#define DEFAULT_PRIO (MAX_RT_PRIO + 20) +#include static inline int rt_prio(int prio) { @@ -1345,6 +1496,15 @@ extern struct pid *cad_pid; extern void free_task(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) +#ifdef CONFIG_PREEMPT_RT +extern void __put_task_struct_cb(struct rcu_head *rhp); + +static inline void put_task_struct(struct task_struct *t) +{ + if (atomic_dec_and_test(&t->usage)) + call_rcu(&t->rcu, __put_task_struct_cb); +} +#else extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) @@ -1352,6 +1512,7 @@ static inline void put_task_struct(struc if (atomic_dec_and_test(&t->usage)) __put_task_struct(t); } +#endif /* * Per process flags @@ -1362,6 +1523,7 @@ static inline void put_task_struct(struc #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#define PF_NOSCHED 0x00000020 /* Userspace does not expect scheduling */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ @@ -1369,6 +1531,7 @@ static inline void put_task_struct(struc #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ +#define PF_KMAP 0x00004000 /* this context has a kmap */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ @@ -1380,6 +1543,8 @@ static inline void put_task_struct(struc #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ +#define PF_SOFTIRQ 0x04000000 /* softirq context */ +#define PF_HARDIRQ 0x08000000 /* hardirq context */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ @@ -1472,9 +1637,14 @@ int sched_nr_latency_handler(struct ctl_ extern unsigned int sysctl_sched_compat_yield; +extern void task_setprio(struct task_struct *p, int prio); + #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); -extern void rt_mutex_setprio(struct task_struct *p, int prio); +static inline void rt_mutex_setprio(struct task_struct *p, int prio) +{ + task_setprio(p, prio); +} extern void rt_mutex_adjust_pi(struct task_struct *p); #else static inline int rt_mutex_getprio(struct task_struct *p) @@ -1496,6 +1666,7 @@ extern struct task_struct *curr_task(int extern void set_curr_task(int cpu, struct task_struct *p); void yield(void); +void __yield(void); /* * The default (Linux) execution domain. 
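set_printk_might_sleep() above, like the preemption flags declared earlier in this header, uses the configured-out-to-a-constant idiom: when the option is off, the symbol becomes a compile-time literal, so conditionals on it are dead code the compiler drops. A standalone illustration with a hypothetical option name:

    #include <stdio.h>

    /* When the feature is configured out, the flag is the literal 0 and
     * `if (softirq_preemption_model)` is eliminated at compile time. */
    #ifdef CONFIG_PREEMPT_SOFTIRQS_MODEL
    extern int softirq_preemption_model;
    #else
    # define softirq_preemption_model 0
    #endif

    int main(void)
    {
        if (softirq_preemption_model)
            printf("softirq threads are preemptible\n");
        else
            printf("built without softirq preemption\n");
        return 0;
    }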
@@ -1567,6 +1738,9 @@ extern void do_timer(unsigned long ticks extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state)); extern int FASTCALL(wake_up_process(struct task_struct * tsk)); +extern int FASTCALL(wake_up_process_mutex(struct task_struct * tsk)); +extern int FASTCALL(wake_up_process_sync(struct task_struct * tsk)); +extern int FASTCALL(wake_up_process_mutex_sync(struct task_struct * tsk)); extern void FASTCALL(wake_up_new_task(struct task_struct * tsk, unsigned long clone_flags)); #ifdef CONFIG_SMP @@ -1661,12 +1835,27 @@ extern struct mm_struct * mm_alloc(void) /* mmdrop drops the mm and the page tables */ extern void FASTCALL(__mmdrop(struct mm_struct *)); +extern void FASTCALL(__mmdrop_delayed(struct mm_struct *)); +extern void FASTCALL(__mmdrop_rcu(struct mm_struct *)); + static inline void mmdrop(struct mm_struct * mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } +static inline void mmdrop_delayed(struct mm_struct * mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + __mmdrop_delayed(mm); +} + +static inline void mmdrop_rcu(struct mm_struct * mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + __mmdrop_rcu(mm); +} + /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); /* Grab a reference to a task's mm, if it is not already going away */ @@ -1839,14 +2028,39 @@ static inline void clear_tsk_need_resche clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } +static inline int test_tsk_need_resched(struct task_struct *tsk) +{ + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); +} + static inline int signal_pending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } +static inline int _need_resched(void) +{ + return unlikely(test_tsk_need_resched(current)); +} + static inline int need_resched(void) { - return unlikely(test_thread_flag(TIF_NEED_RESCHED)); + return _need_resched(); +} + +static inline void set_tsk_need_resched_delayed(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_DELAYED); +} + +static inline void clear_tsk_need_resched_delayed(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_DELAYED); +} + +static inline int need_resched_delayed(void) +{ + return unlikely(test_thread_flag(TIF_NEED_RESCHED_DELAYED)); } /* @@ -1857,27 +2071,51 @@ static inline int need_resched(void) * cond_resched_softirq() will enable bhs before scheduling. 
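mmdrop(), mmdrop_delayed() and mmdrop_rcu() above differ only in which destructor runs once the final reference is dropped; the reference-drop shape itself is identical. A userspace model of that shared shape, with a stub mm type and C11 atomics:

    #include <stdatomic.h>
    #include <stdlib.h>

    struct mm_model { atomic_int mm_count; };

    static void destroy_now(struct mm_model *mm) { free(mm); }

    /* Drop one reference; only the last dropper runs the destructor,
     * which is the only thing the three kernel variants vary. */
    static void drop(struct mm_model *mm, void (*destroy)(struct mm_model *))
    {
        if (atomic_fetch_sub(&mm->mm_count, 1) == 1)  /* we held the last ref */
            destroy(mm);
    }

    int main(void)
    {
        struct mm_model *mm = malloc(sizeof(*mm));
        atomic_init(&mm->mm_count, 2);
        drop(mm, destroy_now);   /* one reference still outstanding */
        drop(mm, destroy_now);   /* final drop frees */
        return 0;
    }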
*/ extern int cond_resched(void); -extern int cond_resched_lock(spinlock_t * lock); +extern int __cond_resched_raw_spinlock(raw_spinlock_t *lock); +extern int __cond_resched_spinlock(spinlock_t *spinlock); + +#define cond_resched_lock(lock) \ + PICK_SPIN_OP_RET(__cond_resched_raw_spinlock, __cond_resched_spinlock,\ + lock) + extern int cond_resched_softirq(void); +extern int cond_resched_softirq_context(void); +extern int cond_resched_hardirq_context(void); /* * Does a critical section need to be broken due to another * task waiting?: */ -#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) -# define need_lockbreak(lock) ((lock)->break_lock) +#if (defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)) || defined(CONFIG_PREEMPT_RT) +# define need_lockbreak(lock) ({ int __need = ((lock)->break_lock); if (__need) (lock)->break_lock = 0; __need; }) #else # define need_lockbreak(lock) 0 #endif +#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) +# define need_lockbreak_raw(lock) ({ int __need = ((lock)->break_lock); if (__need) (lock)->break_lock = 0; __need; }) +#else +# define need_lockbreak_raw(lock) 0 +#endif + /* * Does a critical section need to be broken due to another * task waiting or preemption being signalled: */ -static inline int lock_need_resched(spinlock_t *lock) +#define lock_need_resched(lock) \ + unlikely(need_lockbreak(lock) || need_resched()) + +static inline int softirq_need_resched(void) { - if (need_lockbreak(lock) || need_resched()) - return 1; + if (softirq_preemption && (current->flags & PF_SOFTIRQ)) + return need_resched(); + return 0; +} + +static inline int hardirq_need_resched(void) +{ + if (hardirq_preemption && (current->flags & PF_HARDIRQ)) + return need_resched(); return 0; } @@ -1928,6 +2166,18 @@ static inline void arch_pick_mmap_layout } #endif +#ifdef CONFIG_TRACING +extern void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3); +#else +static inline void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ +} +#endif + extern long sched_setaffinity(pid_t pid, cpumask_t new_mask); extern long sched_getaffinity(pid_t pid, cpumask_t *mask); @@ -1985,6 +2235,12 @@ static inline void inc_syscw(struct task } #endif +#ifdef CONFIG_PREEMPT_TRACE +void print_preempt_trace(struct task_struct *tsk); +#else +# define print_preempt_trace(tsk) do { } while (0) +#endif + #ifdef CONFIG_SMP void migration_init(void); #else @@ -1993,6 +2249,15 @@ static inline void migration_init(void) } #endif +#ifdef CONFIG_SMP +static inline int task_is_current(struct task_struct *task) +{ + return task->oncpu; +} +#endif + +#define TASK_STATE_TO_CHAR_STR "RMSDTtZX" + #endif /* __KERNEL__ */ #endif Index: linux-2.6.24.7-rt27/kernel/fork.c =================================================================== --- linux-2.6.24.7-rt27.orig/kernel/fork.c 2009-02-08 00:00:31.000000000 -0500 +++ linux-2.6.24.7-rt27/kernel/fork.c 2009-02-08 00:04:55.000000000 -0500 @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,8 @@ #include #include #include +#include +#include #include #include #include @@ -71,6 +74,15 @@ DEFINE_PER_CPU(unsigned long, process_co __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +/* + * Delayed mmdrop. In the PREEMPT_RT case we + * dont want to do this from the scheduling + * context. 
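The -rt need_lockbreak() above now consumes break_lock when it samples it. A typical caller looks like the following sketch: a long traversal polls the lock-break condition and briefly drops the lock when a contender is signalled (stub lock type; the unlock/relock calls are elided as comments):

    struct plock_model { int break_lock; };

    static int need_lockbreak_model(struct plock_model *l)
    {
        int need = l->break_lock;
        if (need)
            l->break_lock = 0;   /* the -rt change: consume the request */
        return need;
    }

    static void walk_many_items(struct plock_model *l, int nitems)
    {
        for (int i = 0; i < nitems; i++) {
            /* ... process item i under the lock ... */
            if (need_lockbreak_model(l)) {
                /* unlock, let the contender in (cond_resched), relock */
            }
        }
    }

    int main(void)
    {
        struct plock_model l = { 0 };
        walk_many_items(&l, 1000);
        return 0;
    }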
+ */ +static DEFINE_PER_CPU(struct task_struct *, desched_task); + +static DEFINE_PER_CPU(struct list_head, delayed_drop_list); + int nr_processes(void) { int cpu; @@ -115,10 +127,13 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); -void __put_task_struct(struct task_struct *tsk) +#ifdef CONFIG_PREEMPT_RT +void __put_task_struct_cb(struct rcu_head *rhp) { + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + BUG_ON(atomic_read(&tsk->usage)); WARN_ON(!tsk->exit_state); - WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); security_task_free(tsk); @@ -130,8 +145,27 @@ void __put_task_struct(struct task_struc free_task(tsk); } +#else + +void __put_task_struct(struct task_struct *tsk) +{ + WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); + BUG_ON(atomic_read(&tsk->usage)); + WARN_ON(tsk == current); + + security_task_free(tsk); + free_uid(tsk->user); + put_group_info(tsk->group_info); + + if (!profile_handoff_task(tsk)) + free_task(tsk); +} +#endif + void __init fork_init(unsigned long mempages) { + int i; + #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES @@ -159,6 +193,9 @@ void __init fork_init(unsigned long memp init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; + + for (i = 0; i < NR_CPUS; i++) + INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i)); } static struct task_struct *dup_task_struct(struct task_struct *orig) @@ -354,6 +391,7 @@ static struct mm_struct * mm_init(struct spin_lock_init(&mm->page_table_lock); rwlock_init(&mm->ioctx_list_lock); mm->ioctx_list = NULL; + INIT_LIST_HEAD(&mm->delayed_drop); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; @@ -393,6 +431,18 @@ void fastcall __mmdrop(struct mm_struct free_mm(mm); } +#ifdef CONFIG_PREEMPT_RT +static void ___mmdrop_rcu(struct rcu_head *head) +{ + __mmdrop(container_of(head, struct mm_struct, rcu_head)); +} + +void fastcall __mmdrop_rcu(struct mm_struct *mm) +{ + call_rcu_preempt_online(&mm->rcu_head, ___mmdrop_rcu); +} +#endif + /* * Decrement the use count and release all resources for an mm. 
*/ @@ -959,6 +1009,9 @@ static void rt_mutex_init_task(struct ta #ifdef CONFIG_RT_MUTEXES plist_head_init(&p->pi_waiters, &p->pi_lock); p->pi_blocked_on = NULL; +# ifdef CONFIG_DEBUG_RT_MUTEXES + p->last_kernel_lock = NULL; +# endif #endif } @@ -1010,7 +1063,7 @@ static struct task_struct *copy_process( rt_mutex_init_task(p); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif @@ -1045,6 +1098,16 @@ static struct task_struct *copy_process( copy_flags(clone_flags, p); INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); +#ifdef CONFIG_PREEMPT_RCU + p->rcu_read_lock_nesting = 0; + p->rcu_flipctr_idx = 0; +#ifdef CONFIG_PREEMPT_RCU_BOOST + p->rcu_prio = MAX_PRIO; + p->rcub_rbdp = NULL; + p->rcub_state = RCU_BOOST_IDLE; + INIT_LIST_HEAD(&p->rcub_entry); +#endif +#endif /* CONFIG_PREEMPT_RCU */ p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); @@ -1074,7 +1137,7 @@ static struct task_struct *copy_process( INIT_LIST_HEAD(&p->cpu_timers[0]); INIT_LIST_HEAD(&p->cpu_timers[1]); INIT_LIST_HEAD(&p->cpu_timers[2]); - + p->posix_timer_list = NULL; p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; @@ -1113,6 +1176,7 @@ static struct task_struct *copy_process( p->hardirq_context = 0; p->softirq_context = 0; #endif + p->pagefault_disabled = 0; #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; @@ -1150,6 +1214,22 @@ static struct task_struct *copy_process( retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_namespaces; +#ifdef CONFIG_DEBUG_PREEMPT + atomic_set(&p->lock_count, 0); +#endif + +#ifdef CONFIG_PREEMPT_RT + p->reader_lock_count = 0; + { + int i; + for (i = 0; i < MAX_RWLOCK_DEPTH; i++) { + INIT_LIST_HEAD(&p->owned_read_locks[i].list); + p->owned_read_locks[i].count = 0; + p->owned_read_locks[i].lock = NULL; + p->owned_read_locks[i].task = p; + } + } +#endif if (pid != &init_struct_pid) { retval = -ENOMEM; @@ -1236,10 +1316,13 @@ static struct task_struct *copy_process( * to ensure it is on a valid CPU (and if not, just force it back to * parent's CPU). This avoids alot of nasty races. 
*/ + preempt_disable(); p->cpus_allowed = current->cpus_allowed; + p->nr_cpus_allowed = current->nr_cpus_allowed; if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || !cpu_online(task_cpu(p)))) set_task_cpu(p, smp_processor_id()); + preempt_enable(); /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) @@ -1301,7 +1384,9 @@ static struct task_struct *copy_process( attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail_rcu(&p->tasks, &init_task.tasks); + preempt_disable(); __get_cpu_var(process_counts)++; + preempt_enable(); } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; @@ -1732,3 +1817,124 @@ bad_unshare_cleanup_thread: bad_unshare_out: return err; } + +static int mmdrop_complete(void) +{ + struct list_head *head; + int ret = 0; + + head = &get_cpu_var(delayed_drop_list); + while (!list_empty(head)) { + struct mm_struct *mm = list_entry(head->next, + struct mm_struct, delayed_drop); + list_del(&mm->delayed_drop); + put_cpu_var(delayed_drop_list); + + __mmdrop(mm); + ret = 1; + + head = &get_cpu_var(delayed_drop_list); + } + put_cpu_var(delayed_drop_list); + + return ret; +} + +/* + * We dont want to do complex work from the scheduler, thus + * we delay the work to a per-CPU worker thread: + */ +void fastcall __mmdrop_delayed(struct mm_struct *mm) +{ + struct task_struct *desched_task; + struct list_head *head; + + head = &get_cpu_var(delayed_drop_list); + list_add_tail(&mm->delayed_drop, head); + desched_task = __get_cpu_var(desched_task); + if (desched_task) + wake_up_process(desched_task); + put_cpu_var(delayed_drop_list); +} + +static int desched_thread(void * __bind_cpu) +{ + set_user_nice(current, -10); + current->flags |= PF_NOFREEZE | PF_SOFTIRQ; + + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + + if (mmdrop_complete()) + continue; + schedule(); + + /* + * This must be called from time to time on ia64, and is a + * no-op on other archs. Used to be in cpu_idle(), but with + * the new -rt semantics it can't stay there. + */ + check_pgt_cache(); + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int __devinit cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct task_struct *p; + + switch (action) { + case CPU_UP_PREPARE: + + BUG_ON(per_cpu(desched_task, hotcpu)); + INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu)); + p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu); + if (IS_ERR(p)) { + printk("desched_thread for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + per_cpu(desched_task, hotcpu) = p; + kthread_bind(p, hotcpu); + break; + case CPU_ONLINE: + + wake_up_process(per_cpu(desched_task, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + + /* Unbind so it can run. Fall thru. 
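The machinery above splits the teardown in two: __mmdrop_delayed() only links the mm onto a per-CPU list and wakes desched_task, and the kthread performs the real __mmdrop() outside the scheduling path. A compact pthread model of the same handoff, with userspace stand-ins rather than kernel code:

    #include <pthread.h>
    #include <stdlib.h>

    struct drop_item { struct drop_item *next; };

    static struct drop_item *delayed_list;
    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  list_wake = PTHREAD_COND_INITIALIZER;
    static int done;

    static void mmdrop_delayed_model(struct drop_item *it)
    {
        pthread_mutex_lock(&list_lock);
        it->next = delayed_list;            /* cheap: just queue it */
        delayed_list = it;
        pthread_cond_signal(&list_wake);    /* wake_up_process(desched_task) */
        pthread_mutex_unlock(&list_lock);
    }

    static void *desched_thread_model(void *arg)
    {
        pthread_mutex_lock(&list_lock);
        while (!done || delayed_list) {
            while (delayed_list) {          /* mmdrop_complete() */
                struct drop_item *it = delayed_list;
                delayed_list = it->next;
                pthread_mutex_unlock(&list_lock);
                free(it);                   /* stands in for __mmdrop() */
                pthread_mutex_lock(&list_lock);
            }
            if (!done)
                pthread_cond_wait(&list_wake, &list_lock);
        }
        pthread_mutex_unlock(&list_lock);
        return arg;
    }

    int main(void)
    {
        pthread_t worker;
        pthread_create(&worker, NULL, desched_thread_model, NULL);
        mmdrop_delayed_model(malloc(sizeof(struct drop_item)));
        pthread_mutex_lock(&list_lock);
        done = 1;
        pthread_cond_signal(&list_wake);
        pthread_mutex_unlock(&list_lock);
        pthread_join(worker, NULL);
        return 0;
    }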
*/
+		kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id());
+	case CPU_DEAD:
+
+		p = per_cpu(desched_task, hotcpu);
+		per_cpu(desched_task, hotcpu) = NULL;
+		kthread_stop(p);
+		takeover_tasklets(hotcpu);
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata cpu_nfb = {
+	.notifier_call = cpu_callback
+};
+
+__init int spawn_desched_task(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+
+	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+	register_cpu_notifier(&cpu_nfb);
+	return 0;
+}
+
Index: linux-2.6.24.7-rt27/kernel/sched_idletask.c
===================================================================
--- linux-2.6.24.7-rt27.orig/kernel/sched_idletask.c	2009-02-08 00:00:31.000000000 -0500
+++ linux-2.6.24.7-rt27/kernel/sched_idletask.c	2009-02-08 00:05:00.000000000 -0500
@@ -5,6 +5,12 @@
  * handled in sched_fair.c)
  */
+#ifdef CONFIG_SMP
+static int select_task_rq_idle(struct task_struct *p, int sync)
+{
+	return task_cpu(p); /* IDLE tasks are never migrated */
+}
+#endif /* CONFIG_SMP */
 /*
  * Idle tasks are unconditionally rescheduled:
  */
@@ -25,7 +31,7 @@ static struct task_struct *pick_next_tas
  * message if some code attempts to do it:
  */
 static void
-dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
+dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
 {
 	spin_unlock_irq(&rq->lock);
 	printk(KERN_ERR "bad: scheduling from the idle thread!\n");
@@ -42,7 +48,7 @@ static unsigned long
 load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *all_pinned, int *this_best_prio)
+		  int *lb_flags, int *this_best_prio)
 {
 	return 0;
 }
@@ -53,6 +59,12 @@ move_one_task_idle(struct rq *this_rq, i
 {
 	return 0;
 }
+
+static int
+is_runnable_idle(struct rq *this_rq)
+{
+	return 1;
+}
 #endif
 
 static void task_tick_idle(struct rq *rq, struct task_struct *curr)
@@ -63,6 +75,33 @@ static void set_curr_task_idle(struct rq
 {
 }
 
+static void switched_to_idle(struct rq *rq, struct task_struct *p,
+			     int running)
+{
+	/* Can this actually happen?
*/ + if (running) + resched_task(rq->curr); + else + check_preempt_curr(rq, p); +} + +static void prio_changed_idle(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ + /* This can happen for hot plug CPUS */ + + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (running) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else + check_preempt_curr(rq, p); +} + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -72,6 +111,9 @@ const struct sched_class idle_sched_clas /* dequeue is not valid, we print a debug message there: */ .dequeue_task = dequeue_task_idle, +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_idle, +#endif /* CONFIG_SMP */ .check_preempt_curr = check_preempt_curr_idle, @@ -81,9 +123,14 @@ const struct sched_class idle_sched_clas #ifdef CONFIG_SMP .load_balance = load_balance_idle, .move_one_task = move_one_task_idle, + .is_runnable = is_runnable_idle, #endif .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, + + .prio_changed = prio_changed_idle, + .switched_to = switched_to_idle, + /* no .task_new for idle tasks */ }; Index: linux-2.6.24.7-rt27/include/linux/topology.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/linux/topology.h 2009-02-08 00:00:28.000000000 -0500 +++ linux-2.6.24.7-rt27/include/linux/topology.h 2009-02-08 00:01:06.000000000 -0500 @@ -5,7 +5,7 @@ * * Copyright (C) 2002, IBM Corp. * - * All rights reserved. + * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by Index: linux-2.6.24.7-rt27/kernel/Makefile =================================================================== --- linux-2.6.24.7-rt27.orig/kernel/Makefile 2009-02-08 00:00:28.000000000 -0500 +++ linux-2.6.24.7-rt27/kernel/Makefile 2009-02-08 00:04:46.000000000 -0500 @@ -7,14 +7,30 @@ obj-y = sched.o fork.o exec_domain.o sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o \ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \ utsname.o notifier.o +CFLAGS_REMOVE_sched.o = -mno-spe + +ifdef CONFIG_FTRACE +# Do not trace debug files and internal ftrace files +CFLAGS_REMOVE_lockdep.o = -pg +CFLAGS_REMOVE_lockdep_proc.o = -pg +CFLAGS_REMOVE_mutex-debug.o = -pg +CFLAGS_REMOVE_rtmutex-debug.o = -pg +CFLAGS_REMOVE_cgroup-debug.o = -pg +CFLAGS_REMOVE_sched_clock.o = -pg +CFLAGS_REMOVE_marker.o = -pg +endif + obj-$(CONFIG_SYSCTL) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ +ifneq ($(CONFIG_PREEMPT_RT),y) +obj-y += mutex.o obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +endif obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o @@ -26,6 +42,7 @@ endif obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o +obj-$(CONFIG_PREEMPT_RT) += rt.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o @@ -51,12 +68,22 @@ obj-$(CONFIG_SYSFS) += ksysfs.o obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o 
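idle_sched_class above is one more instance of the sched_class shape: a const table of function pointers selected per task, where optional hooks such as .select_task_rq may be left out entirely and tested for NULL by callers. A toy version of that dispatch pattern, with invented names:

    #include <stdio.h>

    struct rq_model;
    struct class_model {
        const char *name;
        void (*task_tick)(struct rq_model *rq);
        int  (*select_cpu)(void);   /* optional hook, may be NULL */
    };

    static void idle_tick(struct rq_model *rq) { (void)rq; puts("idle tick"); }

    static const struct class_model idle_class_model = {
        .name      = "idle",
        .task_tick = idle_tick,
        /* .select_cpu left NULL: this class never migrates tasks */
    };

    int main(void)
    {
        idle_class_model.task_tick(NULL);
        if (idle_class_model.select_cpu)    /* caller checks optional hooks */
            printf("cpu %d\n", idle_class_model.select_cpu());
        return 0;
    }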
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o +obj-$(CONFIG_RWLOCK_TORTURE_TEST) += rwlock_torture.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o +obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o +obj-$(CONFIG_PREEMPT_RCU) += rcuclassic.o rcupreempt.o +obj-$(CONFIG_PREEMPT_RCU_BOOST) += rcupreempt-boost.o +ifeq ($(CONFIG_PREEMPT_RCU),y) +obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o +endif obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o +obj-$(CONFIG_FTRACE) += trace/ +obj-$(CONFIG_TRACING) += trace/ +obj-$(CONFIG_SMP) += sched_cpupri.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is Index: linux-2.6.24.7-rt27/kernel/sched_cpupri.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/sched_cpupri.c 2009-02-08 00:01:08.000000000 -0500 @@ -0,0 +1,174 @@ +/* + * kernel/sched_cpupri.c + * + * CPU priority management + * + * Copyright (C) 2007 Novell + * + * Author: Gregory Haskins + * + * This code tracks the priority of each CPU so that global migration + * decisions are easy to calculate. Each CPU can be in a state as follows: + * + * (INVALID), IDLE, NORMAL, RT1, ... RT99 + * + * going from the lowest priority to the highest. CPUs in the INVALID state + * are not eligible for routing. The system maintains this state with + * a 2 dimensional bitmap (the first for priority class, the second for cpus + * in that class). Therefore a typical application without affinity + * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit + * searches). For tasks with affinity restrictions, the algorithm has a + * worst case complexity of O(min(102, nr_domcpus)), though the scenario that + * yields the worst case search is fairly contrived. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include "sched_cpupri.h" + +/* Convert between a 140 based task->prio, and our 102 based cpupri */ +static int convert_prio(int prio) +{ + int cpupri; + + if (prio == CPUPRI_INVALID) + cpupri = CPUPRI_INVALID; + else if (prio == MAX_PRIO) + cpupri = CPUPRI_IDLE; + else if (prio >= MAX_RT_PRIO) + cpupri = CPUPRI_NORMAL; + else + cpupri = MAX_RT_PRIO - prio + 1; + + return cpupri; +} + +#define for_each_cpupri_active(array, idx) \ + for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ + idx < CPUPRI_NR_PRIORITIES; \ + idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1)) + +/** + * cpupri_find - find the best (lowest-pri) CPU in the system + * @cp: The cpupri context + * @p: The task + * @lowest_mask: A mask to fill in with selected CPUs + * + * Note: This function returns the recommended CPUs as calculated during the + * current invokation. By the time the call returns, the CPUs may have in + * fact changed priorities any number of times. While not ideal, it is not + * an issue of correctness since the normal rebalancer logic will correct + * any discrepancies created by racing against the uncertainty of the current + * priority configuration. 
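convert_prio() above maps the kernel's 140-level prio space onto the 102 cpupri levels. The standalone check below copies its logic next to the relevant constants (MAX_RT_PRIO is 100 and MAX_PRIO is 140 in this tree) and verifies a few sample points:

    #include <assert.h>

    enum { CPUPRI_INVALID = -1, CPUPRI_IDLE = 0, CPUPRI_NORMAL = 1 };
    #define MAX_RT_PRIO 100
    #define MAX_PRIO    140

    static int convert_prio(int prio)
    {
        if (prio == CPUPRI_INVALID) return CPUPRI_INVALID;
        if (prio == MAX_PRIO)       return CPUPRI_IDLE;
        if (prio >= MAX_RT_PRIO)    return CPUPRI_NORMAL;
        return MAX_RT_PRIO - prio + 1;   /* RT: 2 (weakest) .. 101 (RT99) */
    }

    int main(void)
    {
        assert(convert_prio(MAX_PRIO) == CPUPRI_IDLE);  /* idle task        */
        assert(convert_prio(120) == CPUPRI_NORMAL);     /* nice-0 task      */
        assert(convert_prio(99)  == 2);                 /* weakest RT prio  */
        assert(convert_prio(0)   == 101);               /* RT99, strongest  */
        return 0;
    }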
+ * + * Returns: (int)bool - CPUs were found + */ +int cpupri_find(struct cpupri *cp, struct task_struct *p, + cpumask_t *lowest_mask) +{ + int idx = 0; + int task_pri = convert_prio(p->prio); + + for_each_cpupri_active(cp->pri_active, idx) { + struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; + cpumask_t mask; + + if (idx >= task_pri) + break; + + cpus_and(mask, p->cpus_allowed, vec->mask); + + if (cpus_empty(mask)) + continue; + + *lowest_mask = mask; + return 1; + } + + return 0; +} + +/** + * cpupri_set - update the cpu priority setting + * @cp: The cpupri context + * @cpu: The target cpu + * @pri: The priority (INVALID-RT99) to assign to this CPU + * + * Note: Assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpupri_set(struct cpupri *cp, int cpu, int newpri) +{ + int *currpri = &cp->cpu_to_pri[cpu]; + int oldpri = *currpri; + unsigned long flags; + + newpri = convert_prio(newpri); + + BUG_ON(newpri >= CPUPRI_NR_PRIORITIES); + + if (newpri == oldpri) + return; + + /* + * If the cpu was currently mapped to a different value, we + * first need to unmap the old value + */ + if (likely(oldpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; + + spin_lock_irqsave(&vec->lock, flags); + + vec->count--; + if (!vec->count) + clear_bit(oldpri, cp->pri_active); + cpu_clear(cpu, vec->mask); + + spin_unlock_irqrestore(&vec->lock, flags); + } + + if (likely(newpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; + + spin_lock_irqsave(&vec->lock, flags); + + cpu_set(cpu, vec->mask); + vec->count++; + if (vec->count == 1) + set_bit(newpri, cp->pri_active); + + spin_unlock_irqrestore(&vec->lock, flags); + } + + *currpri = newpri; +} + +/** + * cpupri_init - initialize the cpupri structure + * @cp: The cpupri context + * + * Returns: (void) + */ +void cpupri_init(struct cpupri *cp) +{ + int i; + + memset(cp, 0, sizeof(*cp)); + + for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { + struct cpupri_vec *vec = &cp->pri_to_cpu[i]; + + spin_lock_init(&vec->lock); + vec->count = 0; + cpus_clear(vec->mask); + } + + for_each_possible_cpu(i) + cp->cpu_to_pri[i] = CPUPRI_INVALID; +} + + Index: linux-2.6.24.7-rt27/kernel/sched_cpupri.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/sched_cpupri.h 2009-02-08 00:05:26.000000000 -0500 @@ -0,0 +1,36 @@ +#ifndef _LINUX_CPUPRI_H +#define _LINUX_CPUPRI_H + +#include + +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) +#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES) + +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 +/* values 2-101 are RT priorities 0-99 */ + +struct cpupri_vec { + raw_spinlock_t lock; + int count; + cpumask_t mask; +}; + +struct cpupri { + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + long pri_active[CPUPRI_NR_PRI_WORDS]; + int cpu_to_pri[NR_CPUS]; +}; + +#ifdef CONFIG_SMP +int cpupri_find(struct cpupri *cp, + struct task_struct *p, cpumask_t *lowest_mask); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +void cpupri_init(struct cpupri *cp); +#else +#define cpupri_set(cp, cpu, pri) do { } while (0) +#define cpupri_init() do { } while (0) +#endif + +#endif /* _LINUX_CPUPRI_H */ Index: linux-2.6.24.7-rt27/include/linux/immediate.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/include/linux/immediate.h 2009-02-08 00:01:09.000000000 -0500 @@ -0,0 
+1,97 @@ +#ifndef _LINUX_IMMEDIATE_H +#define _LINUX_IMMEDIATE_H + +/* + * Immediate values, can be updated at runtime and save cache lines. + * + * (C) Copyright 2007 Mathieu Desnoyers + * + * This file is released under the GPLv2. + * See the file COPYING for more details. + */ + +#ifdef CONFIG_IMMEDIATE + +#include + +/** + * imv_set - set immediate variable (with locking) + * @name: immediate value name + * @i: required value + * + * Sets the value of @name, taking the module_mutex if required by + * the architecture. + */ +#define imv_set(name, i) \ + do { \ + name##__imv = (i); \ + core_imv_update(); \ + module_imv_update(); \ + } while (0) + +/* + * Internal update functions. + */ +extern void core_imv_update(void); +extern void imv_update_range(struct __imv *begin, struct __imv *end); +extern void imv_unref_core_init(void); +extern void imv_unref(struct __imv *begin, struct __imv *end, void *start, + unsigned long size); +extern int _is_imv_cond_end(unsigned long *begin, unsigned long *end, + unsigned long addr1, unsigned long addr2); +extern int is_imv_cond_end(unsigned long addr1, unsigned long addr2); + +#else + +/* + * Generic immediate values: a simple, standard, memory load. + */ + +/** + * imv_read - read immediate variable + * @name: immediate value name + * + * Reads the value of @name. + */ +#define imv_read(name) _imv_read(name) + +/** + * imv_cond - read immediate variable use as condition for if() + * @name: immediate value name + * + * Reads the value of @name. + */ +#define imv_cond(name) _imv_read(name) +#define imv_cond_end() + +/** + * imv_set - set immediate variable (with locking) + * @name: immediate value name + * @i: required value + * + * Sets the value of @name, taking the module_mutex if required by + * the architecture. + */ +#define imv_set(name, i) (name##__imv = (i)) + +static inline void core_imv_update(void) { } +static inline void imv_unref_core_init(void) { } + +#endif + +#define DECLARE_IMV(type, name) extern __typeof__(type) name##__imv +#define DEFINE_IMV(type, name) __typeof__(type) name##__imv + +#define EXPORT_IMV_SYMBOL(name) EXPORT_SYMBOL(name##__imv) +#define EXPORT_IMV_SYMBOL_GPL(name) EXPORT_SYMBOL_GPL(name##__imv) + +/** + * _imv_read - Read immediate value with standard memory load. + * @name: immediate value name + * + * Force a data read of the immediate value instead of the immediate value + * based mechanism. Useful for __init and __exit section data read. + */ +#define _imv_read(name) (name##__imv) + +#endif Index: linux-2.6.24.7-rt27/include/linux/marker.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/linux/marker.h 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/include/linux/marker.h 2009-02-08 00:01:09.000000000 -0500 @@ -12,6 +12,7 @@ * See the file COPYING for more details. */ +#include #include struct module; @@ -19,25 +20,35 @@ struct marker; /** * marker_probe_func - Type of a marker probe function - * @mdata: pointer of type struct marker - * @private_data: caller site private data + * @probe_private: probe private data + * @call_private: call site private data * @fmt: format string - * @...: variable argument list + * @args: variable argument list pointer. Use a pointer to overcome C's + * inability to pass this around as a pointer in a portable manner in + * the callee otherwise. * * Type of marker probe functions. They receive the mdata and need to parse the * format string to recover the variable argument list. 
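Without CONFIG_IMMEDIATE, the facility above degenerates to a name-mangled global plus trivial accessor macros, exactly as the generic branch of the header defines them. The following standalone file models that fallback:

    #include <stdio.h>

    /* The generic (no code patching) immediate-value fallback: */
    #define DECLARE_IMV(type, name) extern type name##__imv
    #define DEFINE_IMV(type, name)  type name##__imv
    #define imv_read(name)          (name##__imv)
    #define imv_set(name, i)        (name##__imv = (i))

    DEFINE_IMV(char, tracing_on) = 0;   /* expands to: char tracing_on__imv */

    int main(void)
    {
        imv_set(tracing_on, 1);
        if (imv_read(tracing_on))       /* a plain memory load here */
            puts("marker site enabled");
        return 0;
    }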
*/ -typedef void marker_probe_func(const struct marker *mdata, - void *private_data, const char *fmt, ...); +typedef void marker_probe_func(void *probe_private, void *call_private, + const char *fmt, va_list *args); + +struct marker_probe_closure { + marker_probe_func *func; /* Callback */ + void *probe_private; /* Private probe data */ +}; struct marker { const char *name; /* Marker name */ const char *format; /* Marker format string, describing the * variable argument list. */ - char state; /* Marker state. */ - marker_probe_func *call;/* Probe handler function pointer */ - void *private; /* Private probe data */ + DEFINE_IMV(char, state);/* Immediate value state. */ + char ptype; /* probe type : 0 : single, 1 : multi */ + /* Probe wrapper */ + void (*call)(const struct marker *mdata, void *call_private, ...); + struct marker_probe_closure single; + struct marker_probe_closure *multi; } __attribute__((aligned(8))); #ifdef CONFIG_MARKERS @@ -48,51 +59,73 @@ struct marker { * Make sure the alignment of the structure in the __markers section will * not add unwanted padding between the beginning of the section and the * structure. Force alignment to the same alignment as the section start. + * + * The "generic" argument controls which marker enabling mechanism must be used. + * If generic is true, a variable read is used. + * If generic is false, immediate values are used. */ -#define __trace_mark(name, call_data, format, args...) \ +#define __trace_mark(generic, name, call_private, format, args...) \ do { \ - static const char __mstrtab_name_##name[] \ + static const char __mstrtab_##name[] \ __attribute__((section("__markers_strings"))) \ - = #name; \ - static const char __mstrtab_format_##name[] \ - __attribute__((section("__markers_strings"))) \ - = format; \ + = #name "\0" format; \ static struct marker __mark_##name \ __attribute__((section("__markers"), aligned(8))) = \ - { __mstrtab_name_##name, __mstrtab_format_##name, \ - 0, __mark_empty_function, NULL }; \ + { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \ + 0, 0, marker_probe_cb, \ + { __mark_empty_function, NULL}, NULL }; \ __mark_check_format(format, ## args); \ - if (unlikely(__mark_##name.state)) { \ - preempt_disable(); \ - (*__mark_##name.call) \ - (&__mark_##name, call_data, \ - format, ## args); \ - preempt_enable(); \ + if (!generic) { \ + if (unlikely(imv_cond(__mark_##name.state))) { \ + imv_cond_end(); \ + (*__mark_##name.call) \ + (&__mark_##name, call_private, \ + ## args); \ + } else \ + imv_cond_end(); \ + } else { \ + if (unlikely(_imv_read(__mark_##name.state))) \ + (*__mark_##name.call) \ + (&__mark_##name, call_private, \ + ## args); \ } \ } while (0) extern void marker_update_probe_range(struct marker *begin, - struct marker *end, struct module *probe_module, int *refcount); + struct marker *end); #else /* !CONFIG_MARKERS */ -#define __trace_mark(name, call_data, format, args...) \ +#define __trace_mark(generic, name, call_private, format, args...) \ __mark_check_format(format, ## args) static inline void marker_update_probe_range(struct marker *begin, - struct marker *end, struct module *probe_module, int *refcount) + struct marker *end) { } #endif /* CONFIG_MARKERS */ /** - * trace_mark - Marker + * trace_mark - Marker using code patching * @name: marker name, not quoted. * @format: format string * @args...: variable argument list * - * Places a marker. + * Places a marker using optimized code patching technique (imv_read()) + * to be enabled when immediate values are present. 
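The new marker_probe_func type above receives a va_list pointer so that one argument pack can be handed to several probes in turn. A sketch of a conforming probe, with a stand-in for the call wrapper; in the kernel the real wrapper is marker_probe_cb() in kernel/marker.c:

    #include <stdarg.h>
    #include <stdio.h>

    typedef void marker_probe_func(void *probe_private, void *call_private,
                                   const char *fmt, va_list *args);

    static void my_probe(void *probe_private, void *call_private,
                         const char *fmt, va_list *args)
    {
        (void)probe_private; (void)call_private;
        vprintf(fmt, *args);        /* consume the forwarded argument list */
    }

    /* Stand-in for the marker call wrapper: builds the va_list once. */
    static void call_probe(marker_probe_func *func, const char *fmt, ...)
    {
        va_list args;
        va_start(args, fmt);
        func(NULL, NULL, fmt, &args);
        va_end(args);
    }

    int main(void)
    {
        call_probe(my_probe, "irq %d handled in %lu ns\n", 17, 4200UL);
        return 0;
    }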
*/ #define trace_mark(name, format, args...) \ - __trace_mark(name, NULL, format, ## args) + __trace_mark(0, name, NULL, format, ## args) -#define MARK_MAX_FORMAT_LEN 1024 +/** + * _trace_mark - Marker using variable read + * @name: marker name, not quoted. + * @format: format string + * @args...: variable argument list + * + * Places a marker using a standard memory read (_imv_read()) to be + * enabled. Should be used for markers in code paths where instruction + * modification based enabling is not welcome. (__init and __exit functions, + * lockdep, some traps, printk). + */ +#define _trace_mark(name, format, args...) \ + __trace_mark(1, name, NULL, format, ## args) /** * MARK_NOARGS - Format string for a marker with no argument. @@ -100,30 +133,42 @@ static inline void marker_update_probe_r #define MARK_NOARGS " " /* To be used for string format validity checking with gcc */ -static inline void __printf(1, 2) __mark_check_format(const char *fmt, ...) +static inline void __printf(1, 2) ___mark_check_format(const char *fmt, ...) { } +#define __mark_check_format(format, args...) \ + do { \ + if (0) \ + ___mark_check_format(format, ## args); \ + } while (0) + extern marker_probe_func __mark_empty_function; +extern void marker_probe_cb(const struct marker *mdata, + void *call_private, ...); +extern void marker_probe_cb_noarg(const struct marker *mdata, + void *call_private, ...); + /* * Connect a probe to a marker. * private data pointer must be a valid allocated memory address, or NULL. */ extern int marker_probe_register(const char *name, const char *format, - marker_probe_func *probe, void *private); + marker_probe_func *probe, void *probe_private); /* * Returns the private data given to marker_probe_register. */ -extern void *marker_probe_unregister(const char *name); +extern int marker_probe_unregister(const char *name, + marker_probe_func *probe, void *probe_private); /* * Unregister a marker by providing the registered private data. */ -extern void *marker_probe_unregister_private_data(void *private); +extern int marker_probe_unregister_private_data(marker_probe_func *probe, + void *probe_private); -extern int marker_arm(const char *name); -extern int marker_disarm(const char *name); -extern void *marker_get_private_data(const char *name); +extern void *marker_get_private_data(const char *name, marker_probe_func *probe, + int num); #endif Index: linux-2.6.24.7-rt27/include/linux/module.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/linux/module.h 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/include/linux/module.h 2009-02-08 00:01:09.000000000 -0500 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -355,6 +356,12 @@ struct module /* The command line arguments (may be mangled). People like keeping pointers to this stuff */ char *args; +#ifdef CONFIG_IMMEDIATE + struct __imv *immediate; + unsigned int num_immediate; + unsigned long *immediate_cond_end; + unsigned int num_immediate_cond_end; +#endif #ifdef CONFIG_MARKERS struct marker *markers; unsigned int num_markers; @@ -462,7 +469,7 @@ int unregister_module_notifier(struct no extern void print_modules(void); -extern void module_update_markers(struct module *probe_module, int *refcount); +extern void module_update_markers(void); #else /* !CONFIG_MODULES... 
*/ #define EXPORT_SYMBOL(sym) @@ -563,13 +570,30 @@ static inline void print_modules(void) { } -static inline void module_update_markers(struct module *probe_module, - int *refcount) +static inline void module_update_markers(void) { } #endif /* CONFIG_MODULES */ +#if defined(CONFIG_MODULES) && defined(CONFIG_IMMEDIATE) +extern void _module_imv_update(void); +extern void module_imv_update(void); +extern int is_imv_cond_end_module(unsigned long addr1, unsigned long addr2); +#else +static inline void _module_imv_update(void) +{ +} +static inline void module_imv_update(void) +{ +} +static inline int is_imv_cond_end_module(unsigned long addr1, + unsigned long addr2) +{ + return 0; +} +#endif + struct device_driver; #ifdef CONFIG_SYSFS struct module; Index: linux-2.6.24.7-rt27/kernel/marker.c =================================================================== --- linux-2.6.24.7-rt27.orig/kernel/marker.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/kernel/marker.c 2009-02-08 00:01:09.000000000 -0500 @@ -23,39 +23,48 @@ #include #include #include +#include +#include extern struct marker __start___markers[]; extern struct marker __stop___markers[]; +/* Set to 1 to enable marker debug output */ +static const int marker_debug; + /* * markers_mutex nests inside module_mutex. Markers mutex protects the builtin - * and module markers, the hash table and deferred_sync. + * and module markers and the hash table. */ static DEFINE_MUTEX(markers_mutex); /* - * Marker deferred synchronization. - * Upon marker probe_unregister, we delay call to synchronize_sched() to - * accelerate mass unregistration (only when there is no more reference to a - * given module do we call synchronize_sched()). However, we need to make sure - * every critical region has ended before we re-arm a marker that has been - * unregistered and then registered back with a different probe data. - */ -static int deferred_sync; - -/* * Marker hash table, containing the active markers. * Protected by module_mutex. */ #define MARKER_HASH_BITS 6 #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) +/* + * Note about RCU : + * It is used to make sure every handler has finished using its private data + * between two consecutive operation (add or remove) on a given marker. It is + * also used to delay the free of multiple probes array until a quiescent state + * is reached. + * marker entries modifications are protected by the markers_mutex. + */ struct marker_entry { struct hlist_node hlist; char *format; - marker_probe_func *probe; - void *private; + /* Probe wrapper */ + void (*call)(const struct marker *mdata, void *call_private, ...); + struct marker_probe_closure single; + struct marker_probe_closure *multi; int refcount; /* Number of times armed. 0 if disarmed. */ + struct rcu_head rcu; + void *oldptr; + unsigned char rcu_pending:1; + unsigned char ptype:1; char name[0]; /* Contains name'\0'format'\0' */ }; @@ -63,7 +72,8 @@ static struct hlist_head marker_table[MA /** * __mark_empty_function - Empty probe callback - * @mdata: pointer of type const struct marker + * @probe_private: probe private data + * @call_private: call site private data * @fmt: format string * @...: variable argument list * @@ -72,13 +82,265 @@ static struct hlist_head marker_table[MA * though the function pointer change and the marker enabling are two distinct * operations that modifies the execution flow of preemptible code. */ -void __mark_empty_function(const struct marker *mdata, void *private, - const char *fmt, ...) 
+void __mark_empty_function(void *probe_private, void *call_private,
+		const char *fmt, va_list *args)
 {
 }
 EXPORT_SYMBOL_GPL(__mark_empty_function);
 
 /*
+ * marker_probe_cb - Callback that prepares the variable argument list for probes.
+ * @mdata: pointer of type struct marker
+ * @call_private: caller site private data
+ * @...: Variable argument list.
+ *
+ * Since we do not use "typical" pointer based RCU in the 1 argument case, we
+ * need to put a full smp_rmb() in this branch. This is why we do not use
+ * rcu_dereference() for the pointer read.
+ */
+void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
+{
+	va_list args;
+	char ptype;
+
+	/*
+	 * preempt_disable does two things: it makes sure the teardown of
+	 * the callbacks can be done correctly when they are in modules,
+	 * and it ensures RCU read coherency.
+	 */
+	preempt_disable();
+	ptype = mdata->ptype;
+	if (likely(!ptype)) {
+		marker_probe_func *func;
+		/* Must read the ptype before ptr. They are not data dependent,
+		 * so we put an explicit smp_rmb() here. */
+		smp_rmb();
+		func = mdata->single.func;
+		/* Must read the ptr before private data. They are not data
+		 * dependent, so we put an explicit smp_rmb() here. */
+		smp_rmb();
+		va_start(args, call_private);
+		func(mdata->single.probe_private, call_private, mdata->format,
+			&args);
+		va_end(args);
+	} else {
+		struct marker_probe_closure *multi;
+		int i;
+		/*
+		 * multi points to an array, therefore accessing the array
+		 * depends on reading multi. However, even in this case,
+		 * we must ensure that the pointer is read _before_ the array
+		 * data. Same as rcu_dereference, but we need a full smp_rmb()
+		 * in the fast path, so put the explicit barrier here.
+		 */
+		smp_read_barrier_depends();
+		multi = mdata->multi;
+		for (i = 0; multi[i].func; i++) {
+			va_start(args, call_private);
+			multi[i].func(multi[i].probe_private, call_private,
+				mdata->format, &args);
+			va_end(args);
+		}
+	}
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(marker_probe_cb);
+
+/*
+ * marker_probe_cb_noarg - Callback that does not prepare the variable argument list.
+ * @mdata: pointer of type struct marker
+ * @call_private: caller site private data
+ * @...: Variable argument list.
+ *
+ * Should be connected to markers "MARK_NOARGS".
+ */
+void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
+{
+	va_list args;	/* not initialized */
+	char ptype;
+
+	preempt_disable();
+	ptype = mdata->ptype;
+	if (likely(!ptype)) {
+		marker_probe_func *func;
+		/* Must read the ptype before ptr. They are not data dependent,
+		 * so we put an explicit smp_rmb() here. */
+		smp_rmb();
+		func = mdata->single.func;
+		/* Must read the ptr before private data. They are not data
+		 * dependent, so we put an explicit smp_rmb() here. */
+		smp_rmb();
+		func(mdata->single.probe_private, call_private, mdata->format,
+			&args);
+	} else {
+		struct marker_probe_closure *multi;
+		int i;
+		/*
+		 * multi points to an array, therefore accessing the array
+		 * depends on reading multi. However, even in this case,
+		 * we must ensure that the pointer is read _before_ the array
+		 * data. Same as rcu_dereference, but we need a full smp_rmb()
+		 * in the fast path, so put the explicit barrier here.
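The read-side barriers spelled out in the comments above pair with write-side barriers in the update path: the probe's private data must be published before the function pointer that will consume it. A C11 model of that ordering contract, with release/acquire standing in for the smp_wmb()/smp_rmb() pairs:

    #include <stdatomic.h>
    #include <assert.h>

    typedef void probe_fn(void *priv);

    static void new_probe(void *priv) { assert(*(int *)priv == 42); }

    static int new_priv_value = 42;
    static void *_Atomic probe_private;
    static probe_fn *_Atomic probe;

    static void writer(void)
    {
        /* publish the data first ... */
        atomic_store_explicit(&probe_private, &new_priv_value,
                              memory_order_release);
        /* ... then the pointer that makes it reachable */
        atomic_store_explicit(&probe, new_probe, memory_order_release);
    }

    static void reader(void)
    {
        /* read the pointer first; the acquire guarantees the data
         * stored before the matching release is visible afterwards */
        probe_fn *fn = atomic_load_explicit(&probe, memory_order_acquire);
        if (fn)
            fn(atomic_load_explicit(&probe_private, memory_order_acquire));
    }

    int main(void)
    {
        writer();
        reader();
        return 0;
    }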
+ */ + smp_read_barrier_depends(); + multi = mdata->multi; + for (i = 0; multi[i].func; i++) + multi[i].func(multi[i].probe_private, call_private, + mdata->format, &args); + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); + +static void free_old_closure(struct rcu_head *head) +{ + struct marker_entry *entry = container_of(head, + struct marker_entry, rcu); + kfree(entry->oldptr); + /* Make sure we free the data before setting the pending flag to 0 */ + smp_wmb(); + entry->rcu_pending = 0; +} + +static void debug_print_probes(struct marker_entry *entry) +{ + int i; + + if (!marker_debug) + return; + + if (!entry->ptype) { + printk(KERN_DEBUG "Single probe : %p %p\n", + entry->single.func, + entry->single.probe_private); + } else { + for (i = 0; entry->multi[i].func; i++) + printk(KERN_DEBUG "Multi probe %d : %p %p\n", i, + entry->multi[i].func, + entry->multi[i].probe_private); + } +} + +static struct marker_probe_closure * +marker_entry_add_probe(struct marker_entry *entry, + marker_probe_func *probe, void *probe_private) +{ + int nr_probes = 0; + struct marker_probe_closure *old, *new; + + WARN_ON(!probe); + + debug_print_probes(entry); + old = entry->multi; + if (!entry->ptype) { + if (entry->single.func == probe && + entry->single.probe_private == probe_private) + return ERR_PTR(-EBUSY); + if (entry->single.func == __mark_empty_function) { + /* 0 -> 1 probes */ + entry->single.func = probe; + entry->single.probe_private = probe_private; + entry->refcount = 1; + entry->ptype = 0; + debug_print_probes(entry); + return NULL; + } else { + /* 1 -> 2 probes */ + nr_probes = 1; + old = NULL; + } + } else { + /* (N -> N+1), (N != 0, 1) probes */ + for (nr_probes = 0; old[nr_probes].func; nr_probes++) + if (old[nr_probes].func == probe + && old[nr_probes].probe_private + == probe_private) + return ERR_PTR(-EBUSY); + } + /* + 2 : one for new probe, one for NULL func */ + new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure), + GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + if (!old) + new[0] = entry->single; + else + memcpy(new, old, + nr_probes * sizeof(struct marker_probe_closure)); + new[nr_probes].func = probe; + new[nr_probes].probe_private = probe_private; + entry->refcount = nr_probes + 1; + entry->multi = new; + entry->ptype = 1; + debug_print_probes(entry); + return old; +} + +static struct marker_probe_closure * +marker_entry_remove_probe(struct marker_entry *entry, + marker_probe_func *probe, void *probe_private) +{ + int nr_probes = 0, nr_del = 0, i; + struct marker_probe_closure *old, *new; + + old = entry->multi; + + debug_print_probes(entry); + if (!entry->ptype) { + /* 0 -> N is an error */ + WARN_ON(entry->single.func == __mark_empty_function); + /* 1 -> 0 probes */ + WARN_ON(probe && entry->single.func != probe); + WARN_ON(entry->single.probe_private != probe_private); + entry->single.func = __mark_empty_function; + entry->refcount = 0; + entry->ptype = 0; + debug_print_probes(entry); + return NULL; + } else { + /* (N -> M), (N > 1, M >= 0) probes */ + for (nr_probes = 0; old[nr_probes].func; nr_probes++) { + if ((!probe || old[nr_probes].func == probe) + && old[nr_probes].probe_private + == probe_private) + nr_del++; + } + } + + if (nr_probes - nr_del == 0) { + /* N -> 0, (N > 1) */ + entry->single.func = __mark_empty_function; + entry->refcount = 0; + entry->ptype = 0; + } else if (nr_probes - nr_del == 1) { + /* N -> 1, (N > 1) */ + for (i = 0; old[i].func; i++) + if ((probe && old[i].func != probe) || + old[i].probe_private != 
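free_old_closure() above clears rcu_pending only after the kfree(), with an smp_wmb() in between, so a later update that tests the flag can never see it cleared while the free is still in flight. Every mutation path therefore starts with the same guard; roughly (a sketch of the cycle, condensed from marker_probe_register() below):

        /* sketch: synchronize with a possibly-pending deferred free */
        if (entry->rcu_pending)
                rcu_barrier();          /* wait for free_old_closure()  */
        old = marker_entry_add_probe(entry, probe, probe_private);
        /* ... marker_update_probes() runs here, outside markers_mutex ... */
        entry->oldptr = old;            /* queue the replaced array     */
        entry->rcu_pending = 1;
        smp_wmb();                      /* flag visible before callback */
        call_rcu(&entry->rcu, free_old_closure);

Since each entry embeds a single rcu_head and oldptr slot, at most one free can be pending per marker; the rcu_barrier() is what makes back-to-back register/unregister calls on the same marker safe.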
probe_private) + entry->single = old[i]; + entry->refcount = 1; + entry->ptype = 0; + } else { + int j = 0; + /* N -> M, (N > 1, M > 1) */ + /* + 1 for NULL */ + new = kzalloc((nr_probes - nr_del + 1) + * sizeof(struct marker_probe_closure), GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + for (i = 0; old[i].func; i++) + if ((probe && old[i].func != probe) || + old[i].probe_private != probe_private) + new[j++] = old[i]; + entry->refcount = nr_probes - nr_del; + entry->ptype = 1; + entry->multi = new; + } + debug_print_probes(entry); + return old; +} + +/* * Get marker if the marker is present in the marker hash table. * Must be called with markers_mutex held. * Returns NULL if not present. @@ -102,8 +364,7 @@ static struct marker_entry *get_marker(c * Add the marker to the marker hash table. Must be called with markers_mutex * held. */ -static int add_marker(const char *name, const char *format, - marker_probe_func *probe, void *private) +static struct marker_entry *add_marker(const char *name, const char *format) { struct hlist_head *head; struct hlist_node *node; @@ -118,9 +379,8 @@ static int add_marker(const char *name, hlist_for_each_entry(e, node, head, hlist) { if (!strcmp(name, e->name)) { printk(KERN_NOTICE - "Marker %s busy, probe %p already installed\n", - name, e->probe); - return -EBUSY; /* Already there */ + "Marker %s busy\n", name); + return ERR_PTR(-EBUSY); /* Already there */ } } /* @@ -130,34 +390,42 @@ static int add_marker(const char *name, e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, GFP_KERNEL); if (!e) - return -ENOMEM; + return ERR_PTR(-ENOMEM); memcpy(&e->name[0], name, name_len); if (format) { e->format = &e->name[name_len]; memcpy(e->format, format, format_len); + if (strcmp(e->format, MARK_NOARGS) == 0) + e->call = marker_probe_cb_noarg; + else + e->call = marker_probe_cb; trace_mark(core_marker_format, "name %s format %s", e->name, e->format); - } else + } else { e->format = NULL; - e->probe = probe; - e->private = private; + e->call = marker_probe_cb; + } + e->single.func = __mark_empty_function; + e->single.probe_private = NULL; + e->multi = NULL; + e->ptype = 0; e->refcount = 0; + e->rcu_pending = 0; hlist_add_head(&e->hlist, head); - return 0; + return e; } /* * Remove the marker from the marker hash table. Must be called with mutex_lock * held. 
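A worked example of the removal transitions handled above, with illustrative probes fA/fB/fC and private data p0/p1 (not from the patch):

        /* entry->multi = { {fA,p0}, {fB,p1}, {fC,p1}, {NULL} }, N = 3
         *
         * marker_entry_remove_probe(entry, fB, p1):
         *     nr_del = 1, N -> M = 2, new multi = { {fA,p0}, {fC,p1}, {NULL} }
         *
         * marker_entry_remove_probe(entry, NULL, p1):
         *     a NULL probe matches any func with that private data, nr_del = 2,
         *     N -> M = 1: collapses to entry->single = {fA,p0}, ptype = 0
         */

In every case the old array is handed back to the caller for a deferred kfree() rather than freed in place, since a preempt-disabled reader may still be walking it.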
*/ -static void *remove_marker(const char *name) +static int remove_marker(const char *name) { struct hlist_head *head; struct hlist_node *node; struct marker_entry *e; int found = 0; size_t len = strlen(name) + 1; - void *private = NULL; u32 hash = jhash(name, len-1, 0); head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; @@ -167,12 +435,16 @@ static void *remove_marker(const char *n break; } } - if (found) { - private = e->private; - hlist_del(&e->hlist); - kfree(e); - } - return private; + if (!found) + return -ENOENT; + if (e->single.func != __mark_empty_function) + return -EBUSY; + hlist_del(&e->hlist); + /* Make sure the call_rcu has been executed */ + if (e->rcu_pending) + rcu_barrier(); + kfree(e); + return 0; } /* @@ -184,6 +456,7 @@ static int marker_set_format(struct mark size_t name_len = strlen((*entry)->name) + 1; size_t format_len = strlen(format) + 1; + e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, GFP_KERNEL); if (!e) @@ -191,11 +464,20 @@ static int marker_set_format(struct mark memcpy(&e->name[0], (*entry)->name, name_len); e->format = &e->name[name_len]; memcpy(e->format, format, format_len); - e->probe = (*entry)->probe; - e->private = (*entry)->private; + if (strcmp(e->format, MARK_NOARGS) == 0) + e->call = marker_probe_cb_noarg; + else + e->call = marker_probe_cb; + e->single = (*entry)->single; + e->multi = (*entry)->multi; + e->ptype = (*entry)->ptype; e->refcount = (*entry)->refcount; + e->rcu_pending = 0; hlist_add_before(&e->hlist, &(*entry)->hlist); hlist_del(&(*entry)->hlist); + /* Make sure the call_rcu has been executed */ + if ((*entry)->rcu_pending) + rcu_barrier(); kfree(*entry); *entry = e; trace_mark(core_marker_format, "name %s format %s", @@ -206,7 +488,8 @@ static int marker_set_format(struct mark /* * Sets the probe callback corresponding to one marker. */ -static int set_marker(struct marker_entry **entry, struct marker *elem) +static int set_marker(struct marker_entry **entry, struct marker *elem, + int active) { int ret; WARN_ON(strcmp((*entry)->name, elem->name) != 0); @@ -226,26 +509,64 @@ static int set_marker(struct marker_entr if (ret) return ret; } - elem->call = (*entry)->probe; - elem->private = (*entry)->private; - elem->state = 1; + + /* + * probe_cb setup (statically known) is done here. It is + * asynchronous with the rest of execution, therefore we only + * pass from a "safe" callback (with argument) to an "unsafe" + * callback (does not set arguments). + */ + elem->call = (*entry)->call; + /* + * Sanity check : + * We only update the single probe private data when the ptr is + * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) + */ + WARN_ON(elem->single.func != __mark_empty_function + && elem->single.probe_private + != (*entry)->single.probe_private && + !elem->ptype); + elem->single.probe_private = (*entry)->single.probe_private; + /* + * Make sure the private data is valid when we update the + * single probe ptr. + */ + smp_wmb(); + elem->single.func = (*entry)->single.func; + /* + * We also make sure that the new probe callbacks array is consistent + * before setting a pointer to it. + */ + rcu_assign_pointer(elem->multi, (*entry)->multi); + /* + * Update the function or multi probe array pointer before setting the + * ptype. + */ + smp_wmb(); + elem->ptype = (*entry)->ptype; + elem->state__imv = active; + return 0; } /* * Disable a marker and its probe callback. 
- * Note: only after a synchronize_sched() issued after setting elem->call to the - * empty function insures that the original callback is not used anymore. This - * insured by preemption disabling around the call site. + * Note: only waiting an RCU period after setting elem->call to the empty + * function insures that the original callback is not used anymore. This insured + * by preempt_disable around the call site. */ static void disable_marker(struct marker *elem) { - elem->state = 0; - elem->call = __mark_empty_function; + /* leave "call" as is. It is known statically. */ + elem->state__imv = 0; + elem->single.func = __mark_empty_function; + /* Update the function before setting the ptype */ + smp_wmb(); + elem->ptype = 0; /* single probe */ /* * Leave the private data and id there, because removal is racy and - * should be done only after a synchronize_sched(). These are never used - * until the next initialization anyway. + * should be done only after an RCU period. These are never used until + * the next initialization anyway. */ } @@ -253,14 +574,11 @@ static void disable_marker(struct marker * marker_update_probe_range - Update a probe range * @begin: beginning of the range * @end: end of the range - * @probe_module: module address of the probe being updated - * @refcount: number of references left to the given probe_module (out) * * Updates the probe callback corresponding to a range of markers. */ void marker_update_probe_range(struct marker *begin, - struct marker *end, struct module *probe_module, - int *refcount) + struct marker *end) { struct marker *iter; struct marker_entry *mark_entry; @@ -268,15 +586,12 @@ void marker_update_probe_range(struct ma mutex_lock(&markers_mutex); for (iter = begin; iter < end; iter++) { mark_entry = get_marker(iter->name); - if (mark_entry && mark_entry->refcount) { - set_marker(&mark_entry, iter); + if (mark_entry) { + set_marker(&mark_entry, iter, + !!mark_entry->refcount); /* * ignore error, continue */ - if (probe_module) - if (probe_module == - __module_text_address((unsigned long)mark_entry->probe)) - (*refcount)++; } else { disable_marker(iter); } @@ -286,23 +601,30 @@ void marker_update_probe_range(struct ma /* * Update probes, removing the faulty probes. - * Issues a synchronize_sched() when no reference to the module passed - * as parameter is found in the probes so the probe module can be - * safely unloaded from now on. + * + * Internal callback only changed before the first probe is connected to it. + * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 + * transitions. All other transitions will leave the old private data valid. + * This makes the non-atomicity of the callback/private data updates valid. + * + * "special case" updates : + * 0 -> 1 callback + * 1 -> 0 callback + * 1 -> 2 callbacks + * 2 -> 1 callbacks + * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates. + * Site effect : marker_set_format may delete the marker entry (creating a + * replacement). */ -static void marker_update_probes(struct module *probe_module) +static void marker_update_probes(void) { - int refcount = 0; - /* Core kernel markers */ - marker_update_probe_range(__start___markers, - __stop___markers, probe_module, &refcount); + marker_update_probe_range(__start___markers, __stop___markers); /* Markers in modules. 
*/ - module_update_markers(probe_module, &refcount); - if (probe_module && refcount == 0) { - synchronize_sched(); - deferred_sync = 0; - } + module_update_markers(); + /* Update immediate values */ + core_imv_update(); + module_imv_update(); } /** @@ -310,33 +632,52 @@ static void marker_update_probes(struct * @name: marker name * @format: format string * @probe: probe handler - * @private: probe private data + * @probe_private: probe private data * * private data must be a valid allocated memory address, or NULL. * Returns 0 if ok, error value on error. + * The probe address must at least be aligned on the architecture pointer size. */ int marker_probe_register(const char *name, const char *format, - marker_probe_func *probe, void *private) + marker_probe_func *probe, void *probe_private) { struct marker_entry *entry; int ret = 0; + struct marker_probe_closure *old; mutex_lock(&markers_mutex); entry = get_marker(name); - if (entry && entry->refcount) { - ret = -EBUSY; - goto end; - } - if (deferred_sync) { - synchronize_sched(); - deferred_sync = 0; + if (!entry) { + entry = add_marker(name, format); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + goto end; + } } - ret = add_marker(name, format, probe, private); - if (ret) + /* + * If we detect that a call_rcu is pending for this marker, + * make sure it's executed now. + */ + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_add_probe(entry, probe, probe_private); + if (IS_ERR(old)) { + ret = PTR_ERR(old); goto end; + } mutex_unlock(&markers_mutex); - marker_update_probes(NULL); - return ret; + marker_update_probes(); /* may update entry */ + mutex_lock(&markers_mutex); + entry = get_marker(name); + WARN_ON(!entry); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); +#ifdef CONFIG_PREEMPT_RCU + synchronize_sched(); /* Until we have the call_rcu_sched() */ +#endif + call_rcu(&entry->rcu, free_old_closure); end: mutex_unlock(&markers_mutex); return ret; @@ -346,171 +687,173 @@ EXPORT_SYMBOL_GPL(marker_probe_register) /** * marker_probe_unregister - Disconnect a probe from a marker * @name: marker name + * @probe: probe function pointer + * @probe_private: probe private data * * Returns the private data given to marker_probe_register, or an ERR_PTR(). + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. */ -void *marker_probe_unregister(const char *name) +int marker_probe_unregister(const char *name, + marker_probe_func *probe, void *probe_private) { - struct module *probe_module; struct marker_entry *entry; - void *private; + struct marker_probe_closure *old; + int ret = -ENOENT; mutex_lock(&markers_mutex); entry = get_marker(name); - if (!entry) { - private = ERR_PTR(-ENOENT); + if (!entry) goto end; - } - entry->refcount = 0; - /* In what module is the probe handler ? 
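Taken together, registration now looks like this from a probe provider's point of view. The marker name, format string, and my_probe/my_data below are illustrative; the callback signature is the four-argument form used by __mark_empty_function() above:

        static void my_probe(void *probe_private, void *call_private,
                             const char *fmt, va_list *args)
        {
                /* decode the marker's arguments from *args using fmt */
        }

        err = marker_probe_register("subsys_event", "value %d",
                                    my_probe, my_data);
        /* ... */
        err = marker_probe_unregister("subsys_event", my_probe, my_data);

Because multiple probes may now be attached to one marker, unregistration has to name both the function and its private data to identify which closure to drop.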
*/ - probe_module = __module_text_address((unsigned long)entry->probe); - private = remove_marker(name); - deferred_sync = 1; + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_remove_probe(entry, probe, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(probe_module); - return private; + marker_update_probes(); /* may update entry */ + mutex_lock(&markers_mutex); + entry = get_marker(name); + if (!entry) + goto end; + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); +#ifdef CONFIG_PREEMPT_RCU + synchronize_sched(); /* Until we have the call_rcu_sched() */ +#endif + call_rcu(&entry->rcu, free_old_closure); + remove_marker(name); /* Ignore busy error message */ + ret = 0; end: mutex_unlock(&markers_mutex); - return private; + return ret; } EXPORT_SYMBOL_GPL(marker_probe_unregister); -/** - * marker_probe_unregister_private_data - Disconnect a probe from a marker - * @private: probe private data - * - * Unregister a marker by providing the registered private data. - * Returns the private data given to marker_probe_register, or an ERR_PTR(). - */ -void *marker_probe_unregister_private_data(void *private) +static struct marker_entry * +get_marker_from_private_data(marker_probe_func *probe, void *probe_private) { - struct module *probe_module; - struct hlist_head *head; - struct hlist_node *node; struct marker_entry *entry; - int found = 0; unsigned int i; + struct hlist_head *head; + struct hlist_node *node; - mutex_lock(&markers_mutex); for (i = 0; i < MARKER_TABLE_SIZE; i++) { head = &marker_table[i]; hlist_for_each_entry(entry, node, head, hlist) { - if (entry->private == private) { - found = 1; - goto iter_end; + if (!entry->ptype) { + if (entry->single.func == probe + && entry->single.probe_private + == probe_private) + return entry; + } else { + struct marker_probe_closure *closure; + closure = entry->multi; + for (i = 0; closure[i].func; i++) { + if (closure[i].func == probe && + closure[i].probe_private + == probe_private) + return entry; + } } } } -iter_end: - if (!found) { - private = ERR_PTR(-ENOENT); - goto end; - } - entry->refcount = 0; - /* In what module is the probe handler ? */ - probe_module = __module_text_address((unsigned long)entry->probe); - private = remove_marker(entry->name); - deferred_sync = 1; - mutex_unlock(&markers_mutex); - marker_update_probes(probe_module); - return private; -end: - mutex_unlock(&markers_mutex); - return private; + return NULL; } -EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); /** - * marker_arm - Arm a marker - * @name: marker name + * marker_probe_unregister_private_data - Disconnect a probe from a marker + * @probe: probe function + * @probe_private: probe private data * - * Activate a marker. It keeps a reference count of the number of - * arming/disarming done. - * Returns 0 if ok, error value on error. + * Unregister a probe by providing the registered private data. + * Only removes the first marker found in hash table. + * Return 0 on success or error value. + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. 
*/ -int marker_arm(const char *name) +int marker_probe_unregister_private_data(marker_probe_func *probe, + void *probe_private) { struct marker_entry *entry; int ret = 0; + struct marker_probe_closure *old; mutex_lock(&markers_mutex); - entry = get_marker(name); + entry = get_marker_from_private_data(probe, probe_private); if (!entry) { ret = -ENOENT; goto end; } - /* - * Only need to update probes when refcount passes from 0 to 1. - */ - if (entry->refcount++) - goto end; -end: + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_remove_probe(entry, NULL, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(NULL); - return ret; -} -EXPORT_SYMBOL_GPL(marker_arm); - -/** - * marker_disarm - Disarm a marker - * @name: marker name - * - * Disarm a marker. It keeps a reference count of the number of arming/disarming - * done. - * Returns 0 if ok, error value on error. - */ -int marker_disarm(const char *name) -{ - struct marker_entry *entry; - int ret = 0; - + marker_update_probes(); /* may update entry */ mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) { - ret = -ENOENT; - goto end; - } - /* - * Only permit decrement refcount if higher than 0. - * Do probe update only on 1 -> 0 transition. - */ - if (entry->refcount) { - if (--entry->refcount) - goto end; - } else { - ret = -EPERM; - goto end; - } + entry = get_marker_from_private_data(probe, probe_private); + WARN_ON(!entry); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); +#ifdef CONFIG_PREEMPT_RCU + synchronize_sched(); /* Until we have the call_rcu_sched() */ +#endif + call_rcu(&entry->rcu, free_old_closure); + remove_marker(entry->name); /* Ignore busy error message */ end: mutex_unlock(&markers_mutex); - marker_update_probes(NULL); return ret; } -EXPORT_SYMBOL_GPL(marker_disarm); +EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); /** * marker_get_private_data - Get a marker's probe private data * @name: marker name + * @probe: probe to match + * @num: get the nth matching probe's private data * + * Returns the nth private data pointer (starting from 0) matching, or an + * ERR_PTR. * Returns the private data pointer, or an ERR_PTR. * The private data pointer should _only_ be dereferenced if the caller is the * owner of the data, or its content could vanish. This is mostly used to * confirm that a caller is the owner of a registered probe. 
*/ -void *marker_get_private_data(const char *name) +void *marker_get_private_data(const char *name, marker_probe_func *probe, + int num) { struct hlist_head *head; struct hlist_node *node; struct marker_entry *e; size_t name_len = strlen(name) + 1; u32 hash = jhash(name, name_len-1, 0); - int found = 0; + int i; head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; hlist_for_each_entry(e, node, head, hlist) { if (!strcmp(name, e->name)) { - found = 1; - return e->private; + if (!e->ptype) { + if (num == 0 && e->single.func == probe) + return e->single.probe_private; + else + break; + } else { + struct marker_probe_closure *closure; + int match = 0; + closure = e->multi; + for (i = 0; closure[i].func; i++) { + if (closure[i].func != probe) + continue; + if (match++ == num) + return closure[i].probe_private; + } + } } } return ERR_PTR(-ENOENT); Index: linux-2.6.24.7-rt27/kernel/module.c =================================================================== --- linux-2.6.24.7-rt27.orig/kernel/module.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/kernel/module.c 2009-02-08 00:05:15.000000000 -0500 @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -46,6 +47,9 @@ #include #include #include +#include +#include +#include extern int module_sysfs_initialized; @@ -1675,8 +1679,11 @@ static struct module *load_module(void _ unsigned int unusedcrcindex; unsigned int unusedgplindex; unsigned int unusedgplcrcindex; + unsigned int immediateindex; + unsigned int immediatecondendindex; unsigned int markersindex; unsigned int markersstringsindex; + unsigned int mcountindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -1773,6 +1780,9 @@ static struct module *load_module(void _ #ifdef ARCH_UNWIND_SECTION_NAME unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); #endif + immediateindex = find_sec(hdr, sechdrs, secstrings, "__imv"); + immediatecondendindex = find_sec(hdr, sechdrs, secstrings, + "__imv_cond_end"); /* Don't keep modinfo section */ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; @@ -1924,6 +1934,16 @@ static struct module *load_module(void _ mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; if (gplfuturecrcindex) mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; +#ifdef CONFIG_IMMEDIATE + mod->immediate = (void *)sechdrs[immediateindex].sh_addr; + mod->num_immediate = + sechdrs[immediateindex].sh_size / sizeof(*mod->immediate); + mod->immediate_cond_end = + (void *)sechdrs[immediatecondendindex].sh_addr; + mod->num_immediate_cond_end = + sechdrs[immediatecondendindex].sh_size + / sizeof(*mod->immediate_cond_end); +#endif mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; if (unusedcrcindex) @@ -1947,6 +1967,9 @@ static struct module *load_module(void _ markersstringsindex = find_sec(hdr, sechdrs, secstrings, "__markers_strings"); + mcountindex = find_sec(hdr, sechdrs, secstrings, + "__mcount_loc"); + /* Now do relocations. 
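Since several closures can now match the same probe function, callers enumerate them by index. A sketch of walking all matches (loop and names illustrative):

        for (i = 0; ; i++) {
                void *p = marker_get_private_data("subsys_event", my_probe, i);
                if (IS_ERR(p))
                        break;          /* -ENOENT past the last match */
                /* p is the i-th probe_private registered for my_probe */
        }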
*/ for (i = 1; i < hdr->e_shnum; i++) { const char *strtab = (char *)sechdrs[strindex].sh_addr; @@ -1991,11 +2014,23 @@ static struct module *load_module(void _ add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); + if (!(mod->taints & TAINT_FORCED_MODULE)) { #ifdef CONFIG_MARKERS - if (!mod->taints) marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers, NULL, NULL); + mod->markers + mod->num_markers); #endif +#ifdef CONFIG_IMMEDIATE + /* Immediate values must be updated after markers */ + imv_update_range(mod->immediate, + mod->immediate + mod->num_immediate); +#endif + } + + if (mcountindex) { + void *mseg = (void *)sechdrs[mcountindex].sh_addr; + ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size); + } + err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; @@ -2142,6 +2177,10 @@ sys_init_module(void __user *umod, /* Drop initial reference. */ module_put(mod); unwind_remove_table(mod->unwind_info, 1); +#ifdef CONFIG_IMMEDIATE + imv_unref(mod->immediate, mod->immediate + mod->num_immediate, + mod->module_init, mod->init_size); +#endif module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; @@ -2596,7 +2635,7 @@ EXPORT_SYMBOL(struct_module); #endif #ifdef CONFIG_MARKERS -void module_update_markers(struct module *probe_module, int *refcount) +void module_update_markers(void) { struct module *mod; @@ -2604,8 +2643,61 @@ void module_update_markers(struct module list_for_each_entry(mod, &modules, list) if (!mod->taints) marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers, - probe_module, refcount); + mod->markers + mod->num_markers); mutex_unlock(&module_mutex); } #endif + +#ifdef CONFIG_IMMEDIATE +/** + * _module_imv_update - update all immediate values in the kernel + * + * Iterate on the kernel core and modules to update the immediate values. + * Module_mutex must be held be the caller. + */ +void _module_imv_update(void) +{ + struct module *mod; + + list_for_each_entry(mod, &modules, list) { + if (mod->taints) + continue; + imv_update_range(mod->immediate, + mod->immediate + mod->num_immediate); + } +} +EXPORT_SYMBOL_GPL(_module_imv_update); + +/** + * module_imv_update - update all immediate values in the kernel + * + * Iterate on the kernel core and modules to update the immediate values. + * Takes module_mutex. + */ +void module_imv_update(void) +{ + mutex_lock(&module_mutex); + _module_imv_update(); + mutex_unlock(&module_mutex); +} +EXPORT_SYMBOL_GPL(module_imv_update); + +/** + * is_imv_cond_end_module + * + * Check if the two given addresses are located in the immediate value condition + * end table. Addresses should be in the same object. + * The module mutex should be held. + */ +int is_imv_cond_end_module(unsigned long addr1, unsigned long addr2) +{ + struct module *mod = __module_text_address(addr1); + + if (!mod) + return 0; + + return _is_imv_cond_end(mod->immediate_cond_end, + mod->immediate_cond_end + mod->num_immediate_cond_end, + addr1, addr2); +} +#endif Index: linux-2.6.24.7-rt27/Makefile =================================================================== --- linux-2.6.24.7-rt27.orig/Makefile 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/Makefile 2009-02-08 00:05:26.000000000 -0500 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 24 -EXTRAVERSION = .7 +EXTRAVERSION = .7-rt27 NAME = Err Metey! A Heury Beelge-a Ret! 
# *DOCUMENTATION* @@ -520,6 +520,10 @@ KBUILD_CFLAGS += -g KBUILD_AFLAGS += -gdwarf-2 endif +ifdef CONFIG_FTRACE +KBUILD_CFLAGS += -pg +endif + # Force gcc to behave correct even for buggy distributions KBUILD_CFLAGS += $(call cc-option, -fno-stack-protector) Index: linux-2.6.24.7-rt27/arch/x86/Kconfig =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/Kconfig 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/Kconfig 2009-02-08 00:05:15.000000000 -0500 @@ -19,6 +19,9 @@ config X86_64 config X86 bool default y + select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_DYNAMIC_FTRACE + select HAVE_FTRACE config GENERIC_TIME bool @@ -94,10 +97,19 @@ config DMI default y config RWSEM_GENERIC_SPINLOCK - def_bool !X86_XADD + bool + depends on !X86_XADD || PREEMPT_RT + default y + +config ASM_SEMAPHORES + bool + default y + config RWSEM_XCHGADD_ALGORITHM - def_bool X86_XADD + bool + depends on X86_XADD && !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT + default y config ARCH_HAS_ILOG2_U32 def_bool n @@ -1219,6 +1231,10 @@ config OUT_OF_LINE_PFN_TO_PAGE def_bool X86_64 depends on DISCONTIGMEM +config HARDIRQS_SW_RESEND + bool + default y + menu "Power management options" depends on !X86_VOYAGER Index: linux-2.6.24.7-rt27/arch/x86/kernel/Makefile_32 =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/Makefile_32 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/Makefile_32 2009-02-08 00:01:09.000000000 -0500 @@ -10,6 +10,14 @@ obj-y := process_32.o signal_32.o entry_ pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\ quirks.o i8237.o topology.o alternative.o i8253.o tsc_32.o +ifdef CONFIG_FTRACE +# Do not profile debug utilities +CFLAGS_REMOVE_tsc_32.o = -pg +ifdef CONFIG_DYNAMIC_FTRACE +CFLAGS_REMOVE_ftrace.o = -pg +endif +endif + obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ @@ -28,6 +36,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse_32. 
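With CONFIG_FTRACE the whole tree is compiled with -pg, so gcc plants a call to mcount() at every function entry; the CFLAGS_REMOVE lines then strip -pg from the tracer's own foundations (the TSC clock that timestamps events, and ftrace.o itself) so the profiling hook cannot recurse into the code that services it. Conceptually (some_function is illustrative):

        /* what -pg does to each instrumented function, conceptually: */
        void some_function(void)
        {
                mcount();       /* inserted by gcc; with DYNAMIC_FTRACE this
                                 * call site is patched to a NOP when idle */
                /* ... original body ... */
        }

Individual functions can also opt out with the notrace attribute, which this patch applies to native_sched_clock() and friends further down.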
obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_32.o relocate_kernel_32.o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_32.o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o Index: linux-2.6.24.7-rt27/arch/x86/kernel/Makefile_64 =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/Makefile_64 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/Makefile_64 2009-02-08 00:01:09.000000000 -0500 @@ -13,6 +13,14 @@ obj-y := process_64.o signal_64.o entry_ pci-dma_64.o pci-nommu_64.o alternative.o hpet.o tsc_64.o bugs_64.o \ i8253.o +ifdef CONFIG_FTRACE +# Do not profile debug utilities +CFLAGS_REMOVE_tsc_64.o = -pg +ifdef CONFIG_DYNAMIC_FTRACE +CFLAGS_REMOVE_ftrace.o = -pg +endif +endif + obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ @@ -22,6 +30,7 @@ obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_SMP) += smp_64.o smpboot_64.o trampoline_64.o tsc_sync.o obj-y += apic_64.o nmi_64.o obj-y += io_apic_64.o mpparse_64.o genapic_64.o genapic_flat_64.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_64.o relocate_kernel_64.o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_64.o obj-$(CONFIG_PM) += suspend_64.o Index: linux-2.6.24.7-rt27/arch/x86/kernel/alternative.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/alternative.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/alternative.c 2009-02-08 00:01:09.000000000 -0500 @@ -65,7 +65,8 @@ __setup("noreplace-paravirt", setup_nore get them easily into strings. */ asm("\t.section .rodata, \"a\"\nintelnops: " GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 - GENERIC_NOP7 GENERIC_NOP8); + GENERIC_NOP7 GENERIC_NOP8 + "\t.previous"); extern const unsigned char intelnops[]; static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { NULL, @@ -83,7 +84,8 @@ static const unsigned char *const intel_ #ifdef K8_NOP1 asm("\t.section .rodata, \"a\"\nk8nops: " K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 - K8_NOP7 K8_NOP8); + K8_NOP7 K8_NOP8 + "\t.previous"); extern const unsigned char k8nops[]; static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { NULL, @@ -101,7 +103,8 @@ static const unsigned char *const k8_nop #ifdef K7_NOP1 asm("\t.section .rodata, \"a\"\nk7nops: " K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 - K7_NOP7 K7_NOP8); + K7_NOP7 K7_NOP8 + "\t.previous"); extern const unsigned char k7nops[]; static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { NULL, @@ -119,7 +122,8 @@ static const unsigned char *const k7_nop #ifdef P6_NOP1 asm("\t.section .rodata, \"a\"\np6nops: " P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 - P6_NOP7 P6_NOP8); + P6_NOP7 P6_NOP8 + "\t.previous"); extern const unsigned char p6nops[]; static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { NULL, @@ -137,7 +141,7 @@ static const unsigned char *const p6_nop #ifdef CONFIG_X86_64 extern char __vsyscall_0; -static inline const unsigned char*const * find_nop_table(void) +const unsigned char *const *find_nop_table(void) { return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || boot_cpu_data.x86 < 6 ? 
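The "\t.previous" additions in alternative.c below are a correctness fix: each asm() switches the assembler into .rodata to emit a NOP table, and without restoring the previous section everything the compiler emits next in that object would silently land in .rodata as well. The idiom in miniature (mynops is illustrative):

        /* sketch of the section push/pop idiom: */
        asm("\t.section .rodata, \"a\"\n"   /* switch to read-only data */
            "mynops: .byte 0x90\n"          /* emit the table bytes     */
            "\t.previous");                 /* restore the old section  */

Un-static-ing find_nop_table() and add_nops() additionally makes the per-vendor NOP tables available to other text-patching code.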
k8_nops : p6_nops; @@ -156,7 +160,7 @@ static const struct nop { { -1, NULL } }; -static const unsigned char*const * find_nop_table(void) +const unsigned char *const *find_nop_table(void) { const unsigned char *const *noptable = intel_nops; int i; @@ -173,7 +177,7 @@ static const unsigned char*const * find_ #endif /* CONFIG_X86_64 */ /* Use this to add nops to a buffer, then text_poke the whole buffer. */ -static void add_nops(void *insns, unsigned int len) +void add_nops(void *insns, unsigned int len) { const unsigned char *const *noptable = find_nop_table(); @@ -186,6 +190,7 @@ static void add_nops(void *insns, unsign len -= noplen; } } +EXPORT_SYMBOL_GPL(add_nops); extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern u8 *__smp_locks[], *__smp_locks_end[]; Index: linux-2.6.24.7-rt27/arch/x86/kernel/entry_32.S =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/entry_32.S 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/entry_32.S 2009-02-08 00:05:15.000000000 -0500 @@ -265,14 +265,18 @@ END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) + cmpl $0, kernel_preemption + jz restore_nocheck cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl - jz restore_all + jz restore_nocheck testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all + jz restore_nocheck + DISABLE_INTERRUPTS(CLBR_ANY) + call preempt_schedule_irq jmp need_resched END(resume_kernel) @@ -330,6 +334,11 @@ sysenter_past_esp: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_EVENT_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -345,6 +354,11 @@ sysenter_past_esp: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work +#ifdef CONFIG_EVENT_TRACE + pushl %eax + call sys_ret + popl %eax +#endif /* if something modifies registers it must also disable sysexit */ movl PT_EIP(%esp), %edx movl PT_OLDESP(%esp), %ecx @@ -368,6 +382,11 @@ ENTRY(system_call) pushl %eax # save orig_eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_EVENT_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -466,20 +485,19 @@ ENDPROC(system_call) ALIGN RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jz work_notifysig work_resched: - call schedule LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + call __schedule # setting need_resched or sigpending # between sampling and the iret - TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? 
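resume_kernel above now honors a run-time kernel_preemption switch and, instead of falling back into the generic return path, calls preempt_schedule_irq(), the variant meant to be entered with interrupts disabled from IRQ-return paths. The C shape of the new assembly, approximately (irqs_were_enabled() stands in for the IF_MASK test on the saved EFLAGS and is not a real helper):

        /* sketch of the new resume_kernel logic: */
        if (!kernel_preemption || preempt_count())
                goto restore_nocheck;           /* preemption off or nested */
        while (need_resched() && irqs_were_enabled(regs)) {
                local_irq_disable();
                preempt_schedule_irq();         /* irqs-off entry point */
        }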
jz restore_all - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jnz work_resched work_notifysig: # deal with pending signals and @@ -1110,6 +1128,61 @@ ENDPROC(xen_failsafe_callback) #endif /* CONFIG_XEN */ +#ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + ret +END(mcount) + +ENTRY(ftrace_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(mcount) + cmpl $ftrace_stub, ftrace_trace_function + jnz trace +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ + .section .rodata,"a" #include "syscall_table_32.S" Index: linux-2.6.24.7-rt27/arch/x86/kernel/ftrace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/arch/x86/kernel/ftrace.c 2009-02-08 00:05:15.000000000 -0500 @@ -0,0 +1,135 @@ +/* + * Code for replacing ftrace calls with jumps. + * + * Copyright (C) 2007-2008 Steven Rostedt + * + * Thanks goes to Ingo Molnar, for suggesting the idea. + * Mathieu Desnoyers, for suggesting postponing the modifications. + * Arjan van de Ven, for keeping me straight, and explaining to me + * the dangers of modifying code on the run. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define CALL_BACK 5 + +/* Long is fine, even if it is only 4 bytes ;-) */ +static long *ftrace_nop; + +union ftrace_code_union { + char code[5]; + struct { + char e8; + int offset; + } __attribute__((packed)); +}; + +static int notrace ftrace_calc_offset(long ip, long addr) +{ + return (int)(addr - ip); +} + +notrace unsigned char *ftrace_nop_replace(void) +{ + return (char *)ftrace_nop; +} + +notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + static union ftrace_code_union calc; + + calc.e8 = 0xe8; + calc.offset = ftrace_calc_offset(ip + CALL_BACK, addr); + + /* + * No locking needed, this must be called via kstop_machine + * which in essence is like running on a uniprocessor machine. + */ + return calc.code; +} + +notrace int +ftrace_modify_code(unsigned long ip, unsigned char *old_code, + unsigned char *new_code) +{ + unsigned replaced; + unsigned old = *(unsigned *)old_code; /* 4 bytes */ + unsigned new = *(unsigned *)new_code; /* 4 bytes */ + unsigned char newch = new_code[4]; + int faulted = 0; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. + * + * No real locking needed, this code is run through + * kstop_machine. 
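ftrace_call_replace() below assembles the 5-byte x86 near call: opcode 0xe8 followed by a 32-bit displacement relative to the next instruction, which is why the offset is computed against ip + CALL_BACK (ip + 5). A worked example with made-up addresses:

        /* call site ip = 0xc0100000, target addr = 0xc0200000:
         *
         *   offset = addr - (ip + 5) = 0x000ffffb
         *   bytes  = e8 fb ff 0f 00          (little-endian offset)
         *
         * executing it pushes 0xc0100005 and jumps to 0xc0200000 */

The static calc buffer is safe to share because, as the comment notes, all of this runs under kstop_machine(), effectively single-threaded.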
+ */ + asm volatile ( + "1: lock\n" + " cmpxchg %3, (%2)\n" + " jnz 2f\n" + " movb %b4, 4(%2)\n" + "2:\n" + ".section .fixup, \"ax\"\n" + "3: movl $1, %0\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : "=r"(faulted), "=a"(replaced) + : "r"(ip), "r"(new), "r"(newch), + "0"(faulted), "a"(old) + : "memory"); + sync_core(); + + if (replaced != old && replaced != new) + faulted = 2; + + return faulted; +} + +notrace int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + unsigned char old[5], *new; + int ret; + + memcpy(old, &ftrace_call, 5); + new = ftrace_call_replace(ip, (unsigned long)func); + ret = ftrace_modify_code(ip, old, new); + + return ret; +} + +notrace int ftrace_mcount_set(unsigned long *data) +{ + *data = 0; + return 0; +} + +asm("\t.section .rodata, \"a\"\nftrace_nop5: " + P6_NOP5 + "\t.previous"); +extern const unsigned char ftrace_nop5[]; + +int __init ftrace_dyn_arch_init(void *data) +{ + /* This is running in kstop_machine */ + + ftrace_mcount_set(data); + + ftrace_nop = (unsigned long *)ftrace_nop5; + + return 0; +} + Index: linux-2.6.24.7-rt27/arch/x86/kernel/i386_ksyms_32.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/i386_ksyms_32.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/i386_ksyms_32.c 2009-02-08 00:02:04.000000000 -0500 @@ -1,13 +1,21 @@ +#include #include #include #include #include #include -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_FTRACE +/* mcount is defined in assembly */ +EXPORT_SYMBOL(mcount); +#endif + +#ifdef CONFIG_ASM_SEMAPHORES +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); @@ -22,7 +30,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(strstr); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_ASM_SEMAPHORES) extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); EXPORT_SYMBOL(__write_lock_failed); Index: linux-2.6.24.7-rt27/arch/x86/kernel/nmi_32.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/nmi_32.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/nmi_32.c 2009-02-08 00:05:02.000000000 -0500 @@ -25,6 +25,7 @@ #include #include +#include #include "mach_traps.h" @@ -42,7 +43,7 @@ static cpumask_t backtrace_mask = CPU_MA atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ unsigned int nmi_watchdog = NMI_DEFAULT; -static unsigned int nmi_hz = HZ; +static unsigned int nmi_hz = 1000; static DEFINE_PER_CPU(short, wd_enabled); @@ -58,7 +59,12 @@ static int endflag __initdata = 0; static __init void nmi_cpu_busy(void *data) { #ifdef CONFIG_SMP + /* + * avoid a warning, on PREEMPT_RT this wont run in hardirq context: + */ +#ifndef CONFIG_PREEMPT_RT local_irq_enable_in_hardirq(); +#endif /* Intentionally don't use cpu_relax here. 
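Since x86 cannot store 5 bytes atomically, the asm above splits the patch: a locked 4-byte cmpxchg installs the first four bytes only if the site still contains the expected old code, then a plain movb writes the fifth; a fault at either step lands in the .fixup entry and yields faulted = 1, while a site matching neither old nor new code yields 2. A briefly half-patched instruction is tolerable only because every caller runs inside kstop_machine(), so no other CPU is executing. Redirecting the runtime tracer then reduces to (sketch, variables as in ftrace_update_ftrace_func() above):

        /* sketch: swap the tracer that ftrace_caller invokes */
        memcpy(old, &ftrace_call, 5);                 /* current bytes */
        new = ftrace_call_replace(ip, (unsigned long)func);
        ret = ftrace_modify_code(ip, old, new);       /* 0 on success  */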
This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the @@ -93,7 +99,7 @@ static int __init check_nmi_watchdog(voi for_each_possible_cpu(cpu) prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; local_irq_enable(); - mdelay((20*1000)/nmi_hz); // wait 20 ticks + mdelay((100*1000)/nmi_hz); /* wait 100 ticks */ for_each_possible_cpu(cpu) { #ifdef CONFIG_SMP @@ -318,7 +324,60 @@ EXPORT_SYMBOL(touch_nmi_watchdog); extern void die_nmi(struct pt_regs *, const char *msg); -__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +int nmi_show_regs[NR_CPUS]; + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + +notrace int irq_show_regs_callback(int cpu, struct pt_regs *regs) +{ + if (!nmi_show_regs[cpu]) + return 0; + + spin_lock(&nmi_print_lock); + printk(KERN_WARNING "NMI show regs on CPU#%d:\n", cpu); + printk(KERN_WARNING "apic_timer_irqs: %d\n", + per_cpu(irq_stat, cpu).apic_timer_irqs); + show_regs(regs); + spin_unlock(&nmi_print_lock); + nmi_show_regs[cpu] = 0; + return 1; +} + +void nmi_show_all_regs(void) +{ + struct pt_regs *regs; + int i, cpu; + + if (system_state == SYSTEM_BOOTING) + return; + + preempt_disable(); + + regs = get_irq_regs(); + cpu = smp_processor_id(); + + printk(KERN_WARNING "nmi_show_all_regs(): start on CPU#%d.\n", cpu); + dump_stack(); + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + + if (regs) + irq_show_regs_callback(cpu, regs); + else + nmi_show_regs[cpu] = 0; + + smp_send_nmi_allbutself(); + preempt_enable(); + + for_each_online_cpu(i) { + while (nmi_show_regs[i] == 1) + barrier(); + } +} + +notrace __kprobes int +nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { /* @@ -329,7 +388,10 @@ __kprobes int nmi_watchdog_tick(struct p unsigned int sum; int touched = 0; int cpu = smp_processor_id(); - int rc=0; + int rc; + + rc = irq_show_regs_callback(cpu, regs); + __profile_tick(CPU_PROFILING, regs); /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) @@ -339,11 +401,11 @@ __kprobes int nmi_watchdog_tick(struct p } if (cpu_isset(cpu, backtrace_mask)) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */ spin_lock(&lock); printk("NMI backtrace for cpu %d\n", cpu); - dump_stack(); + show_regs(regs); spin_unlock(&lock); cpu_clear(cpu, backtrace_mask); } @@ -355,6 +417,7 @@ __kprobes int nmi_watchdog_tick(struct p sum = per_cpu(irq_stat, cpu).apic_timer_irqs + per_cpu(irq_stat, cpu).irq0_irqs; + /* if the apic timer isn't firing, this cpu isn't doing much */ /* if the none of the timers isn't firing, this cpu isn't doing much */ if (!touched && last_irq_sums[cpu] == sum) { /* @@ -362,11 +425,36 @@ __kprobes int nmi_watchdog_tick(struct p * wait a few IRQs (5 seconds) before doing the oops ... */ alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. 
- */ - die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP"); + if (alert_counter[cpu] && !(alert_counter[cpu] % (5*nmi_hz))) { + int i; + + spin_lock(&nmi_print_lock); + printk(KERN_WARNING "NMI watchdog detected lockup on " + "CPU#%d (%d/%d)\n", cpu, alert_counter[cpu], + 5*nmi_hz); + show_regs(regs); + spin_unlock(&nmi_print_lock); + + for_each_online_cpu(i) { + if (i == cpu) + continue; + nmi_show_regs[i] = 1; + } + + smp_send_nmi_allbutself(); + + for_each_online_cpu(i) { + if (i == cpu) + continue; + while (nmi_show_regs[i] == 1) + cpu_relax(); + } + printk(KERN_WARNING "NMI watchdog running again ...\n"); + for_each_online_cpu(i) + alert_counter[i] = 0; + + } + } else { last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; @@ -464,5 +552,17 @@ void __trigger_all_cpu_backtrace(void) } } +void smp_send_nmi_allbutself(void) +{ +#ifdef CONFIG_SMP + cpumask_t mask = cpu_online_map; + preempt_disable(); + cpu_clear(safe_smp_processor_id(), mask); + if (!cpus_empty(mask)) + send_IPI_mask(mask, NMI_VECTOR); + preempt_enable(); +#endif +} + EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); Index: linux-2.6.24.7-rt27/arch/x86/kernel/nmi_64.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/nmi_64.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/nmi_64.c 2009-02-08 00:05:02.000000000 -0500 @@ -20,11 +20,13 @@ #include #include #include +#include #include #include #include #include +#include int unknown_nmi_panic; int nmi_watchdog_enabled; @@ -42,7 +44,7 @@ atomic_t nmi_active = ATOMIC_INIT(0); / int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; -static unsigned int nmi_hz = HZ; +static unsigned int nmi_hz = 1000; static DEFINE_PER_CPU(short, wd_enabled); @@ -50,7 +52,7 @@ static DEFINE_PER_CPU(short, wd_enabled) static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); /* Run after command line and cpu_init init, but before all other checks */ -void nmi_watchdog_default(void) +static inline void nmi_watchdog_default(void) { if (nmi_watchdog != NMI_DEFAULT) return; @@ -66,7 +68,9 @@ static int endflag __initdata = 0; */ static __init void nmi_cpu_busy(void *data) { +#ifndef CONFIG_PREEMPT_RT local_irq_enable_in_hardirq(); +#endif /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the @@ -301,7 +305,7 @@ void touch_nmi_watchdog(void) unsigned cpu; /* - * Tell other CPUs to reset their alert counters. We cannot + * Tell other CPUs to reset their alert counters. We cannot * do it ourselves because the alert count increase is not * atomic. 
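The per-CPU nmi_show_regs[] array implements a small handshake: the CPU that detects the lockup marks every other CPU, sends an NMI IPI, and spins until each target's NMI handler has dumped its registers and acknowledged by clearing its own slot in irq_show_regs_callback(). In outline:

        /* detecting CPU (sketch): */
        for_each_online_cpu(i)
                if (i != cpu)
                        nmi_show_regs[i] = 1;   /* request a dump        */
        smp_send_nmi_allbutself();              /* kick them with an NMI */
        for_each_online_cpu(i) {
                if (i == cpu)
                        continue;
                while (nmi_show_regs[i] == 1)
                        cpu_relax();            /* wait for the ack      */
        }

        /* each target, at the top of its NMI handler (sketch): */
        if (nmi_show_regs[cpu]) {
                show_regs(regs);
                nmi_show_regs[cpu] = 0;         /* acknowledge           */
        }

Using NMIs is what makes this work against CPUs spinning with interrupts disabled, which is exactly the case a watchdog cares about.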
*/ @@ -314,12 +318,67 @@ void touch_nmi_watchdog(void) touch_softlockup_watchdog(); } -int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +int nmi_show_regs[NR_CPUS]; + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + +notrace int irq_show_regs_callback(int cpu, struct pt_regs *regs) +{ + if (!nmi_show_regs[cpu]) + return 0; + + spin_lock(&nmi_print_lock); + printk(KERN_WARNING "NMI show regs on CPU#%d:\n", cpu); + printk(KERN_WARNING "apic_timer_irqs: %d\n", read_pda(apic_timer_irqs)); + show_regs(regs); + spin_unlock(&nmi_print_lock); + nmi_show_regs[cpu] = 0; + return 1; +} + +void nmi_show_all_regs(void) +{ + struct pt_regs *regs; + int i, cpu; + + if (system_state == SYSTEM_BOOTING) + return; + + preempt_disable(); + + regs = get_irq_regs(); + cpu = smp_processor_id(); + + printk(KERN_WARNING "nmi_show_all_regs(): start on CPU#%d.\n", cpu); + dump_stack(); + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + + if (regs) + irq_show_regs_callback(cpu, regs); + else + nmi_show_regs[cpu] = 0; + + smp_send_nmi_allbutself(); + preempt_enable(); + + for_each_online_cpu(i) { + while (nmi_show_regs[i] == 1) + barrier(); + } +} + +notrace int __kprobes +nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; int cpu = smp_processor_id(); - int rc = 0; + int rc; + + rc = irq_show_regs_callback(cpu, regs); + __profile_tick(CPU_PROFILING, regs); /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) @@ -328,14 +387,13 @@ int __kprobes nmi_watchdog_tick(struct p touched = 1; } - sum = read_pda(apic_timer_irqs) + read_pda(irq0_irqs); if (__get_cpu_var(nmi_touch)) { __get_cpu_var(nmi_touch) = 0; touched = 1; } if (cpu_isset(cpu, backtrace_mask)) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */ spin_lock(&lock); printk("NMI backtrace for cpu %d\n", cpu); @@ -344,6 +402,12 @@ int __kprobes nmi_watchdog_tick(struct p cpu_clear(cpu, backtrace_mask); } + /* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ + sum = read_pda(apic_timer_irqs) + kstat_cpu(cpu).irqs[0]; + #ifdef CONFIG_X86_MCE /* Could check oops_in_progress here too, but it's safer not too */ @@ -357,9 +421,26 @@ int __kprobes nmi_watchdog_tick(struct p * wait a few IRQs (5 seconds) before doing the oops ... 
*/ local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) + if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { + int i; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + nmi_show_regs[i] = 1; + } + + smp_send_nmi_allbutself(); + + for_each_online_cpu(i) { + if (i == cpu) + continue; + while (nmi_show_regs[i] == 1) + cpu_relax(); + } die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, panic_on_timeout); + } } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); @@ -477,6 +558,15 @@ void __trigger_all_cpu_backtrace(void) } } +void smp_send_nmi_allbutself(void) +{ +#ifdef CONFIG_SMP + preempt_disable(); + send_IPI_allbutself(NMI_VECTOR); + preempt_enable(); +#endif +} + EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); EXPORT_SYMBOL(touch_nmi_watchdog); Index: linux-2.6.24.7-rt27/arch/x86/kernel/process_32.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/process_32.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/process_32.c 2009-02-08 00:04:57.000000000 -0500 @@ -113,7 +113,7 @@ void default_idle(void) smp_mb(); local_irq_disable(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) safe_halt(); /* enables interrupts racelessly */ else local_irq_enable(); @@ -134,7 +134,9 @@ EXPORT_SYMBOL(default_idle); */ static void poll_idle (void) { - cpu_relax(); + do { + cpu_relax(); + } while (!need_resched() && !need_resched_delayed()); } #ifdef CONFIG_HOTPLUG_CPU @@ -177,14 +179,13 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(); - while (!need_resched()) { + tick_nohz_stop_sched_tick(1); + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; - check_pgt_cache(); rmb(); idle = pm_idle; @@ -195,12 +196,17 @@ void cpu_idle(void) play_dead(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; + /* Don't trace irqs off for idle */ + stop_critical_timings(); idle(); + start_critical_timings(); } + local_irq_disable(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } @@ -257,10 +263,10 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) { - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) __mwait(eax, ecx); } } @@ -339,9 +345,10 @@ void __show_registers(struct pt_regs *re regs->eax, regs->ebx, regs->ecx, regs->edx); printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", regs->esi, regs->edi, regs->ebp, esp); - printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", + printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x" + " preempt:%08x\n", regs->xds & 0xffff, regs->xes & 0xffff, - regs->xfs & 0xffff, gs, ss); + regs->xfs & 0xffff, gs, ss, preempt_count()); if (!all) return; @@ -413,15 +420,23 @@ void exit_thread(void) if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { struct task_struct *tsk = current; struct thread_struct *t = &tsk->thread; - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + void *io_bitmap_ptr = t->io_bitmap_ptr; + int cpu; + struct tss_struct *tss; - 
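Every idle-exit condition in the process_32.c/process_64.c hunks grows a need_resched_delayed() test because PREEMPT_RT can mark a wakeup as delayed, to be acted on at the end of the current atomic region; without the extra check a CPU could HLT with such a reschedule pending and sit idle until the next interrupt. The recurring pattern:

        /* sketch: the idle-side test after this patch */
        while (!need_resched() && !need_resched_delayed())
                cpu_relax();            /* poll both reschedule flags */

The idle loop also brackets idle() with stop_critical_timings()/start_critical_timings() so the irqs-off latency tracer does not charge the time spent halted to a critical section.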
kfree(t->io_bitmap_ptr); + /* + * On PREEMPT_RT we must not call kfree() with + * preemption disabled, so we first zap the pointer: + */ t->io_bitmap_ptr = NULL; + kfree(io_bitmap_ptr); + clear_thread_flag(TIF_IO_BITMAP); /* * Careful, clear this in the TSS too: */ + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); t->io_bitmap_max = 0; tss->io_bitmap_owner = NULL; Index: linux-2.6.24.7-rt27/arch/x86/kernel/process_64.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/process_64.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/process_64.c 2009-02-08 00:04:57.000000000 -0500 @@ -115,7 +115,7 @@ static void default_idle(void) */ smp_mb(); local_irq_disable(); - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { /* Enables interrupts one instruction before HLT. x86 special cases this so there is no race. */ safe_halt(); @@ -212,8 +212,8 @@ void cpu_idle (void) current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(); - while (!need_resched()) { + tick_nohz_stop_sched_tick(1); + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -232,7 +232,10 @@ void cpu_idle (void) */ local_irq_disable(); enter_idle(); + /* Don't trace irqs off for idle */ + stop_critical_timings(); idle(); + start_critical_timings(); /* In many cases the interrupt that ended idle has already called exit_idle. But some idle loops can be woken up without interrupt. */ @@ -240,9 +243,11 @@ void cpu_idle (void) } tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } @@ -258,10 +263,10 @@ void cpu_idle (void) */ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) { - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) __mwait(eax, ecx); } } @@ -269,10 +274,10 @@ void mwait_idle_with_hints(unsigned long /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) __sti_mwait(0, 0); else local_irq_enable(); @@ -390,7 +395,7 @@ void exit_thread(void) struct thread_struct *t = &me->thread; if (me->thread.io_bitmap_ptr) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + struct tss_struct *tss; kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; @@ -398,6 +403,7 @@ void exit_thread(void) /* * Careful, clear this in the TSS too: */ + tss = &per_cpu(init_tss, get_cpu()); memset(tss->io_bitmap, 0xff, t->io_bitmap_max); t->io_bitmap_max = 0; put_cpu(); Index: linux-2.6.24.7-rt27/arch/x86/kernel/tsc_32.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/tsc_32.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/tsc_32.c 2009-02-08 00:01:09.000000000 -0500 @@ -92,7 +92,7 @@ static inline void set_cyc2ns_scale(unsi /* * Scheduler clock - returns current time in nanosec units. 
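The exit_thread() reshuffle above is a PREEMPT_RT rule in action: get_cpu() disables preemption, and on RT kfree() must not be called with preemption disabled (slab locks become sleeping locks), so the free has to happen before the preempt-disabled region rather than inside it. The pointer is unpublished first so no concurrent reader can use the buffer being freed:

        /* sketch of the required ordering on PREEMPT_RT */
        void *ptr = t->io_bitmap_ptr;
        t->io_bitmap_ptr = NULL;        /* unpublish                     */
        kfree(ptr);                     /* may sleep on RT: still legal  */
        cpu = get_cpu();                /* only now enter atomic region  */
        tss = &per_cpu(init_tss, cpu);
        memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
        put_cpu();

The 64-bit variant applies the same fix by simply delaying the per_cpu() lookup until after the kfree().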
*/ -unsigned long long native_sched_clock(void) +unsigned long long notrace native_sched_clock(void) { unsigned long long this_offset; Index: linux-2.6.24.7-rt27/arch/x86/kernel/tsc_64.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/tsc_64.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/tsc_64.c 2009-02-08 00:01:09.000000000 -0500 @@ -25,12 +25,12 @@ static inline void set_cyc2ns_scale(unsi cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz; } -static unsigned long long cycles_2_ns(unsigned long long cyc) +static unsigned long long notrace cycles_2_ns(unsigned long long cyc) { return (cyc * cyc2ns_scale) >> NS_SCALE; } -unsigned long long sched_clock(void) +unsigned long long notrace sched_clock(void) { unsigned long a = 0; Index: linux-2.6.24.7-rt27/arch/x86/kernel/x8664_ksyms_64.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/x8664_ksyms_64.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/x8664_ksyms_64.c 2009-02-08 00:02:05.000000000 -0500 @@ -1,6 +1,7 @@ /* Exports for assembly files. All C exports should go in the respective C files. */ +#include #include #include @@ -9,12 +10,19 @@ #include #include +#ifdef CONFIG_FTRACE +/* mcount is defined in assembly */ +EXPORT_SYMBOL(mcount); +#endif + EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); Index: linux-2.6.24.7-rt27/arch/x86/lib/Makefile_32 =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/lib/Makefile_32 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/lib/Makefile_32 2009-02-08 00:01:09.000000000 -0500 @@ -4,7 +4,7 @@ lib-y = checksum_32.o delay_32.o usercopy_32.o getuser_32.o putuser_32.o memcpy_32.o strstr_32.o \ - bitops_32.o semaphore_32.o string_32.o + bitops_32.o semaphore_32.o string_32.o thunk_32.o lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o Index: linux-2.6.24.7-rt27/arch/x86/lib/thunk_32.S =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/arch/x86/lib/thunk_32.S 2009-02-08 00:01:09.000000000 -0500 @@ -0,0 +1,47 @@ +/* + * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash) + * Copyright 2008 by Steven Rostedt, Red Hat, Inc + * (inspired by Andi Kleen's thunk_64.S) + * Subject to the GNU public license, v.2. No warranty of any kind. 
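A quick sketch of what the notrace annotations above buy us. The macro itself is added to include/linux/linkage.h later in this series; my_clock below is a hypothetical stand-in for native_sched_clock():

#define notrace __attribute__((no_instrument_function))

/* With -pg, every plain function gets a "call mcount" prologue.
 * The tracer timestamps its entries with the scheduler clock, so
 * the clock itself must opt out or every hit would recurse: */
notrace unsigned long long my_clock(void)
{
	return 0;	/* read the TSC and scale, as native_sched_clock() does */
}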
+ */ + + #include + +#define ARCH_TRACE_IRQS_ON \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_on; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#define ARCH_TRACE_IRQS_OFF \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_off; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#ifdef CONFIG_TRACE_IRQFLAGS + /* put return address in eax (arg1) */ + .macro thunk_ra name,func + .globl \name +\name: + pushl %eax + pushl %ecx + pushl %edx + /* Place EIP in the arg1 */ + movl 3*4(%esp), %eax + call \func + popl %edx + popl %ecx + popl %eax + ret + .endm + + thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller + thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller +#endif Index: linux-2.6.24.7-rt27/arch/x86/lib/thunk_64.S =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/lib/thunk_64.S 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/lib/thunk_64.S 2009-02-08 00:02:05.000000000 -0500 @@ -40,15 +40,31 @@ thunk rwsem_wake_thunk,rwsem_wake thunk rwsem_downgrade_thunk,rwsem_downgrade_wake #endif - - thunk __down_failed,__down - thunk_retrax __down_failed_interruptible,__down_interruptible - thunk_retrax __down_failed_trylock,__down_trylock - thunk __up_wakeup,__up + +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK + thunk __compat_down_failed,__compat_down + thunk_retrax __compat_down_failed_interruptible,__compat_down_interruptible + thunk_retrax __compat_down_failed_trylock,__compat_down_trylock + thunk __compat_up_wakeup,__compat_up +#endif #ifdef CONFIG_TRACE_IRQFLAGS - thunk trace_hardirqs_on_thunk,trace_hardirqs_on - thunk trace_hardirqs_off_thunk,trace_hardirqs_off + /* put return address in rdi (arg1) */ + .macro thunk_ra name,func + .globl \name +\name: + CFI_STARTPROC + SAVE_ARGS + /* SAVE_ARGS pushs 9 elements */ + /* the next element would be the rip */ + movq 9*8(%rsp), %rdi + call \func + jmp restore + CFI_ENDPROC + .endm + + thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller + thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC Index: linux-2.6.24.7-rt27/arch/x86/mm/init_32.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/mm/init_32.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/mm/init_32.c 2009-02-08 00:02:09.000000000 -0500 @@ -47,7 +47,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; static int noinline do_test_wp_bit(void); @@ -795,7 +795,7 @@ void mark_rodata_ro(void) unsigned long start = PFN_ALIGN(_text); unsigned long size = PFN_ALIGN(_etext) - start; -#ifndef CONFIG_KPROBES +#if !defined(CONFIG_KPROBES) && !defined(CONFIG_DYNAMIC_FTRACE) #ifdef CONFIG_HOTPLUG_CPU /* It must still be possible to apply SMP alternatives. 
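These thunks exist so the C tracer sees the real call site: the thunk_ra macro copies the interrupted code's return address into the first-argument register (%eax on 32-bit, %rdi on 64-bit) before calling the *_caller variant. The C side is defined in the kernel/lockdep.c hunk later in this patch; condensed:

/* receives the address the thunk fished off the stack */
void trace_hardirqs_on_caller(unsigned long a0);

void trace_hardirqs_on(void)
{
	/* direct C callers still work: pass our own return address */
	trace_hardirqs_on_caller(CALLER_ADDR0);
}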
*/ if (num_possible_cpus() <= 1) Index: linux-2.6.24.7-rt27/arch/x86/mm/init_64.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/mm/init_64.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/mm/init_64.c 2009-02-08 00:02:09.000000000 -0500 @@ -53,7 +53,7 @@ EXPORT_SYMBOL(dma_ops); static unsigned long dma_reserve __initdata; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the @@ -600,7 +600,7 @@ void mark_rodata_ro(void) start = (unsigned long)_etext; #endif -#ifdef CONFIG_KPROBES +#if defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) start = (unsigned long)__start_rodata; #endif Index: linux-2.6.24.7-rt27/arch/x86/vdso/vclock_gettime.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/vdso/vclock_gettime.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/vdso/vclock_gettime.c 2009-02-08 00:01:09.000000000 -0500 @@ -24,7 +24,7 @@ #define gtod vdso_vsyscall_gtod_data -static long vdso_fallback_gettime(long clock, struct timespec *ts) +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; asm("syscall" : "=a" (ret) : @@ -32,7 +32,7 @@ static long vdso_fallback_gettime(long c return ret; } -static inline long vgetns(void) +notrace static inline long vgetns(void) { long v; cycles_t (*vread)(void); @@ -41,7 +41,7 @@ static inline long vgetns(void) return (v * gtod->clock.mult) >> gtod->clock.shift; } -static noinline int do_realtime(struct timespec *ts) +notrace static noinline int do_realtime(struct timespec *ts) { unsigned long seq, ns; do { @@ -55,7 +55,8 @@ static noinline int do_realtime(struct t } /* Copy of the version in kernel/time.c which we cannot directly access */ -static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) +notrace static void +vset_normalized_timespec(struct timespec *ts, long sec, long nsec) { while (nsec >= NSEC_PER_SEC) { nsec -= NSEC_PER_SEC; @@ -69,7 +70,7 @@ static void vset_normalized_timespec(str ts->tv_nsec = nsec; } -static noinline int do_monotonic(struct timespec *ts) +notrace static noinline int do_monotonic(struct timespec *ts) { unsigned long seq, ns, secs; do { @@ -83,7 +84,7 @@ static noinline int do_monotonic(struct return 0; } -int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { if (likely(gtod->sysctl_enabled && gtod->clock.vread)) switch (clock) { @@ -97,7 +98,7 @@ int __vdso_clock_gettime(clockid_t clock int clock_gettime(clockid_t, struct timespec *) __attribute__((weak, alias("__vdso_clock_gettime"))); -int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { long ret; if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { Index: linux-2.6.24.7-rt27/arch/x86/vdso/vgetcpu.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/vdso/vgetcpu.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/vdso/vgetcpu.c 2009-02-08 00:01:09.000000000 -0500 @@ -13,7 +13,8 @@ #include #include "vextern.h" -long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +notrace long +__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { unsigned 
int dummy, p; Index: linux-2.6.24.7-rt27/include/asm-x86/alternative_32.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-x86/alternative_32.h 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-x86/alternative_32.h 2009-02-08 00:01:09.000000000 -0500 @@ -151,4 +151,6 @@ apply_paravirt(struct paravirt_patch_sit extern void text_poke(void *addr, unsigned char *opcode, int len); +const unsigned char *const *find_nop_table(void); + #endif /* _I386_ALTERNATIVE_H */ Index: linux-2.6.24.7-rt27/include/asm-x86/alternative_64.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-x86/alternative_64.h 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-x86/alternative_64.h 2009-02-08 00:01:09.000000000 -0500 @@ -156,4 +156,6 @@ apply_paravirt(struct paravirt_patch *st extern void text_poke(void *addr, unsigned char *opcode, int len); +const unsigned char *const *find_nop_table(void); + #endif /* _X86_64_ALTERNATIVE_H */ Index: linux-2.6.24.7-rt27/include/asm-x86/asm.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/include/asm-x86/asm.h 2009-02-08 00:01:09.000000000 -0500 @@ -0,0 +1,39 @@ +#ifndef _ASM_X86_ASM_H +#define _ASM_X86_ASM_H + +#ifdef CONFIG_X86_32 +/* 32 bits */ + +# define _ASM_PTR " .long " +# define _ASM_ALIGN " .balign 4 " +# define _ASM_MOV_UL " movl " + +# define _ASM_INC " incl " +# define _ASM_DEC " decl " +# define _ASM_ADD " addl " +# define _ASM_SUB " subl " +# define _ASM_XADD " xaddl " + +#else +/* 64 bits */ + +# define _ASM_PTR " .quad " +# define _ASM_ALIGN " .balign 8 " +# define _ASM_MOV_UL " movq " + +# define _ASM_INC " incq " +# define _ASM_DEC " decq " +# define _ASM_ADD " addq " +# define _ASM_SUB " subq " +# define _ASM_XADD " xaddq " + +#endif /* CONFIG_X86_32 */ + +/* Exception table entry */ +# define _ASM_EXTABLE(from,to) \ + " .section __ex_table,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR #from "," #to "\n" \ + " .previous\n" + +#endif /* _ASM_X86_ASM_H */ Index: linux-2.6.24.7-rt27/include/asm-x86/irqflags_32.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-x86/irqflags_32.h 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-x86/irqflags_32.h 2009-02-08 00:01:09.000000000 -0500 @@ -157,25 +157,8 @@ static inline void trace_hardirqs_fixup( * C function, so save all the C-clobbered registers: */ #ifdef CONFIG_TRACE_IRQFLAGS - -# define TRACE_IRQS_ON \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_on; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - -# define TRACE_IRQS_OFF \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_off; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; #else # define TRACE_IRQS_ON # define TRACE_IRQS_OFF Index: linux-2.6.24.7-rt27/include/asm-x86/vsyscall.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-x86/vsyscall.h 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-x86/vsyscall.h 2009-02-08 00:01:09.000000000 -0500 @@ -24,7 +24,7 @@ enum vsyscall_num { ((unused, __section__ (".vsyscall_gtod_data"),aligned(16))) #define __section_vsyscall_clock __attribute__ \ ((unused, 
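_ASM_EXTABLE() in the new asm.h emits an exception-table entry pointing from a possibly-faulting instruction to its fixup. A hedged illustration of the intended use in inline assembly (this helper is illustrative, in the style of the uaccess routines, not part of the patch):

/* read one user byte; a fault jumps to the 3: fixup via __ex_table */
static inline int probe_user_byte(const unsigned char __user *addr,
				  unsigned char *val)
{
	int err = 0;

	asm volatile("1:	movb (%2), %b0\n"
		     "2:\n"
		     ".section .fixup,\"ax\"\n"
		     "3:	movl $-14, %1\n"	/* -EFAULT */
		     "	jmp 2b\n"
		     ".previous\n"
		     _ASM_EXTABLE(1b, 3b)
		     : "=q" (*val), "+r" (err)
		     : "r" (addr));
	return err;
}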
__section__ (".vsyscall_clock"),aligned(16))) -#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn"))) +#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn"))) notrace #define VGETCPU_RDTSCP 1 #define VGETCPU_LSL 2 Index: linux-2.6.24.7-rt27/include/linux/ftrace.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/include/linux/ftrace.h 2009-02-08 00:05:15.000000000 -0500 @@ -0,0 +1,232 @@ +#ifndef _LINUX_FTRACE_H +#define _LINUX_FTRACE_H + +#include + +#ifdef CONFIG_FTRACE + +#include +#include + +extern int ftrace_enabled; +extern int +ftrace_enable_sysctl(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos); + +typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); + +struct ftrace_ops { + ftrace_func_t func; + struct ftrace_ops *next; +}; + +/* + * The ftrace_ops must be a static and should also + * be read_mostly. These functions do modify read_mostly variables + * so use them sparely. Never free an ftrace_op or modify the + * next pointer after it has been registered. Even after unregistering + * it, the next pointer may still be used internally. + */ +int register_ftrace_function(struct ftrace_ops *ops); +int unregister_ftrace_function(struct ftrace_ops *ops); +void clear_ftrace_function(void); + +extern void ftrace_stub(unsigned long a0, unsigned long a1); +extern void mcount(void); + +void ftrace_enable(void); +void ftrace_disable(void); + +/* totally disable ftrace - can not re-enable after this */ +void ftrace_kill(void); +void __ftrace_kill(void); + +#else /* !CONFIG_FTRACE */ +# define register_ftrace_function(ops) do { } while (0) +# define unregister_ftrace_function(ops) do { } while (0) +# define clear_ftrace_function(ops) do { } while (0) +# define ftrace_enable() do { } while (0) +# define ftrace_disable() do { } while (0) +# define ftrace_kill() do { } while (0) +# define __ftrace_kill() do { } while (0) +#endif /* CONFIG_FTRACE */ + +#ifdef CONFIG_DYNAMIC_FTRACE +# define FTRACE_HASHBITS 10 +# define FTRACE_HASHSIZE (1< + +static inline void ftrace_event_irq(int irq, int user, unsigned long ip) +{ + trace_mark(ftrace_event_irq, "%d %d %ld", irq, user, ip); +} + +static inline void ftrace_event_fault(unsigned long ip, unsigned long error, + unsigned long addr) +{ + trace_mark(ftrace_event_fault, "%ld %ld %ld", ip, error, addr); +} + +static inline void ftrace_event_timer_set(void *p1, void *p2) +{ + trace_mark(ftrace_event_timer_set, "%p %p", p1, p2); +} + +static inline void ftrace_event_timer_triggered(void *p1, void *p2) +{ + trace_mark(ftrace_event_timer_triggered, "%p %p", p1, p2); +} + +static inline void ftrace_event_timestamp(ktime_t *time) +{ + trace_mark(ftrace_event_hrtimer, "%p", time); +} + +static inline void ftrace_event_task_activate(struct task_struct *p, int cpu) +{ + trace_mark(ftrace_event_task_activate, "%p %d", p, cpu); +} + +static inline void ftrace_event_task_deactivate(struct task_struct *p, int cpu) +{ + trace_mark(ftrace_event_task_deactivate, "%p %d", p, cpu); +} + +static inline void ftrace_event_program_event(ktime_t *expires, int64_t *delta) +{ + trace_mark(ftrace_event_timer, "%p %p", expires, delta); +} + +#else +# define ftrace_event_irq(irq, user, ip) do { } while (0) +# define ftrace_event_fault(ip, error, addr) do { } while (0) +# define ftrace_event_timer_set(p1, p2) do { } while (0) +# define ftrace_event_timer_triggered(p1, p2) do 
{ } while (0) +# define ftrace_event_timestamp(now) do { } while (0) +# define ftrace_event_task_activate(p, cpu) do { } while (0) +# define ftrace_event_task_deactivate(p, cpu) do { } while (0) +# define ftrace_event_program_event(p, d) do { } while (0) +#endif /* CONFIG_TRACE_EVENTS */ + +#ifdef CONFIG_FTRACE_MCOUNT_RECORD +extern void ftrace_init(void); +extern void ftrace_init_module(unsigned long *start, unsigned long *end); +#else +static inline void ftrace_init(void) { } +static inline void +ftrace_init_module(unsigned long *start, unsigned long *end) { } +#endif + +/* Upstream has include/asm-x86/ftrace.h, but we'll hack this for now */ +#ifdef CONFIG_X86 +static inline unsigned long ftrace_call_adjust(unsigned long addr) +{ + /* + * call mcount is "e8 <4 byte offset>" + * The addr points to the 4 byte offset and the caller of this + * function wants the pointer to e8. Simply subtract one. + */ + return addr - 1; +} +#else +static inline unsigned long ftrace_call_adjust(unsigned long addr) +{ + return addr; +} +#endif +#endif /* _LINUX_FTRACE_H */ Index: linux-2.6.24.7-rt27/include/linux/irqflags.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/linux/irqflags.h 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/include/linux/irqflags.h 2009-02-08 00:01:59.000000000 -0500 @@ -11,6 +11,12 @@ #ifndef _LINUX_TRACE_IRQFLAGS_H #define _LINUX_TRACE_IRQFLAGS_H +#define BUILD_CHECK_IRQ_FLAGS(flags) \ + do { \ + BUILD_BUG_ON(sizeof(flags) != sizeof(unsigned long)); \ + typecheck(unsigned long, flags); \ + } while (0) + #ifdef CONFIG_TRACE_IRQFLAGS extern void trace_hardirqs_on(void); extern void trace_hardirqs_off(void); @@ -41,6 +47,15 @@ # define INIT_TRACE_IRQFLAGS #endif +#if defined(CONFIG_IRQSOFF_TRACER) || \ + defined(CONFIG_PREEMPT_TRACER) + extern void stop_critical_timings(void); + extern void start_critical_timings(void); +#else +# define stop_critical_timings() do { } while (0) +# define start_critical_timings() do { } while (0) +#endif + #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT #include @@ -50,10 +65,15 @@ #define local_irq_disable() \ do { raw_local_irq_disable(); trace_hardirqs_off(); } while (0) #define local_irq_save(flags) \ - do { raw_local_irq_save(flags); trace_hardirqs_off(); } while (0) + do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + raw_local_irq_save(flags); \ + trace_hardirqs_off(); \ + } while (0) #define local_irq_restore(flags) \ do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ if (raw_irqs_disabled_flags(flags)) { \ raw_local_irq_restore(flags); \ trace_hardirqs_off(); \ @@ -69,8 +89,16 @@ */ # define raw_local_irq_disable() local_irq_disable() # define raw_local_irq_enable() local_irq_enable() -# define raw_local_irq_save(flags) local_irq_save(flags) -# define raw_local_irq_restore(flags) local_irq_restore(flags) +# define raw_local_irq_save(flags) \ + do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + local_irq_save(flags); \ + } while (0) +# define raw_local_irq_restore(flags) \ + do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + local_irq_restore(flags); \ + } while (0) #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */ #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT @@ -80,7 +108,11 @@ raw_safe_halt(); \ } while (0) -#define local_save_flags(flags) raw_local_save_flags(flags) +#define local_save_flags(flags) \ + do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + raw_local_save_flags(flags); \ + } while (0) #define irqs_disabled() \ ({ \ @@ -90,7 +122,11 @@ raw_irqs_disabled_flags(flags); \ }) -#define irqs_disabled_flags(flags) 
raw_irqs_disabled_flags(flags) +#define irqs_disabled_flags(flags) \ +({ \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + raw_irqs_disabled_flags(flags); \ +}) #endif /* CONFIG_X86 */ #endif Index: linux-2.6.24.7-rt27/include/linux/ktime.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/linux/ktime.h 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/include/linux/ktime.h 2009-02-08 00:01:09.000000000 -0500 @@ -326,4 +326,10 @@ extern void ktime_get_ts(struct timespec /* Get the real (wall-) time in timespec format: */ #define ktime_get_real_ts(ts) getnstimeofday(ts) +static inline ktime_t ns_to_ktime(u64 ns) +{ + static const ktime_t ktime_zero = { .tv64 = 0 }; + return ktime_add_ns(ktime_zero, ns); +} + #endif Index: linux-2.6.24.7-rt27/include/linux/linkage.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/linux/linkage.h 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/include/linux/linkage.h 2009-02-08 00:01:09.000000000 -0500 @@ -3,6 +3,8 @@ #include +#define notrace __attribute__((no_instrument_function)) + #ifdef __cplusplus #define CPP_ASMLINKAGE extern "C" #else Index: linux-2.6.24.7-rt27/include/linux/mmiotrace.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/include/linux/mmiotrace.h 2009-02-08 00:01:09.000000000 -0500 @@ -0,0 +1,85 @@ +#ifndef MMIOTRACE_H +#define MMIOTRACE_H + +#include +#include + +struct kmmio_probe; +struct pt_regs; + +typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *, + struct pt_regs *, unsigned long addr); +typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, + unsigned long condition, struct pt_regs *); + +struct kmmio_probe { + struct list_head list; /* kmmio internal list */ + unsigned long addr; /* start location of the probe point */ + unsigned long len; /* length of the probe region */ + kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ + kmmio_post_handler_t post_handler; /* Called after addr is executed */ + void *private; +}; + +/* kmmio is active by some kmmio_probes? */ +static inline int is_kmmio_active(void) +{ + extern unsigned int kmmio_count; + return kmmio_count; +} + +extern int register_kmmio_probe(struct kmmio_probe *p); +extern void unregister_kmmio_probe(struct kmmio_probe *p); + +/* Called from page fault handler. 
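A sketch of how an mmiotrace client hooks a mapped region with the kmmio API above (the address and handler bodies are illustrative assumptions):

static void my_pre(struct kmmio_probe *p, struct pt_regs *regs,
		   unsigned long addr)
{
	/* called just before the trapped MMIO access executes */
}

static void my_post(struct kmmio_probe *p, unsigned long cond,
		    struct pt_regs *regs)
{
	/* called once the access has completed */
}

static struct kmmio_probe my_probe = {
	.addr		= 0xf8000000UL,	/* hypothetical ioremap()ed base */
	.len		= PAGE_SIZE,
	.pre_handler	= my_pre,
	.post_handler	= my_post,
};

/* register_kmmio_probe(&my_probe) arms the region so that accesses
 * fault and are routed through kmmio_handler(). */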
*/ +extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); + +/* Called from ioremap.c */ +#ifdef CONFIG_MMIOTRACE +extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size, + void __iomem *addr); +extern void mmiotrace_iounmap(volatile void __iomem *addr); +#else +static inline void mmiotrace_ioremap(resource_size_t offset, + unsigned long size, void __iomem *addr) +{ +} + +static inline void mmiotrace_iounmap(volatile void __iomem *addr) +{ +} +#endif /* CONFIG_MMIOTRACE_HOOKS */ + +enum mm_io_opcode { + MMIO_READ = 0x1, /* struct mmiotrace_rw */ + MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ + MMIO_PROBE = 0x3, /* struct mmiotrace_map */ + MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ + MMIO_MARKER = 0x5, /* raw char data */ + MMIO_UNKNOWN_OP = 0x6, /* struct mmiotrace_rw */ +}; + +struct mmiotrace_rw { + resource_size_t phys; /* PCI address of register */ + unsigned long value; + unsigned long pc; /* optional program counter */ + int map_id; + unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */ + unsigned char width; /* size of register access in bytes */ +}; + +struct mmiotrace_map { + resource_size_t phys; /* base address in PCI space */ + unsigned long virt; /* base virtual address */ + unsigned long len; /* mapping size */ + int map_id; + unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */ +}; + +/* in kernel/trace/trace_mmiotrace.c */ +extern void enable_mmiotrace(void); +extern void disable_mmiotrace(void); +extern void mmio_trace_rw(struct mmiotrace_rw *rw); +extern void mmio_trace_mapping(struct mmiotrace_map *map); + +#endif /* MMIOTRACE_H */ Index: linux-2.6.24.7-rt27/include/linux/preempt.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/linux/preempt.h 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/include/linux/preempt.h 2009-02-08 00:02:08.000000000 -0500 @@ -9,8 +9,10 @@ #include #include #include +#include -#ifdef CONFIG_DEBUG_PREEMPT +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) || \ + defined(CONFIG_PREEMPT_TRACE) extern void fastcall add_preempt_count(int val); extern void fastcall sub_preempt_count(int val); #else @@ -21,11 +23,12 @@ #define inc_preempt_count() add_preempt_count(1) #define dec_preempt_count() sub_preempt_count(1) -#define preempt_count() (current_thread_info()->preempt_count) +#define preempt_count() (current_thread_info()->preempt_count) #ifdef CONFIG_PREEMPT asmlinkage void preempt_schedule(void); +asmlinkage void preempt_schedule_irq(void); #define preempt_disable() \ do { \ @@ -33,21 +36,62 @@ do { \ barrier(); \ } while (0) -#define preempt_enable_no_resched() \ +#define __preempt_enable_no_resched() \ do { \ barrier(); \ dec_preempt_count(); \ } while (0) + +#ifdef CONFIG_DEBUG_PREEMPT +extern void notrace preempt_enable_no_resched(void); +#else +# define preempt_enable_no_resched() __preempt_enable_no_resched() +#endif + #define preempt_check_resched() \ do { \ if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ preempt_schedule(); \ } while (0) +#define preempt_check_resched_delayed() \ +do { \ + if (unlikely(test_thread_flag(TIF_NEED_RESCHED_DELAYED))) \ + preempt_schedule(); \ +} while (0) + #define preempt_enable() \ do { \ - preempt_enable_no_resched(); \ + __preempt_enable_no_resched(); \ + barrier(); \ + preempt_check_resched(); \ +} while (0) + +/* For debugging and tracer internals only! 
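The _notrace helpers defined just below exist because add_preempt_count()/sub_preempt_count() are themselves hooked when the preemption tracers are on; tracer internals must bump the count without re-entering the tracer. ftrace_record_ip() in kernel/trace/ftrace.c later in this patch uses exactly this pattern; a condensed sketch (record_one_event is hypothetical):

static void record_one_event(void)
{
	int resched = need_resched();

	preempt_disable_notrace();	/* raw count bump, no trace hook */
	/* ... log the entry into the trace buffer ... */

	/* don't let the tracer itself force a reschedule pass */
	if (resched)
		preempt_enable_no_resched_notrace();
	else
		preempt_enable_notrace();
}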
*/ +#define add_preempt_count_notrace(val) \ + do { preempt_count() += (val); } while (0) +#define sub_preempt_count_notrace(val) \ + do { preempt_count() -= (val); } while (0) +#define inc_preempt_count_notrace() add_preempt_count_notrace(1) +#define dec_preempt_count_notrace() sub_preempt_count_notrace(1) + +#define preempt_disable_notrace() \ +do { \ + inc_preempt_count_notrace(); \ + barrier(); \ +} while (0) + +#define preempt_enable_no_resched_notrace() \ +do { \ + barrier(); \ + dec_preempt_count_notrace(); \ +} while (0) + +/* preempt_check_resched is OK to trace */ +#define preempt_enable_notrace() \ +do { \ + preempt_enable_no_resched_notrace(); \ barrier(); \ preempt_check_resched(); \ } while (0) @@ -56,8 +100,16 @@ do { \ #define preempt_disable() do { } while (0) #define preempt_enable_no_resched() do { } while (0) +#define __preempt_enable_no_resched() do { } while (0) #define preempt_enable() do { } while (0) #define preempt_check_resched() do { } while (0) +#define preempt_check_resched_delayed() do { } while (0) + +#define preempt_disable_notrace() do { } while (0) +#define preempt_enable_no_resched_notrace() do { } while (0) +#define preempt_enable_notrace() do { } while (0) + +#define preempt_schedule_irq() do { } while (0) #endif Index: linux-2.6.24.7-rt27/include/linux/writeback.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/linux/writeback.h 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/include/linux/writeback.h 2009-02-08 00:01:09.000000000 -0500 @@ -103,6 +103,8 @@ extern int dirty_expire_interval; extern int block_dump; extern int laptop_mode; +extern unsigned long determine_dirtyable_memory(void); + extern int dirty_ratio_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); Index: linux-2.6.24.7-rt27/kernel/lockdep.c =================================================================== --- linux-2.6.24.7-rt27.orig/kernel/lockdep.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/kernel/lockdep.c 2009-02-08 00:04:33.000000000 -0500 @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -66,7 +67,7 @@ module_param(lock_stat, int, 0644); * to use a raw spinlock - we really dont want the spinlock * code to recurse back into the lockdep code... 
 */
-static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+static __raw_spinlock_t lockdep_lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 
 static int graph_lock(void)
 {
@@ -81,6 +82,8 @@ static int graph_lock(void)
 		__raw_spin_unlock(&lockdep_lock);
 		return 0;
 	}
+	/* prevent any recursions within lockdep from causing deadlocks */
+	current->lockdep_recursion++;
 	return 1;
 }
 
@@ -89,6 +92,7 @@ static inline int graph_unlock(void)
 	if (debug_locks && !__raw_spin_is_locked(&lockdep_lock))
 		return DEBUG_LOCKS_WARN_ON(1);
 
+	current->lockdep_recursion--;
 	__raw_spin_unlock(&lockdep_lock);
 	return 0;
 }
@@ -508,7 +512,11 @@ static void print_lock(struct held_lock
 
 static void lockdep_print_held_locks(struct task_struct *curr)
 {
-	int i, depth = curr->lockdep_depth;
+	int i, depth;
+
+	if (!curr)
+		curr = current;
+	depth = curr->lockdep_depth;
 
 	if (!depth) {
 		printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr));
@@ -573,7 +581,7 @@ static void print_lock_dependencies(stru
 
 static void print_kernel_version(void)
 {
-	printk("%s %.*s\n", init_utsname()->release,
+	printk("[ %s %.*s\n", init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
 }
@@ -809,6 +817,21 @@ out_unlock_set:
 	return class;
 }
 
+#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_TRACE_IRQFLAGS)
+
+#define RECURSION_LIMIT 40
+
+static int noinline print_infinite_recursion_bug(void)
+{
+	if (!debug_locks_off_graph_unlock())
+		return 0;
+
+	WARN_ON(1);
+
+	return 0;
+}
+#endif /* CONFIG_PROVE_LOCKING || CONFIG_TRACE_IRQFLAGS */
+
 #ifdef CONFIG_PROVE_LOCKING
 /*
 * Allocate a lockdep entry. (assumes the graph_lock held, returns
@@ -939,18 +962,6 @@ static noinline int print_circular_bug_t
 	return 0;
 }
 
-#define RECURSION_LIMIT 40
-
-static int noinline print_infinite_recursion_bug(void)
-{
-	if (!debug_locks_off_graph_unlock())
-		return 0;
-
-	WARN_ON(1);
-
-	return 0;
-}
-
 /*
 * Prove that the dependency graph starting at <entry> can not
 * lead to <target>. Print an error and return 0 if it does.
@@ -978,7 +989,7 @@ check_noncircular(struct lock_class *sou return 1; } -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency @@ -1068,6 +1079,7 @@ find_usage_backwards(struct lock_class * return 1; } +#ifdef CONFIG_PROVE_LOCKING static int print_bad_irq_dependency(struct task_struct *curr, struct held_lock *prev, @@ -1128,6 +1140,7 @@ print_bad_irq_dependency(struct task_str return 0; } +#endif /* CONFIG_PROVE_LOCKING */ static int check_usage(struct task_struct *curr, struct held_lock *prev, @@ -1676,7 +1689,7 @@ valid_state(struct task_struct *curr, st static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * print irq inversion bug: @@ -2009,11 +2022,12 @@ void early_boot_irqs_on(void) /* * Hardirqs will be enabled: */ -void trace_hardirqs_on(void) +void trace_hardirqs_on_caller(unsigned long a0) { struct task_struct *curr = current; unsigned long ip; + time_hardirqs_on(CALLER_ADDR0, a0); if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2051,16 +2065,23 @@ void trace_hardirqs_on(void) curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_on_events); } +EXPORT_SYMBOL(trace_hardirqs_on_caller); +void trace_hardirqs_on(void) +{ + trace_hardirqs_on_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off(void) +void trace_hardirqs_off_caller(unsigned long a0) { struct task_struct *curr = current; + time_hardirqs_off(CALLER_ADDR0, a0); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2078,7 +2099,12 @@ void trace_hardirqs_off(void) } else debug_atomic_inc(&redundant_hardirqs_off); } +EXPORT_SYMBOL(trace_hardirqs_off_caller); +void trace_hardirqs_off(void) +{ + trace_hardirqs_off_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_off); /* @@ -2518,6 +2544,55 @@ static int check_unlock(struct task_stru return 1; } +static int +__lock_set_subclass(struct lockdep_map *lock, + unsigned int subclass, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class *class; + unsigned int depth; + int i; + + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return 0; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + return print_unlock_inbalance_bug(curr, lock, ip); + +found_it: + class = register_lock_class(lock, subclass, 0); + hlock->class = class; + + curr->lockdep_depth = i; + curr->curr_chain_key = hlock->prev_chain_key; + + for (; i < depth; i++) { + hlock = curr->held_locks + i; + if (!__lock_acquire(hlock->instance, + hlock->class->subclass, hlock->trylock, + hlock->read, hlock->check, hlock->hardirqs_off, + hlock->acquire_ip)) + return 0; + } + + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + return 0; + return 1; +} + /* * Remove the lock to the list of currently held locks in a * potentially non-nested (out of order) manner. 
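__lock_set_subclass() re-annotates a lock the task already holds: it pops the held-lock stack back to that entry, switches the entry to the class registered for the new subclass, and replays the stacked acquisitions. Its public wrapper lock_set_subclass() appears in the next hunk; usage is a one-liner on an already-held lock, roughly how the scheduler's double-lock balancing re-annotates rq->lock (sketch, the caller is outside this hunk):

/* we still hold this_rq->lock; re-annotate it back to subclass 0
 * now that the nested second rq lock has been dropped */
lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);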
This is a @@ -2681,6 +2756,29 @@ static void check_flags(unsigned long fl #endif } +void +lock_set_subclass(struct lockdep_map *lock, + unsigned int subclass, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(!lock_stat && !prove_locking)) + return; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; + check_flags(flags); + if (__lock_set_subclass(lock, subclass, ip)) + check_chain_key(current); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} + +EXPORT_SYMBOL_GPL(lock_set_subclass); + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -2791,7 +2889,7 @@ found_it: stats = get_lock_stats(hlock->class); if (point < ARRAY_SIZE(stats->contention_point)) - stats->contention_point[i]++; + stats->contention_point[point]++; if (lock->cpu != smp_processor_id()) stats->bounces[bounce_contended + !!hlock->read]++; put_lock_stats(stats); @@ -3039,13 +3137,13 @@ void __init lockdep_info(void) { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); - printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); - printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); - printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); - printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); - printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); - printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); - printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); + printk("... MAX_LOCKDEP_SUBCLASSES: %6lu\n", MAX_LOCKDEP_SUBCLASSES); + printk("... MAX_LOCK_DEPTH: %6lu\n", MAX_LOCK_DEPTH); + printk("... MAX_LOCKDEP_KEYS: %6lu\n", MAX_LOCKDEP_KEYS); + printk("... CLASSHASH_SIZE: %6lu\n", CLASSHASH_SIZE); + printk("... MAX_LOCKDEP_ENTRIES: %6lu\n", MAX_LOCKDEP_ENTRIES); + printk("... MAX_LOCKDEP_CHAINS: %6lu\n", MAX_LOCKDEP_CHAINS); + printk("... 
CHAINHASH_SIZE: %6lu\n", CHAINHASH_SIZE); printk(" memory used by lock dependency info: %lu kB\n", (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + @@ -3212,7 +3310,8 @@ void debug_show_held_locks(struct task_s printk("INFO: lockdep is turned off.\n"); return; } - lockdep_print_held_locks(task); + if (task == current) + lockdep_print_held_locks(task); } EXPORT_SYMBOL_GPL(debug_show_held_locks); Index: linux-2.6.24.7-rt27/kernel/sched_trace.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/sched_trace.h 2009-02-08 00:01:09.000000000 -0500 @@ -0,0 +1,41 @@ +#include + +static inline void trace_kernel_sched_wait(struct task_struct *p) +{ + trace_mark(kernel_sched_wait_task, "pid %d state %ld", + p->pid, p->state); +} + +static inline +void trace_kernel_sched_wakeup(struct rq *rq, struct task_struct *p) +{ + trace_mark(kernel_sched_wakeup, + "pid %d state %ld ## rq %p task %p rq->curr %p", + p->pid, p->state, rq, p, rq->curr); +} + +static inline +void trace_kernel_sched_wakeup_new(struct rq *rq, struct task_struct *p) +{ + trace_mark(kernel_sched_wakeup_new, + "pid %d state %ld ## rq %p task %p rq->curr %p", + p->pid, p->state, rq, p, rq->curr); +} + +static inline void trace_kernel_sched_switch(struct rq *rq, + struct task_struct *prev, struct task_struct *next) +{ + trace_mark(kernel_sched_schedule, + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + prev->pid, next->pid, prev->state, + rq, prev, next); +} + +static inline void +trace_kernel_sched_migrate_task(struct task_struct *p, int src, int dst) +{ + trace_mark(kernel_sched_migrate_task, + "pid %d state %ld dest_cpu %d", + p->pid, p->state, dst); +} Index: linux-2.6.24.7-rt27/kernel/sysctl.c =================================================================== --- linux-2.6.24.7-rt27.orig/kernel/sysctl.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/kernel/sysctl.c 2009-02-08 00:05:08.000000000 -0500 @@ -46,6 +46,8 @@ #include #include #include +#include +#include #include #include @@ -66,6 +68,7 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern int sysctl_panic_on_oom; extern int sysctl_oom_kill_allocating_task; +extern int futex_performance_hack; extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; @@ -147,6 +150,10 @@ static int parse_table(int __user *, int void __user *, size_t, struct ctl_table *); #endif +#ifdef CONFIG_PREEMPT_RT +extern int rt_rwlock_limit; +#endif + #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, @@ -338,6 +345,82 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_FUTEX + { + .ctl_name = CTL_UNNUMBERED, + .procname = "futex_performance_hack", + .data = &futex_performance_hack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "futex_rt_pi_warning", + .data = &futex_rt_pi_warning, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = "prof_pid", + .data = &prof_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_PREEMPT + { + .ctl_name = CTL_UNNUMBERED, + .procname = "kernel_preemption", + .data = &kernel_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef 
CONFIG_PREEMPT_VOLUNTARY + { + .ctl_name = CTL_UNNUMBERED, + .procname = "voluntary_preemption", + .data = &voluntary_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#if defined(CONFIG_PREEMPT_SOFTIRQS) && !defined(CONFIG_PREEMPT_RT) + { + .ctl_name = CTL_UNNUMBERED, + .procname = "softirq_preemption", + .data = &softirq_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#if defined(CONFIG_PREEMPT_HARDIRQS) && !defined(CONFIG_PREEMPT_RT) + { + .ctl_name = CTL_UNNUMBERED, + .procname = "hardirq_preemption", + .data = &hardirq_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_PREEMPT_RT + { + .ctl_name = CTL_UNNUMBERED, + .procname = "rwlock_reader_limit", + .data = &rt_rwlock_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_PANIC, .procname = "panic", @@ -346,6 +429,16 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_GENERIC_HARDIRQS + { + .ctl_name = CTL_UNNUMBERED, + .procname = "debug_direct_keyboard", + .data = &debug_direct_keyboard, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_CORE_USES_PID, .procname = "core_uses_pid", @@ -470,6 +563,16 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_FTRACE + { + .ctl_name = CTL_UNNUMBERED, + .procname = "ftrace_enabled", + .data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ftrace_enable_sysctl, + }, +#endif #ifdef CONFIG_KMOD { .ctl_name = KERN_MODPROBE, Index: linux-2.6.24.7-rt27/kernel/trace/Kconfig =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/trace/Kconfig 2009-02-08 00:05:15.000000000 -0500 @@ -0,0 +1,173 @@ +# +# Architectures that offer an FTRACE implementation should select HAVE_FTRACE: +# +config HAVE_FTRACE + bool + +config HAVE_DYNAMIC_FTRACE + bool + +config HAVE_FTRACE_MCOUNT_RECORD + bool + +config TRACER_MAX_TRACE + bool + +config TRACING + bool + select DEBUG_FS + select STACKTRACE + +config FTRACE + bool "Kernel Function Tracer" + depends on HAVE_FTRACE + select FRAME_POINTER + select TRACING + select CONTEXT_SWITCH_TRACER + help + Enable the kernel to trace every kernel function. This is done + by using a compiler feature to insert a small, 5-byte No-Operation + instruction to the beginning of every kernel function, which NOP + sequence is then dynamically patched into a tracer call when + tracing is enabled by the administrator. If it's runtime disabled + (the bootup default), then the overhead of the instructions is very + small and not measurable even in micro-benchmarks. + +config IRQSOFF_TRACER + bool "Interrupts-off Latency Tracer" + default n + depends on TRACE_IRQFLAGS_SUPPORT + depends on GENERIC_TIME + depends on HAVE_FTRACE + select TRACE_IRQFLAGS + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in irqs-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + + (Note that kernel size and overhead increases with this option + enabled. 
This option and the preempt-off timing option can be + used together or separately.) + +config PREEMPT_TRACER + bool "Preemption-off Latency Tracer" + default n + depends on GENERIC_TIME + depends on PREEMPT + depends on HAVE_FTRACE + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in preemption off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the irqs-off timing option can be + used together or separately.) + +config SCHED_TRACER + bool "Scheduling Latency Tracer" + depends on HAVE_FTRACE + select TRACING + select CONTEXT_SWITCH_TRACER + select TRACER_MAX_TRACE + help + This tracer tracks the latency of the highest priority task + to be scheduled in, starting from the point it has woken up. + +config EVENT_TRACER + bool "trace kernel events" + depends on DEBUG_KERNEL + select CONTEXT_SWITCH_TRACER + help + This option activates the event tracer of the latency_tracer. + It activates markers through out the kernel for tracing. + This option has a fairly low overhead when enabled. + +config CONTEXT_SWITCH_TRACER + bool "Trace process context switches" + depends on HAVE_FTRACE + select TRACING + select MARKERS + help + This tracer gets called from the context switch and records + all switching of tasks. + +config DYNAMIC_FTRACE + bool "enable/disable ftrace tracepoints dynamically" + depends on FTRACE + depends on HAVE_DYNAMIC_FTRACE + default y + help + This option will modify all the calls to ftrace dynamically + (will patch them out of the binary image and replaces them + with a No-Op instruction) as they are called. A table is + created to dynamically enable them again. + + This way a CONFIG_FTRACE kernel is slightly larger, but otherwise + has native performance as long as no tracing is active. + + The changes to the code are done by a kernel thread that + wakes up once a second and checks to see if any ftrace calls + were made. If so, it runs stop_machine (stops all CPUS) + and modifies the code to jump over the call to ftrace. + +config FTRACE_MCOUNT_RECORD + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_FTRACE_MCOUNT_RECORD + +config FTRACE_SELFTEST + bool + +config FTRACE_STARTUP_TEST + bool "Perform a startup test on ftrace" + depends on TRACING + select FTRACE_SELFTEST + help + This option performs a series of startup tests on ftrace. On bootup + a series of tests are made to verify that the tracer is + functioning properly. It will do tests on all the configured + tracers of ftrace. + +config INTERRUPT_OFF_HIST + bool "Interrupts off critical timings histogram" + depends on IRQSOFF_TRACER + help + This option uses the infrastructure of the critical + irqs off timings to create a histogram of latencies. + +config PREEMPT_OFF_HIST + bool "Preempt off critical timings histogram" + depends on PREEMPT_TRACER + help + This option uses the infrastructure of the critical + preemption off timings to create a histogram of latencies. + +config WAKEUP_LATENCY_HIST + bool "Wakeup latencies histogram" + select TRACING + select MARKERS + help + This option uses the infrastructure of the wakeup tracer + to create a histogram of latencies. 
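Day to day these tracers are driven through debugfs, as the help texts above indicate. A small userspace sketch, assuming debugfs is mounted at /debugfs and using the tracing_max_latency control file named above:

#include <stdio.h>

int main(void)
{
	/* reset the recorded worst case: echo 0 > tracing_max_latency */
	FILE *f = fopen("/debugfs/tracing/tracing_max_latency", "w");

	if (!f)
		return 1;
	fputs("0\n", f);
	fclose(f);
	return 0;
}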
+ +config PREEMPT_TRACE + bool "Keep a record of preempt disabled spots" + depends on DEBUG_KERNEL + select TRACING + help + Keeps a record of the last 25 preempt disabled locations. Index: linux-2.6.24.7-rt27/kernel/trace/Makefile =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/trace/Makefile 2009-02-08 00:01:16.000000000 -0500 @@ -0,0 +1,30 @@ + +# Do not instrument the tracer itself: + +ifdef CONFIG_FTRACE +ORIG_CFLAGS := $(KBUILD_CFLAGS) +KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) + +# selftest needs instrumentation +CFLAGS_trace_selftest_dynamic.o = -pg +obj-y += trace_selftest_dynamic.o +endif + +obj-$(CONFIG_FTRACE) += libftrace.o + +obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o +obj-$(CONFIG_FTRACE) += trace_functions.o +obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o +obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o +obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o +obj-$(CONFIG_EVENT_TRACER) += trace_events.o + +obj-$(CONFIG_INTERRUPT_OFF_HIST) += trace_hist.o +obj-$(CONFIG_PREEMPT_OFF_HIST) += trace_hist.o +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += trace_hist.o + +obj-$(CONFIG_PREEMPT_TRACE) += preempt-trace.o + +libftrace-y := ftrace.o Index: linux-2.6.24.7-rt27/kernel/trace/ftrace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/trace/ftrace.c 2009-02-08 00:05:15.000000000 -0500 @@ -0,0 +1,1654 @@ +/* + * Infrastructure for profiling code inserted by 'gcc -pg'. + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2004-2008 Ingo Molnar + * + * Originally ported from the -rt patch by: + * Copyright (C) 2007 Arnaldo Carvalho de Melo + * + * Based on code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +/* ftrace_enabled is a method to turn ftrace on or off */ +int ftrace_enabled __read_mostly; +static int last_ftrace_enabled; + +/* + * ftrace_disabled is set when an anomaly is discovered. + * ftrace_disabled is much stronger than ftrace_enabled. + */ +static int ftrace_disabled __read_mostly; + +static DEFINE_RAW_SPINLOCK(ftrace_lock); +static DEFINE_MUTEX(ftrace_sysctl_lock); + +static struct ftrace_ops ftrace_list_end __read_mostly = +{ + .func = ftrace_stub, +}; + +static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; +ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; + +void ftrace_list_func(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_ops *op = ftrace_list; + + /* in case someone actually ports this to alpha! */ + read_barrier_depends(); + + while (op != &ftrace_list_end) { + /* silly alpha */ + read_barrier_depends(); + op->func(ip, parent_ip); + op = op->next; + }; +} + +/** + * clear_ftrace_function - reset the ftrace function + * + * This NULLs the ftrace function and in essence stops + * tracing. 
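The client view of the registration API implemented below; per the header comment earlier in this patch, the ops must be static because the list walker may still follow ->next after an unregister (my_trace_func is an illustrative callback):

static void my_trace_func(unsigned long ip, unsigned long parent_ip)
{
	/* invoked on entry of every traced kernel function */
}

static struct ftrace_ops my_ops __read_mostly = {
	.func = my_trace_func,
};

/* register_ftrace_function(&my_ops); ...
 * unregister_ftrace_function(&my_ops); but never free my_ops */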
There may be lag + */ +void clear_ftrace_function(void) +{ + ftrace_trace_function = ftrace_stub; +} + +static int __register_ftrace_function(struct ftrace_ops *ops) +{ + /* Should never be called by interrupts */ + spin_lock(&ftrace_lock); + + ops->next = ftrace_list; + /* + * We are entering ops into the ftrace_list but another + * CPU might be walking that list. We need to make sure + * the ops->next pointer is valid before another CPU sees + * the ops pointer included into the ftrace_list. + */ + smp_wmb(); + ftrace_list = ops; + + if (ftrace_enabled) { + /* + * For one func, simply call it directly. + * For more than one func, call the chain. + */ + if (ops->next == &ftrace_list_end) + ftrace_trace_function = ops->func; + else + ftrace_trace_function = ftrace_list_func; + } + + spin_unlock(&ftrace_lock); + + return 0; +} + +static int __unregister_ftrace_function(struct ftrace_ops *ops) +{ + struct ftrace_ops **p; + int ret = 0; + + spin_lock(&ftrace_lock); + + /* + * If we are removing the last function, then simply point + * to the ftrace_stub. + */ + if (ftrace_list == ops && ops->next == &ftrace_list_end) { + ftrace_trace_function = ftrace_stub; + ftrace_list = &ftrace_list_end; + goto out; + } + + for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) + if (*p == ops) + break; + + if (*p != ops) { + ret = -1; + goto out; + } + + *p = (*p)->next; + + if (ftrace_enabled) { + /* If we only have one func left, then call that directly */ + if (ftrace_list == &ftrace_list_end || + ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + } + + out: + spin_unlock(&ftrace_lock); + + return ret; +} + +static int ftrace_disabled_count; +static int save_ftrace_enabled; + +void ftrace_disable(void) +{ + mutex_lock(&ftrace_sysctl_lock); + + save_ftrace_enabled = ftrace_enabled; + ftrace_enabled = 0; +} + +void ftrace_enable(void) +{ + /* ftrace_enable must be paired with ftrace_disable */ + if (!mutex_is_locked(&ftrace_sysctl_lock)) { + WARN_ON(1); + return; + } + + ftrace_enabled = save_ftrace_enabled; + + mutex_unlock(&ftrace_sysctl_lock); +} + +#ifdef CONFIG_DYNAMIC_FTRACE + +static struct task_struct *ftraced_task; + +enum { + FTRACE_ENABLE_CALLS = (1 << 0), + FTRACE_DISABLE_CALLS = (1 << 1), + FTRACE_UPDATE_TRACE_FUNC = (1 << 2), + FTRACE_ENABLE_MCOUNT = (1 << 3), + FTRACE_DISABLE_MCOUNT = (1 << 4), +}; + +static int ftrace_filtered; + +static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; + +static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); + +static DEFINE_RAW_SPINLOCK(ftrace_shutdown_lock); +static DEFINE_MUTEX(ftraced_lock); +static DEFINE_MUTEX(ftrace_regex_lock); + +struct ftrace_page { + struct ftrace_page *next; + unsigned long index; + struct dyn_ftrace records[]; +}; + +#define ENTRIES_PER_PAGE \ + ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) + +/* estimate from running different kernels */ +#define NR_TO_INIT 10000 + +static struct ftrace_page *ftrace_pages_start; +static struct ftrace_page *ftrace_pages; + +static int ftraced_trigger; +static int ftraced_suspend; +static int ftraced_stop; + +static int ftrace_record_suspend; + +static struct dyn_ftrace *ftrace_free_records; + +static inline int +ftrace_ip_in_hash(unsigned long ip, unsigned long key) +{ + struct dyn_ftrace *p; + struct hlist_node *t; + int found = 0; + + hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) { + if (p->ip == ip) { + found = 1; + break; + } + } + + return found; +} + +static inline void +ftrace_add_hash(struct dyn_ftrace 
*node, unsigned long key) +{ + hlist_add_head_rcu(&node->node, &ftrace_hash[key]); +} + +static void ftrace_free_rec(struct dyn_ftrace *rec) +{ + /* no locking, only called from kstop_machine */ + + rec->ip = (unsigned long)ftrace_free_records; + ftrace_free_records = rec; + rec->flags |= FTRACE_FL_FREE; +} + +static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) +{ + struct dyn_ftrace *rec; + + /* First check for freed records */ + if (ftrace_free_records) { + rec = ftrace_free_records; + + if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { + WARN_ON_ONCE(1); + ftrace_free_records = NULL; + ftrace_disabled = 1; + ftrace_enabled = 0; + return NULL; + } + + ftrace_free_records = (void *)rec->ip; + memset(rec, 0, sizeof(*rec)); + return rec; + } + + if (ftrace_pages->index == ENTRIES_PER_PAGE) { + if (!ftrace_pages->next) + return NULL; + ftrace_pages = ftrace_pages->next; + } + + return &ftrace_pages->records[ftrace_pages->index++]; +} + +static void +ftrace_record_ip(unsigned long ip) +{ + struct dyn_ftrace *node; + unsigned long flags; + unsigned long key; + int resched; + int atomic; + int cpu; + + if (!ftrace_enabled || ftrace_disabled) + return; + + resched = need_resched(); + preempt_disable_notrace(); + + /* + * We simply need to protect against recursion. + * Use the the raw version of smp_processor_id and not + * __get_cpu_var which can call debug hooks that can + * cause a recursive crash here. + */ + cpu = raw_smp_processor_id(); + per_cpu(ftrace_shutdown_disable_cpu, cpu)++; + if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1) + goto out; + + if (unlikely(ftrace_record_suspend)) + goto out; + + key = hash_long(ip, FTRACE_HASHBITS); + + WARN_ON_ONCE(key >= FTRACE_HASHSIZE); + + if (ftrace_ip_in_hash(ip, key)) + goto out; + + atomic = irqs_disabled(); + + spin_lock_irqsave(&ftrace_shutdown_lock, flags); + + /* This ip may have hit the hash before the lock */ + if (ftrace_ip_in_hash(ip, key)) + goto out_unlock; + + node = ftrace_alloc_dyn_node(ip); + if (!node) + goto out_unlock; + + node->ip = ip; + + ftrace_add_hash(node, key); + + ftraced_trigger = 1; + + out_unlock: + spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); + out: + per_cpu(ftrace_shutdown_disable_cpu, cpu)--; + + /* prevent recursion with scheduler */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +#define FTRACE_ADDR ((long)(ftrace_caller)) +#define MCOUNT_ADDR ((long)(mcount)) + +static void +__ftrace_replace_code(struct dyn_ftrace *rec, + unsigned char *old, unsigned char *new, int enable) +{ + unsigned long ip, fl; + int failed; + + ip = rec->ip; + + if (ftrace_filtered && enable) { + /* + * If filtering is on: + * + * If this record is set to be filtered and + * is enabled then do nothing. + * + * If this record is set to be filtered and + * it is not enabled, enable it. + * + * If this record is not set to be filtered + * and it is not enabled do nothing. + * + * If this record is set not to trace then + * do nothing. + * + * If this record is not set to be filtered and + * it is enabled, disable it. + */ + fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); + + if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || + (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE)) + return; + + /* + * If it is enabled disable it, + * otherwise enable it! 
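The filter rules spelled out in the comment above, condensed into a table (fl masks FTRACE_FL_FILTER and FTRACE_FL_ENABLED; FTRACE_FL_NOTRACE always wins):

/* filtering on, per record:
 *   FILTER && ENABLED   -> nothing, already being traced
 *   FILTER && !ENABLED  -> patch the call in,  set ENABLED
 *  !FILTER && ENABLED   -> patch the call out, clear ENABLED
 *  !FILTER && !ENABLED  -> nothing
 *   NOTRACE             -> nothing, never patch the call in
 */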
+ */ + if (fl == FTRACE_FL_ENABLED) { + /* swap new and old */ + new = old; + old = ftrace_call_replace(ip, FTRACE_ADDR); + rec->flags &= ~FTRACE_FL_ENABLED; + } else { + new = ftrace_call_replace(ip, FTRACE_ADDR); + rec->flags |= FTRACE_FL_ENABLED; + } + } else { + + if (enable) { + /* + * If this record is set not to trace and is + * not enabled, do nothing. + */ + fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); + if (fl == FTRACE_FL_NOTRACE) + return; + + new = ftrace_call_replace(ip, FTRACE_ADDR); + } else + old = ftrace_call_replace(ip, FTRACE_ADDR); + + if (enable) { + if (rec->flags & FTRACE_FL_ENABLED) + return; + rec->flags |= FTRACE_FL_ENABLED; + } else { + if (!(rec->flags & FTRACE_FL_ENABLED)) + return; + rec->flags &= ~FTRACE_FL_ENABLED; + } + } + + failed = ftrace_modify_code(ip, old, new); + if (failed) { + unsigned long key; + /* It is possible that the function hasn't been converted yet */ + key = hash_long(ip, FTRACE_HASHBITS); + if (!ftrace_ip_in_hash(ip, key)) { + rec->flags |= FTRACE_FL_FAILED; + ftrace_free_rec(rec); + } + + } +} + +static void ftrace_replace_code(int enable) +{ + unsigned char *new = NULL, *old = NULL; + struct dyn_ftrace *rec; + struct ftrace_page *pg; + int i; + + if (enable) + old = ftrace_nop_replace(); + else + new = ftrace_nop_replace(); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + /* don't modify code that has already faulted */ + if (rec->flags & FTRACE_FL_FAILED) + continue; + + __ftrace_replace_code(rec, old, new, enable); + } + } +} + +static void ftrace_shutdown_replenish(void) +{ + if (ftrace_pages->next) + return; + + /* allocate another page */ + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); +} + +static int +ftrace_code_disable(struct dyn_ftrace *rec) +{ + unsigned long ip; + unsigned char *nop, *call; + int failed; + + ip = rec->ip; + + nop = ftrace_nop_replace(); + call = ftrace_call_replace(ip, MCOUNT_ADDR); + + failed = ftrace_modify_code(ip, call, nop); + if (failed) { + rec->flags |= FTRACE_FL_FAILED; + ftrace_free_rec(rec); + return 0; + } + return 1; +} + +static int __ftrace_update_code(void *ignore); + +static int __ftrace_modify_code(void *data) +{ + unsigned long addr; + int *command = data; + + if (*command & FTRACE_ENABLE_CALLS) { + /* + * Update any recorded ips now that we have the + * machine stopped + */ + __ftrace_update_code(NULL); + ftrace_replace_code(1); + } else if (*command & FTRACE_DISABLE_CALLS) + ftrace_replace_code(0); + + if (*command & FTRACE_UPDATE_TRACE_FUNC) + ftrace_update_ftrace_func(ftrace_trace_function); + + if (*command & FTRACE_ENABLE_MCOUNT) { + addr = (unsigned long)ftrace_record_ip; + ftrace_mcount_set(&addr); + } else if (*command & FTRACE_DISABLE_MCOUNT) { + addr = (unsigned long)ftrace_stub; + ftrace_mcount_set(&addr); + } + + return 0; +} + +static void ftrace_run_update_code(int command) +{ + stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); +} + +void ftrace_disable_daemon(void) +{ + /* Stop the daemon from calling kstop_machine */ + mutex_lock(&ftraced_lock); + ftraced_stop = 1; + mutex_unlock(&ftraced_lock); + + ftrace_force_update(); +} + +void ftrace_enable_daemon(void) +{ + mutex_lock(&ftraced_lock); + ftraced_stop = 0; + mutex_unlock(&ftraced_lock); + + ftrace_force_update(); +} + +static ftrace_func_t saved_ftrace_func; + +static void ftrace_startup(void) +{ + int command = 0; + + if (unlikely(ftrace_disabled)) + return; + + mutex_lock(&ftraced_lock); + ftraced_suspend++; + 
if (ftraced_suspend == 1) + command |= FTRACE_ENABLE_CALLS; + + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) + goto out; + + ftrace_run_update_code(command); + out: + mutex_unlock(&ftraced_lock); +} + +static void ftrace_shutdown(void) +{ + int command = 0; + + if (unlikely(ftrace_disabled)) + return; + + mutex_lock(&ftraced_lock); + ftraced_suspend--; + if (!ftraced_suspend) + command |= FTRACE_DISABLE_CALLS; + + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) + goto out; + + ftrace_run_update_code(command); + out: + mutex_unlock(&ftraced_lock); +} + +static void ftrace_startup_sysctl(void) +{ + int command = FTRACE_ENABLE_MCOUNT; + + if (unlikely(ftrace_disabled)) + return; + + mutex_lock(&ftraced_lock); + /* Force update next time */ + saved_ftrace_func = NULL; + /* ftraced_suspend is true if we want ftrace running */ + if (ftraced_suspend) + command |= FTRACE_ENABLE_CALLS; + + ftrace_run_update_code(command); + mutex_unlock(&ftraced_lock); +} + +static void ftrace_shutdown_sysctl(void) +{ + int command = FTRACE_DISABLE_MCOUNT; + + if (unlikely(ftrace_disabled)) + return; + + mutex_lock(&ftraced_lock); + /* ftraced_suspend is true if ftrace is running */ + if (ftraced_suspend) + command |= FTRACE_DISABLE_CALLS; + + ftrace_run_update_code(command); + mutex_unlock(&ftraced_lock); +} + +static cycle_t ftrace_update_time; +static unsigned long ftrace_update_cnt; +unsigned long ftrace_update_tot_cnt; + +static int __ftrace_update_code(void *ignore) +{ + struct dyn_ftrace *p; + struct hlist_head head; + struct hlist_node *t; + int save_ftrace_enabled; + cycle_t start, stop; + int i; + + /* Don't be recording funcs now */ + ftrace_record_suspend++; + save_ftrace_enabled = ftrace_enabled; + ftrace_enabled = 0; + + start = ftrace_now(raw_smp_processor_id()); + ftrace_update_cnt = 0; + + /* No locks needed, the machine is stopped! */ + for (i = 0; i < FTRACE_HASHSIZE; i++) { + if (hlist_empty(&ftrace_hash[i])) + continue; + + head = ftrace_hash[i]; + INIT_HLIST_HEAD(&ftrace_hash[i]); + + /* all CPUS are stopped, we are safe to modify code */ + hlist_for_each_entry(p, t, &head, node) { + if (ftrace_code_disable(p)) + ftrace_update_cnt++; + } + + } + + stop = ftrace_now(raw_smp_processor_id()); + ftrace_update_time = stop - start; + ftrace_update_tot_cnt += ftrace_update_cnt; + ftraced_trigger = 0; + + ftrace_enabled = save_ftrace_enabled; + ftrace_record_suspend--; + + return 0; +} + +static int ftrace_update_code(void) +{ + if (unlikely(ftrace_disabled) || + !ftrace_enabled || !ftraced_trigger) + return 0; + + stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); + + return 1; +} + +static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) +{ + struct ftrace_page *pg; + int cnt; + int i; + + /* allocate a few pages */ + ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages_start) + return -1; + + /* + * Allocate a few more pages. + * + * TODO: have some parser search vmlinux before + * final linking to find all calls to ftrace. + * Then we can: + * a) know how many pages to allocate. + * and/or + * b) set up the table then. + * + * The dynamic code is still necessary for + * modules. 
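+ *
+ * Until then num_to_init is only an estimate; a failed page
+ * allocation below is tolerated and simply retried later.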
+ */ + + pg = ftrace_pages = ftrace_pages_start; + + cnt = num_to_init / ENTRIES_PER_PAGE; + pr_info("ftrace: allocating %ld hash entries in %d pages\n", + num_to_init, cnt); + + for (i = 0; i < cnt; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + + /* If we fail, we'll try later anyway */ + if (!pg->next) + break; + + pg = pg->next; + } + + return 0; +} + +enum { + FTRACE_ITER_FILTER = (1 << 0), + FTRACE_ITER_CONT = (1 << 1), + FTRACE_ITER_NOTRACE = (1 << 2), +}; + +#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ + +struct ftrace_iterator { + loff_t pos; + struct ftrace_page *pg; + unsigned idx; + unsigned flags; + unsigned char buffer[FTRACE_BUFF_MAX+1]; + unsigned buffer_idx; + unsigned filtered; +}; + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct dyn_ftrace *rec = NULL; + + (*pos)++; + + retry: + if (iter->idx >= iter->pg->index) { + if (iter->pg->next) { + iter->pg = iter->pg->next; + iter->idx = 0; + goto retry; + } + } else { + rec = &iter->pg->records[iter->idx++]; + if ((rec->flags & FTRACE_FL_FAILED) || + ((iter->flags & FTRACE_ITER_FILTER) && + !(rec->flags & FTRACE_FL_FILTER)) || + ((iter->flags & FTRACE_ITER_NOTRACE) && + !(rec->flags & FTRACE_FL_NOTRACE))) { + rec = NULL; + goto retry; + } + } + + iter->pos = *pos; + + return rec; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + loff_t l = -1; + + if (*pos != iter->pos) { + for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l)) + ; + } else { + l = *pos; + p = t_next(m, p, &l); + } + + return p; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static int t_show(struct seq_file *m, void *v) +{ + struct dyn_ftrace *rec = v; + char str[KSYM_SYMBOL_LEN]; + + if (!rec) + return 0; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + + seq_printf(m, "%s\n", str); + + return 0; +} + +static struct seq_operations show_ftrace_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int +ftrace_avail_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->pos = -1; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + + m->private = iter; + } else { + kfree(iter); + } + + return ret; +} + +int ftrace_avail_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_iterator *iter = m->private; + + seq_release(inode, file); + kfree(iter); + + return 0; +} + +static void ftrace_filter_reset(int enable) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + unsigned long type = enable ? 
FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; + unsigned i; + + /* keep kstop machine from running */ + preempt_disable(); + if (enable) + ftrace_filtered = 0; + pg = ftrace_pages_start; + while (pg) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + if (rec->flags & FTRACE_FL_FAILED) + continue; + rec->flags &= ~type; + } + pg = pg->next; + } + preempt_enable(); +} + +static int +ftrace_regex_open(struct inode *inode, struct file *file, int enable) +{ + struct ftrace_iterator *iter; + int ret = 0; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + mutex_lock(&ftrace_regex_lock); + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) + ftrace_filter_reset(enable); + + if (file->f_mode & FMODE_READ) { + iter->pg = ftrace_pages_start; + iter->pos = -1; + iter->flags = enable ? FTRACE_ITER_FILTER : + FTRACE_ITER_NOTRACE; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = iter; + } else + kfree(iter); + } else + file->private_data = iter; + mutex_unlock(&ftrace_regex_lock); + + return ret; +} + +static int +ftrace_filter_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(inode, file, 1); +} + +static int +ftrace_notrace_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(inode, file, 0); +} + +static ssize_t +ftrace_regex_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + if (file->f_mode & FMODE_READ) + return seq_read(file, ubuf, cnt, ppos); + else + return -EPERM; +} + +static loff_t +ftrace_regex_lseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + + if (file->f_mode & FMODE_READ) + ret = seq_lseek(file, offset, origin); + else + file->f_pos = ret = 1; + + return ret; +} + +enum { + MATCH_FULL, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, +}; + +static void +ftrace_match(unsigned char *buff, int len, int enable) +{ + char str[KSYM_SYMBOL_LEN]; + char *search = NULL; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int type = MATCH_FULL; + unsigned long flag = enable ? 
FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; + unsigned i, match = 0, search_len = 0; + + for (i = 0; i < len; i++) { + if (buff[i] == '*') { + if (!i) { + search = buff + i + 1; + type = MATCH_END_ONLY; + search_len = len - (i + 1); + } else { + if (type == MATCH_END_ONLY) { + type = MATCH_MIDDLE_ONLY; + } else { + match = i; + type = MATCH_FRONT_ONLY; + } + buff[i] = 0; + break; + } + } + } + + /* keep kstop machine from running */ + preempt_disable(); + if (enable) + ftrace_filtered = 1; + pg = ftrace_pages_start; + while (pg) { + for (i = 0; i < pg->index; i++) { + int matched = 0; + char *ptr; + + rec = &pg->records[i]; + if (rec->flags & FTRACE_FL_FAILED) + continue; + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + switch (type) { + case MATCH_FULL: + if (strcmp(str, buff) == 0) + matched = 1; + break; + case MATCH_FRONT_ONLY: + if (memcmp(str, buff, match) == 0) + matched = 1; + break; + case MATCH_MIDDLE_ONLY: + if (strstr(str, search)) + matched = 1; + break; + case MATCH_END_ONLY: + ptr = strstr(str, search); + if (ptr && (ptr[search_len] == 0)) + matched = 1; + break; + } + if (matched) + rec->flags |= flag; + } + pg = pg->next; + } + preempt_enable(); +} + +static ssize_t +ftrace_regex_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos, int enable) +{ + struct ftrace_iterator *iter; + char ch; + size_t read = 0; + ssize_t ret; + + if (!cnt || cnt < 0) + return 0; + + mutex_lock(&ftrace_regex_lock); + + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + iter = m->private; + } else + iter = file->private_data; + + if (!*ppos) { + iter->flags &= ~FTRACE_ITER_CONT; + iter->buffer_idx = 0; + } + + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + + if (!(iter->flags & ~FTRACE_ITER_CONT)) { + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + if (isspace(ch)) { + file->f_pos += read; + ret = read; + goto out; + } + + iter->buffer_idx = 0; + } + + while (cnt && !isspace(ch)) { + if (iter->buffer_idx < FTRACE_BUFF_MAX) + iter->buffer[iter->buffer_idx++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + if (isspace(ch)) { + iter->filtered++; + iter->buffer[iter->buffer_idx] = 0; + ftrace_match(iter->buffer, iter->buffer_idx, enable); + iter->buffer_idx = 0; + } else + iter->flags |= FTRACE_ITER_CONT; + + + file->f_pos += read; + + ret = read; + out: + mutex_unlock(&ftrace_regex_lock); + + return ret; +} + +static ssize_t +ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 1); +} + +static ssize_t +ftrace_notrace_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 0); +} + +static void +ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) +{ + if (unlikely(ftrace_disabled)) + return; + + mutex_lock(&ftrace_regex_lock); + if (reset) + ftrace_filter_reset(enable); + if (buf) + ftrace_match(buf, len, enable); + mutex_unlock(&ftrace_regex_lock); +} + +/** + * ftrace_set_filter - set a function to filter on in ftrace + * @buf - the string that holds the function filter text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. 
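+ * A single '*' wildcard is honoured: "sys_*" matches names starting
+ * with "sys_", "*_read" names ending in "_read" and "*futex*" names
+ * containing "futex" (see ftrace_match()).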
+ * If @buf is NULL and reset is set, all functions will be enabled for tracing. + */ +void ftrace_set_filter(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(buf, len, reset, 1); +} + +/** + * ftrace_set_notrace - set a function to not trace in ftrace + * @buf - the string that holds the function notrace text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Notrace Filters denote which functions should not be enabled when tracing + * is enabled. If @buf is NULL and reset is set, all functions will be enabled + * for tracing. + */ +void ftrace_set_notrace(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(buf, len, reset, 0); +} + +static int +ftrace_regex_release(struct inode *inode, struct file *file, int enable) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_iterator *iter; + + mutex_lock(&ftrace_regex_lock); + if (file->f_mode & FMODE_READ) { + iter = m->private; + + seq_release(inode, file); + } else + iter = file->private_data; + + if (iter->buffer_idx) { + iter->filtered++; + iter->buffer[iter->buffer_idx] = 0; + ftrace_match(iter->buffer, iter->buffer_idx, enable); + } + + mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftraced_lock); + if (iter->filtered && ftraced_suspend && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); + + kfree(iter); + mutex_unlock(&ftrace_regex_lock); + return 0; +} + +static int +ftrace_filter_release(struct inode *inode, struct file *file) +{ + return ftrace_regex_release(inode, file, 1); +} + +static int +ftrace_notrace_release(struct inode *inode, struct file *file) +{ + return ftrace_regex_release(inode, file, 0); +} + +static ssize_t +ftraced_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + /* don't worry about races */ + char *buf = ftraced_stop ? 
"disabled\n" : "enabled\n"; + int r = strlen(buf); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +ftraced_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + if (strncmp(buf, "enable", 6) == 0) + val = 1; + else if (strncmp(buf, "disable", 7) == 0) + val = 0; + else { + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + } + + if (val) + ftrace_enable_daemon(); + else + ftrace_disable_daemon(); + + filp->f_pos += cnt; + + return cnt; +} + +static struct file_operations ftrace_avail_fops = { + .open = ftrace_avail_open, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_avail_release, +}; + +static struct file_operations ftrace_filter_fops = { + .open = ftrace_filter_open, + .read = ftrace_regex_read, + .write = ftrace_filter_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_filter_release, +}; + +static struct file_operations ftrace_notrace_fops = { + .open = ftrace_notrace_open, + .read = ftrace_regex_read, + .write = ftrace_notrace_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_notrace_release, +}; + +static struct file_operations ftraced_fops = { + .open = tracing_open_generic, + .read = ftraced_read, + .write = ftraced_write, +}; + +/** + * ftrace_force_update - force an update to all recording ftrace functions + */ +int ftrace_force_update(void) +{ + int ret = 0; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftraced_lock); + + /* + * If ftraced_trigger is not set, then there is nothing + * to update. 
+ */ + if (ftraced_trigger && !ftrace_update_code()) + ret = -EBUSY; + + mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); + + return ret; +} + +static void ftrace_force_shutdown(void) +{ + struct task_struct *task; + int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC; + + mutex_lock(&ftraced_lock); + task = ftraced_task; + ftraced_task = NULL; + ftraced_suspend = -1; + ftrace_run_update_code(command); + mutex_unlock(&ftraced_lock); + + if (task) + kthread_stop(task); +} + +static __init int ftrace_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("available_filter_functions", 0444, + d_tracer, NULL, &ftrace_avail_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'available_filter_functions' entry\n"); + + entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, + NULL, &ftrace_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_filter' entry\n"); + + entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, + NULL, &ftrace_notrace_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_notrace' entry\n"); + + entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer, + NULL, &ftraced_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'ftraced_enabled' entry\n"); + return 0; +} + +fs_initcall(ftrace_init_debugfs); + +#ifdef CONFIG_FTRACE_MCOUNT_RECORD +static int ftrace_convert_nops(unsigned long *start, + unsigned long *end) +{ + unsigned long *p; + unsigned long addr; + unsigned long flags; + + p = start; + while (p < end) { + addr = ftrace_call_adjust(*p++); + ftrace_record_ip(addr); + ftrace_shutdown_replenish(); + } + + /* p is ignored */ + local_irq_save(flags); + __ftrace_update_code(p); + local_irq_restore(flags); + + return 0; +} + +void ftrace_init_module(unsigned long *start, unsigned long *end) +{ + ftrace_convert_nops(start, end); +} + +extern unsigned long __start_mcount_loc[]; +extern unsigned long __stop_mcount_loc[]; + +void __init ftrace_init(void) +{ + unsigned long count, addr, flags; + int ret; + + /* Keep the ftrace pointer to the stub */ + addr = (unsigned long)ftrace_stub; + + local_irq_save(flags); + ftrace_dyn_arch_init(&addr); + local_irq_restore(flags); + + /* ftrace_dyn_arch_init places the return code in addr */ + if (addr) + goto failed; + + count = __stop_mcount_loc - __start_mcount_loc; + + ret = ftrace_dyn_table_alloc(count); + if (ret) + goto failed; + + last_ftrace_enabled = ftrace_enabled = 1; + + ret = ftrace_convert_nops(__start_mcount_loc, + __stop_mcount_loc); + + return; + failed: + ftrace_disabled = 1; +} +#else /* CONFIG_FTRACE_MCOUNT_RECORD */ +static int ftraced(void *ignore) +{ + unsigned long usecs; + + while (!kthread_should_stop()) { + + set_current_state(TASK_INTERRUPTIBLE); + + /* check once a second */ + schedule_timeout(HZ); + + if (unlikely(ftrace_disabled)) + continue; + + mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftraced_lock); + if (!ftraced_suspend && !ftraced_stop && + ftrace_update_code()) { + usecs = nsecs_to_usecs(ftrace_update_time); + if (ftrace_update_tot_cnt > 100000) { + ftrace_update_tot_cnt = 0; + pr_info("hm, dftrace overflow: %lu change%s" + " (%lu total) in %lu usec%s\n", + ftrace_update_cnt, + ftrace_update_cnt != 1 ? "s" : "", + ftrace_update_tot_cnt, + usecs, usecs != 1 ? 
"s" : ""); + ftrace_disabled = 1; + WARN_ON_ONCE(1); + } + } + mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); + + ftrace_shutdown_replenish(); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int __init ftrace_dynamic_init(void) +{ + struct task_struct *p; + unsigned long addr; + int ret; + + addr = (unsigned long)ftrace_record_ip; + + stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); + + /* ftrace_dyn_arch_init places the return code in addr */ + if (addr) { + ret = (int)addr; + goto failed; + } + + ret = ftrace_dyn_table_alloc(NR_TO_INIT); + if (ret) + goto failed; + + p = kthread_run(ftraced, NULL, "ftraced"); + if (IS_ERR(p)) { + ret = -1; + goto failed; + } + + last_ftrace_enabled = ftrace_enabled = 1; + ftraced_task = p; + + return 0; + + failed: + ftrace_disabled = 1; + return ret; +} + +core_initcall(ftrace_dynamic_init); +#endif /* CONFIG_FTRACE_MCOUNT_RECORD */ + +#else +# define ftrace_startup() do { } while (0) +# define ftrace_shutdown() do { } while (0) +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) +# define ftrace_force_shutdown() do { } while (0) +#endif /* CONFIG_DYNAMIC_FTRACE */ + +/** + * ftrace_kill - totally shutdown ftrace + * + * This is a safety measure. If something was detected that seems + * wrong, calling this function will keep ftrace from doing + * any more modifications, and updates. + * used when something went wrong. + */ +void ftrace_kill(void) +{ + mutex_lock(&ftrace_sysctl_lock); + ftrace_disabled = 1; + ftrace_enabled = 0; + + clear_ftrace_function(); + mutex_unlock(&ftrace_sysctl_lock); + + /* Try to totally disable ftrace */ + ftrace_force_shutdown(); +} + +/** + * __ftrace_kill - shutdown ftrace in a mean fashion + * + * In case of system failure we want to stop ftrace as soon as + * possible. This is like ftrace_kill but does not grab the + * mutexes nor does it call the kstop machine. + * + * This one is save to use in atomic. + */ +void __ftrace_kill(void) +{ + ftrace_disabled = 1; + ftrace_enabled = 0; + + clear_ftrace_function(); +} + +/** + * register_ftrace_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. + * + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. + */ +int register_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + if (unlikely(ftrace_disabled)) + return -1; + + mutex_lock(&ftrace_sysctl_lock); + ret = __register_ftrace_function(ops); + ftrace_startup(); + mutex_unlock(&ftrace_sysctl_lock); + + return ret; +} + +/** + * unregister_ftrace_function - unresgister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. 
+ */ +int unregister_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + mutex_lock(&ftrace_sysctl_lock); + ret = __unregister_ftrace_function(ops); + ftrace_shutdown(); + mutex_unlock(&ftrace_sysctl_lock); + + return ret; +} + +int +ftrace_enable_sysctl(struct ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + mutex_lock(&ftrace_sysctl_lock); + + ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + + if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) + goto out; + + last_ftrace_enabled = ftrace_enabled; + + if (ftrace_enabled) { + + ftrace_startup_sysctl(); + + /* we are starting ftrace again */ + if (ftrace_list != &ftrace_list_end) { + if (ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + else + ftrace_trace_function = ftrace_list_func; + } + + } else { + /* stopping ftrace calls (just send to ftrace_stub) */ + ftrace_trace_function = ftrace_stub; + + ftrace_shutdown_sysctl(); + } + + out: + mutex_unlock(&ftrace_sysctl_lock); + return ret; +} Index: linux-2.6.24.7-rt27/kernel/trace/trace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/trace/trace.c 2009-02-08 00:05:24.000000000 -0500 @@ -0,0 +1,3516 @@ +/* + * ring buffer based function tracer + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * Originally taken from the RT patch by: + * Arnaldo Carvalho de Melo + * + * Based on code from the latency_tracer, that is: + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "trace.h" + +unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; +unsigned long __read_mostly tracing_thresh; + +static unsigned long __read_mostly tracing_nr_buffers; +static cpumask_t __read_mostly tracing_buffer_mask; + +#define for_each_cpu_mask_nr(cpu, mask) for_each_cpu_mask(cpu, mask) +#define for_each_tracing_cpu(cpu) \ + for_each_cpu_mask_nr(cpu, tracing_buffer_mask) + +/* dummy trace to disable tracing */ +static struct tracer no_tracer __read_mostly = { + .name = "none", +}; + +static int trace_alloc_page(void); +static int trace_free_page(void); + +static int tracing_disabled = 1; + +static unsigned long tracing_pages_allocated; + +long +ns2usecs(cycle_t nsec) +{ + nsec += 500; + do_div(nsec, 1000); + return nsec; +} + +cycle_t ftrace_now(int cpu) +{ +// return cpu_clock(cpu); + return sched_clock(); +} + +/* + * The global_trace is the descriptor that holds the tracing + * buffers for the live tracing. For each CPU, it contains + * a link list of pages that will store trace entries. The + * page descriptor of the pages in the memory is used to hold + * the link list by linking the lru item in the page descriptor + * to each of the pages in the buffer per CPU. + * + * For each active CPU there is a data field that holds the + * pages for the buffer for that CPU. Each CPU has the same number + * of pages allocated for its buffer. + */ +static struct trace_array global_trace; + +static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); + +/* + * The max_tr is used to snapshot the global_trace when a maximum + * latency is reached. 
Some tracers will use this to store a maximum
+ * trace while it continues examining live traces.
+ *
+ * The buffers for the max_tr are set up the same as the global_trace.
+ * When a snapshot is taken, the link list of the max_tr is swapped
+ * with the link list of the global_trace and the buffers are reset for
+ * the global_trace so the tracing can continue.
+ */
+static struct trace_array	max_tr;
+
+static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
+
+/* tracer_enabled is used to toggle activation of a tracer */
+static int			tracer_enabled = 1;
+
+/*
+ * trace_nr_entries is the number of entries that is allocated
+ * for a buffer. Note, the number of entries is always rounded
+ * to ENTRIES_PER_PAGE.
+ */
+static unsigned long		trace_nr_entries = 65536UL;
+
+/* trace_types holds a link list of available tracers. */
+static struct tracer		*trace_types __read_mostly;
+
+/* current_trace points to the tracer that is currently active */
+static struct tracer		*current_trace __read_mostly;
+
+/*
+ * max_tracer_type_len is used to simplify the allocating of
+ * buffers to read userspace tracer names. We keep track of
+ * the longest tracer name registered.
+ */
+static int			max_tracer_type_len;
+
+/*
+ * trace_types_lock is used to protect the trace_types list.
+ * This lock is also used to keep user access serialized.
+ * Accesses from userspace will grab this lock while userspace
+ * activities happen inside the kernel.
+ */
+static DEFINE_MUTEX(trace_types_lock);
+
+/* trace_wait is a waitqueue for tasks blocked on trace_poll */
+static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
+
+/* trace_flags holds iter_ctrl options */
+unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
+
+/**
+ * trace_wake_up - wake up tasks waiting for trace input
+ *
+ * Simply wakes up any task that is blocked on the trace_wait
+ * queue. This is used with trace_poll for tasks polling the trace.
+ */
+void trace_wake_up(void)
+{
+	/*
+	 * The runqueue_is_locked() can fail, but this is the best we
+	 * have for now:
+	 */
+	if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked())
+#ifdef CONFIG_PREEMPT_RT
+		if (!irqs_disabled())
+#endif
+			wake_up(&trace_wait);
+}
+
+#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry))
+
+static int __init set_nr_entries(char *str)
+{
+	unsigned long nr_entries;
+	int ret;
+
+	if (!str)
+		return 0;
+	ret = strict_strtoul(str, 0, &nr_entries);
+	/* nr_entries cannot be zero */
+	if (ret < 0 || nr_entries == 0)
+		return 0;
+	trace_nr_entries = nr_entries;
+	return 1;
+}
+__setup("trace_entries=", set_nr_entries);
+
+unsigned long nsecs_to_usecs(unsigned long nsecs)
+{
+	return nsecs / 1000;
+}
+
+/*
+ * trace_flag_type is an enumeration that holds different
+ * states when a trace occurs. These are:
+ *  IRQS_OFF	- interrupts were disabled
+ *  NEED_RESCHED - reschedule is requested
+ *  HARDIRQ	- inside an interrupt handler
+ *  SOFTIRQ	- inside a softirq handler
+ */
+enum trace_flag_type {
+	TRACE_FLAG_IRQS_OFF		= 0x01,
+	TRACE_FLAG_NEED_RESCHED		= 0x02,
+	TRACE_FLAG_HARDIRQ		= 0x04,
+	TRACE_FLAG_SOFTIRQ		= 0x08,
+};
+
+/*
+ * TRACE_ITER_SYM_MASK masks the options in trace_flags that
+ * control the output of kernel symbols.
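+ * (print-parent, sym-offset and sym-addr in trace_options[] below).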
+ */
+#define TRACE_ITER_SYM_MASK \
+	(TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
+
+/* These must match the bit positions in trace_iterator_flags */
+static const char *trace_options[] = {
+	"print-parent",
+	"sym-offset",
+	"sym-addr",
+	"verbose",
+	"raw",
+	"hex",
+	"bin",
+	"block",
+	"stacktrace",
+	"sched-tree",
+	NULL
+};
+
+/*
+ * ftrace_max_lock is used to protect the swapping of buffers
+ * when taking a max snapshot. The buffers themselves are
+ * protected by per_cpu spinlocks. But the action of the swap
+ * needs its own lock.
+ *
+ * This is defined as a raw_spinlock_t in order to help
+ * with performance when lockdep debugging is enabled.
+ */
+static __raw_spinlock_t ftrace_max_lock =
+	(__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure. (this way the maximum trace is permanently saved,
+ * for later retrieval via /debugfs/tracing/latency_trace)
+ */
+static void
+__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+	struct trace_array_cpu *data = tr->data[cpu];
+
+	max_tr.cpu = cpu;
+	max_tr.time_start = data->preempt_timestamp;
+
+	data = max_tr.data[cpu];
+	data->saved_latency = tracing_max_latency;
+
+	memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
+	data->pid = tsk->pid;
+	data->uid = tsk->uid;
+	data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
+	data->policy = tsk->policy;
+	data->rt_priority = tsk->rt_priority;
+
+	/* record this task's comm */
+	tracing_record_cmdline(current);
+}
+
+#define CHECK_COND(cond)			\
+	if (unlikely(cond)) {			\
+		tracing_disabled = 1;		\
+		WARN_ON(1);			\
+		return -1;			\
+	}
+
+/**
+ * check_pages - integrity check of trace buffers
+ *
+ * As a safety measure we check to make sure the data pages have not
+ * been corrupted.
+ */
+int check_pages(struct trace_array_cpu *data)
+{
+	struct page *page, *tmp;
+
+	CHECK_COND(data->trace_pages.next->prev != &data->trace_pages);
+	CHECK_COND(data->trace_pages.prev->next != &data->trace_pages);
+
+	list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) {
+		CHECK_COND(page->lru.next->prev != &page->lru);
+		CHECK_COND(page->lru.prev->next != &page->lru);
+	}
+
+	return 0;
+}
+
+/**
+ * head_page - page address of the first page in per_cpu buffer.
+ *
+ * head_page returns the page address of the first page in
+ * a per_cpu buffer. This also performs various consistency
+ * checks to make sure the buffer has not been corrupted.
+ */
+void *head_page(struct trace_array_cpu *data)
+{
+	struct page *page;
+
+	if (list_empty(&data->trace_pages))
+		return NULL;
+
+	page = list_entry(data->trace_pages.next, struct page, lru);
+	BUG_ON(&page->lru == &data->trace_pages);
+
+	return page_address(page);
+}
+
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formatting of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{ + int len = (PAGE_SIZE - 1) - s->len; + va_list ap; + int ret; + + if (!len) + return 0; + + va_start(ap, fmt); + ret = vsnprintf(s->buffer + s->len, len, fmt, ap); + va_end(ap); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) + return 0; + + s->len += ret; + + return len; +} + +/** + * trace_seq_puts - trace sequence printing of simple string + * @s: trace sequence descriptor + * @str: simple string to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple string + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + */ +static int +trace_seq_puts(struct trace_seq *s, const char *str) +{ + int len = strlen(str); + + if (len > ((PAGE_SIZE - 1) - s->len)) + return 0; + + memcpy(s->buffer + s->len, str, len); + s->len += len; + + return len; +} + +static int +trace_seq_putc(struct trace_seq *s, unsigned char c) +{ + if (s->len >= (PAGE_SIZE - 1)) + return 0; + + s->buffer[s->len++] = c; + + return 1; +} + +static int +trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) +{ + if (len > ((PAGE_SIZE - 1) - s->len)) + return 0; + + memcpy(s->buffer + s->len, mem, len); + s->len += len; + + return len; +} + +#define HEX_CHARS 17 +static const char hex2asc[] = "0123456789abcdef"; + +static int +trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) +{ + unsigned char hex[HEX_CHARS]; + unsigned char *data = mem; + unsigned char byte; + int i, j; + + BUG_ON(len >= HEX_CHARS); + +#ifdef __BIG_ENDIAN + for (i = 0, j = 0; i < len; i++) { +#else + for (i = len-1, j = 0; i >= 0; i--) { +#endif + byte = data[i]; + + hex[j++] = hex2asc[byte & 0x0f]; + hex[j++] = hex2asc[byte >> 4]; + } + hex[j++] = ' '; + + return trace_seq_putmem(s, hex, j); +} + +static void +trace_seq_reset(struct trace_seq *s) +{ + s->len = 0; + s->readpos = 0; +} + +ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) +{ + int len; + int ret; + + if (s->len <= s->readpos) + return -EBUSY; + + len = s->len - s->readpos; + if (cnt > len) + cnt = len; + ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); + if (ret) + return -EFAULT; + + s->readpos += len; + return cnt; +} + +static void +trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ + int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; + + s->buffer[len] = 0; + seq_puts(m, s->buffer); + + trace_seq_reset(s); +} + +/* + * flip the trace buffers between two trace descriptors. + * This usually is the buffers between the global_trace and + * the max_tr to record a snapshot of a current trace. + * + * The ftrace_max_lock must be held. + */ +static void +flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) +{ + struct list_head flip_pages; + + INIT_LIST_HEAD(&flip_pages); + + memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx, + sizeof(struct trace_array_cpu) - + offsetof(struct trace_array_cpu, trace_head_idx)); + + check_pages(tr1); + check_pages(tr2); + list_splice_init(&tr1->trace_pages, &flip_pages); + list_splice_init(&tr2->trace_pages, &tr1->trace_pages); + list_splice_init(&flip_pages, &tr2->trace_pages); + BUG_ON(!list_empty(&flip_pages)); + check_pages(tr1); + check_pages(tr2); +} + +/** + * update_max_tr - snapshot all trace buffers from global_trace to max_tr + * @tr: tracer + * @tsk: the task with the latency + * @cpu: The cpu that initiated the trace. 
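+ *
+ * The caller must have interrupts disabled; ftrace_max_lock serializes
+ * the buffer swap itself.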
+ * + * Flip the buffers between the @tr and the max_tr and record information + * about which task was the cause of this latency. + */ +void +update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data; + int i; + + WARN_ON_ONCE(!irqs_disabled()); + __raw_spin_lock(&ftrace_max_lock); + /* clear out all the previous traces */ + for_each_tracing_cpu(i) { + data = tr->data[i]; + flip_trace(max_tr.data[i], data); + tracing_reset(data); + } + + __update_max_tr(tr, tsk, cpu); + __raw_spin_unlock(&ftrace_max_lock); +} + +/** + * update_max_tr_single - only copy one trace over, and reset the rest + * @tr - tracer + * @tsk - task with the latency + * @cpu - the cpu of the buffer to copy. + * + * Flip the trace of a single CPU buffer between the @tr and the max_tr. + */ +void +update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data = tr->data[cpu]; + int i; + + WARN_ON_ONCE(!irqs_disabled()); + __raw_spin_lock(&ftrace_max_lock); + for_each_tracing_cpu(i) + tracing_reset(max_tr.data[i]); + + flip_trace(max_tr.data[cpu], data); + tracing_reset(data); + + __update_max_tr(tr, tsk, cpu); + __raw_spin_unlock(&ftrace_max_lock); +} + +/** + * register_tracer - register a tracer with the ftrace system. + * @type - the plugin for the tracer + * + * Register a new plugin tracer. + */ +int register_tracer(struct tracer *type) +{ + struct tracer *t; + int len; + int ret = 0; + + if (!type->name) { + pr_info("Tracer must have a name\n"); + return -1; + } + + mutex_lock(&trace_types_lock); + for (t = trace_types; t; t = t->next) { + if (strcmp(type->name, t->name) == 0) { + /* already found */ + pr_info("Trace %s already registered\n", + type->name); + ret = -1; + goto out; + } + } + +#ifdef CONFIG_FTRACE_STARTUP_TEST + if (type->selftest) { + struct tracer *saved_tracer = current_trace; + struct trace_array_cpu *data; + struct trace_array *tr = &global_trace; + int saved_ctrl = tr->ctrl; + int i; + /* + * Run a selftest on this tracer. + * Here we reset the trace buffer, and set the current + * tracer to be this tracer. The tracer can then run some + * internal tracing to verify that everything is in order. + * If we fail, we do not register this tracer. 
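+ * The selftest runs with tr->ctrl cleared; the test itself is expected
+ * to enable tracing, verify the result and reset the tracer.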
+ */ + for_each_tracing_cpu(i) { + data = tr->data[i]; + if (!head_page(data)) + continue; + tracing_reset(data); + } + current_trace = type; + tr->ctrl = 0; + /* the test is responsible for initializing and enabling */ + pr_info("Testing tracer %s: ", type->name); + ret = type->selftest(type, tr); + /* the test is responsible for resetting too */ + current_trace = saved_tracer; + tr->ctrl = saved_ctrl; + if (ret) { + printk(KERN_CONT "FAILED!\n"); + goto out; + } + /* Only reset on passing, to avoid touching corrupted buffers */ + for_each_tracing_cpu(i) { + data = tr->data[i]; + if (!head_page(data)) + continue; + tracing_reset(data); + } + printk(KERN_CONT "PASSED\n"); + } +#endif + + type->next = trace_types; + trace_types = type; + len = strlen(type->name); + if (len > max_tracer_type_len) + max_tracer_type_len = len; + + out: + mutex_unlock(&trace_types_lock); + + return ret; +} + +void unregister_tracer(struct tracer *type) +{ + struct tracer **t; + int len; + + mutex_lock(&trace_types_lock); + for (t = &trace_types; *t; t = &(*t)->next) { + if (*t == type) + goto found; + } + pr_info("Trace %s not registered\n", type->name); + goto out; + + found: + *t = (*t)->next; + if (strlen(type->name) != max_tracer_type_len) + goto out; + + max_tracer_type_len = 0; + for (t = &trace_types; *t; t = &(*t)->next) { + len = strlen((*t)->name); + if (len > max_tracer_type_len) + max_tracer_type_len = len; + } + out: + mutex_unlock(&trace_types_lock); +} + +void tracing_reset(struct trace_array_cpu *data) +{ + data->trace_idx = 0; + data->overrun = 0; + data->trace_head = data->trace_tail = head_page(data); + data->trace_head_idx = 0; + data->trace_tail_idx = 0; +} + +#define SAVED_CMDLINES 128 +static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; +static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; +static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; +static int cmdline_idx; +static DEFINE_RAW_SPINLOCK(trace_cmdline_lock); + +/* temporary disable recording */ +atomic_t trace_record_cmdline_disabled __read_mostly; + +static void trace_init_cmdlines(void) +{ + memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline)); + memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid)); + cmdline_idx = 0; +} + +void trace_stop_cmdline_recording(void); + +static void trace_save_cmdline(struct task_struct *tsk) +{ + unsigned map; + unsigned idx; + + if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) + return; + + /* + * It's not the end of the world if we don't get + * the lock, but we also don't want to spin + * nor do we want to disable interrupts, + * so if we miss here, then better luck next time. 
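+ *
+ * map_pid_to_cmdline[] and map_cmdline_to_pid[] form a two-way map so
+ * that a stale pid slot can be invalidated when the cmdline ring wraps.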
+ */ + if (!spin_trylock(&trace_cmdline_lock)) + return; + + /* from the pid, find the index of the cmdline array */ + idx = map_pid_to_cmdline[tsk->pid]; + + if (idx >= SAVED_CMDLINES) { + /* this is new */ + idx = (cmdline_idx + 1) % SAVED_CMDLINES; + + /* check the reverse map and reset it if needed */ + map = map_cmdline_to_pid[idx]; + if (map <= PID_MAX_DEFAULT) + map_pid_to_cmdline[map] = (unsigned)-1; + + map_cmdline_to_pid[idx] = tsk->pid; + map_pid_to_cmdline[tsk->pid] = idx; + + cmdline_idx = idx; + } + + memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); + + spin_unlock(&trace_cmdline_lock); +} + +static char *trace_find_cmdline(int pid) +{ + char *cmdline = "<...>"; + unsigned map; + + if (!pid) + return ""; + + if (pid > PID_MAX_DEFAULT) + goto out; + + map = map_pid_to_cmdline[pid]; + if (map >= SAVED_CMDLINES) + goto out; + + if (map_cmdline_to_pid[map] != pid) + goto out; + + cmdline = saved_cmdlines[map]; + + out: + return cmdline; +} + +void tracing_record_cmdline(struct task_struct *tsk) +{ + if (atomic_read(&trace_record_cmdline_disabled)) + return; + + trace_save_cmdline(tsk); +} + +static inline struct list_head * +trace_next_list(struct trace_array_cpu *data, struct list_head *next) +{ + /* + * Roundrobin - but skip the head (which is not a real page): + */ + next = next->next; + if (unlikely(next == &data->trace_pages)) + next = next->next; + BUG_ON(next == &data->trace_pages); + + return next; +} + +static inline void * +trace_next_page(struct trace_array_cpu *data, void *addr) +{ + struct list_head *next; + struct page *page; + + page = virt_to_page(addr); + + next = trace_next_list(data, &page->lru); + page = list_entry(next, struct page, lru); + + return page_address(page); +} + +static inline struct trace_entry * +tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) +{ + unsigned long idx, idx_next; + struct trace_entry *entry; + + data->trace_idx++; + idx = data->trace_head_idx; + idx_next = idx + 1; + + BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE); + + entry = data->trace_head + idx * TRACE_ENTRY_SIZE; + + if (unlikely(idx_next >= ENTRIES_PER_PAGE)) { + data->trace_head = trace_next_page(data, data->trace_head); + idx_next = 0; + } + + if (data->trace_head == data->trace_tail && + idx_next == data->trace_tail_idx) { + /* overrun */ + data->overrun++; + data->trace_tail_idx++; + if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { + data->trace_tail = + trace_next_page(data, data->trace_tail); + data->trace_tail_idx = 0; + } + } + + data->trace_head_idx = idx_next; + + return entry; +} + +static inline void +tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, + unsigned long pc) +{ + struct task_struct *tsk = current; + + entry->preempt_count = pc & 0xff; + entry->pid = (tsk) ? tsk->pid : 0; + entry->t = ftrace_now(raw_smp_processor_id()); + entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | + ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | + ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + (need_resched() ? 
TRACE_FLAG_NEED_RESCHED : 0); +} + +void +trace_function(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long ip, unsigned long parent_ip, unsigned long flags, + unsigned long pc) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags, pc); + entry->type = TRACE_FN; + entry->fn.ip = ip; + entry->fn.parent_ip = parent_ip; + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); +} + +void +ftrace(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long ip, unsigned long parent_ip, unsigned long flags) +{ + if (likely(!atomic_read(&data->disabled))) + trace_function(tr, data, ip, parent_ip, flags, + preempt_count()); +} + +#ifdef CONFIG_MMIOTRACE +void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, + struct mmiotrace_rw *rw) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0, preempt_count()); + entry->type = TRACE_MMIO_RW; + entry->mmiorw = *rw; + + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); + + trace_wake_up(); +} + +void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, + struct mmiotrace_map *map) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0, preempt_count()); + entry->type = TRACE_MMIO_MAP; + entry->mmiomap = *map; + + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); + + trace_wake_up(); +} +#endif + +void __trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip) +{ + struct trace_entry *entry; + struct stack_trace trace; + + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags, preempt_count()); + entry->type = TRACE_STACK; + + memset(&entry->stack, 0, sizeof(entry->stack)); + + trace.nr_entries = 0; + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.skip = skip; + trace.entries = entry->stack.caller; + + save_stack_trace(&trace); +} + +void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + struct trace_array_cpu *data = __data; + struct trace_array *tr = __tr; + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0, preempt_count()); + entry->type = TRACE_SPECIAL; + entry->special.arg1 = arg1; + entry->special.arg2 = arg2; + entry->special.arg3 = arg3; + __trace_stack(tr, data, irq_flags, 4); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); + + trace_wake_up(); +} + +void +tracing_sched_switch_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags, preempt_count()); + entry->type = TRACE_CTX; + 
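/* record both halves of the switch: prev is scheduled out, next in */ +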
entry->ctx.prev_pid = prev->pid; + entry->ctx.prev_prio = prev->prio; + entry->ctx.prev_state = prev->state; + entry->ctx.next_pid = next->pid; + entry->ctx.next_prio = next->prio; + entry->ctx.next_state = next->state; + __trace_stack(tr, data, flags, 5); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); +} + +void +tracing_sched_wakeup_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *wakee, + struct task_struct *curr, + unsigned long flags) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags, preempt_count()); + entry->type = TRACE_WAKE; + entry->ctx.prev_pid = curr->pid; + entry->ctx.prev_prio = curr->prio; + entry->ctx.prev_state = curr->state; + entry->ctx.next_pid = wakee->pid; + entry->ctx.next_prio = wakee->prio; + entry->ctx.next_state = wakee->state; + __trace_stack(tr, data, flags, 6); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); + + trace_wake_up(); +} + +void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + __trace_special(tr, data, arg1, arg2, arg3); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +void tracing_event_irq(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + int irq, int usermode, + unsigned long retip) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags, preempt_count()); + entry->type = TRACE_IRQ; + entry->irq.ip = ip; + entry->irq.irq = irq; + entry->irq.ret_ip = retip; + entry->irq.usermode = usermode; +} + +void tracing_event_fault(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + unsigned long retip, + unsigned long error_code, + unsigned long address) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags, preempt_count()); + entry->type = TRACE_FAULT; + entry->fault.ip = ip; + entry->fault.ret_ip = retip; + entry->fault.errorcode = error_code; + entry->fault.address = address; +} + +void tracing_event_timer_set(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + ktime_t *expires, void *timer) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags, preempt_count()); + entry->type = TRACE_TIMER_SET; + entry->timer.ip = ip; + entry->timer.expire = *expires; + entry->timer.timer = timer; +} + +void tracing_event_program_event(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + ktime_t *expires, int64_t *delta) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags, preempt_count()); + entry->type = TRACE_PROGRAM_EVENT; + entry->program.ip = ip; + entry->program.expire = *expires; + entry->program.delta = *delta; +} + +void 
tracing_event_timer_triggered(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + ktime_t *expired, void *timer) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags, preempt_count()); + entry->type = TRACE_TIMER_TRIG; + entry->timer.ip = ip; + entry->timer.expire = *expired; + entry->timer.timer = timer; +} + +void tracing_event_timestamp(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + ktime_t *now) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags, preempt_count()); + entry->type = TRACE_TIMESTAMP; + entry->timestamp.ip = ip; + entry->timestamp.now = *now; +} + +void tracing_event_task_activate(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + struct task_struct *p, + int task_cpu) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags, preempt_count()); + entry->type = TRACE_TASK_ACT; + entry->task.ip = ip; + entry->task.pid = p->pid; + entry->task.prio = p->prio; + entry->task.cpu = task_cpu; +} + +void tracing_event_task_deactivate(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + struct task_struct *p, + int task_cpu) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags, preempt_count()); + entry->type = TRACE_TASK_DEACT; + entry->task.ip = ip; + entry->task.pid = p->pid; + entry->task.prio = p->prio; + entry->task.cpu = task_cpu; +} + +void tracing_event_syscall(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + unsigned long nr, + unsigned long p1, + unsigned long p2, + unsigned long p3) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags, preempt_count()); + entry->type = TRACE_SYSCALL; + entry->syscall.ip = ip; + entry->syscall.nr = nr; + entry->syscall.p1 = p1; + entry->syscall.p2 = p2; + entry->syscall.p3 = p3; +} + +void tracing_event_sysret(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + unsigned long ret) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags, preempt_count()); + entry->type = TRACE_SYSRET; + entry->sysret.ip = ip; + entry->sysret.ret = ret; +} + +#ifdef CONFIG_FTRACE +static void +function_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (unlikely(!tracer_enabled)) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + trace_function(tr, data, ip, parent_ip, flags, preempt_count()); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = function_trace_call, +}; + +void tracing_start_function_trace(void) +{ + register_ftrace_function(&trace_ops); + tracing_record_cmdline(current); +} + +void tracing_stop_function_trace(void) +{ + tracing_record_cmdline(current); + unregister_ftrace_function(&trace_ops); +} +#endif + 
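+/*
+ * Usage sketch (illustration only, not part of this patch): a client of
+ * the function tracer registers a callback the same way trace_ops is
+ * wired up above. The names my_hits/my_trace_func/my_ops are
+ * hypothetical, and the callback plus everything it calls must be
+ * notrace to avoid recursion:
+ *
+ *	static atomic_t my_hits;
+ *
+ *	static void notrace my_trace_func(unsigned long ip,
+ *					  unsigned long parent_ip)
+ *	{
+ *		atomic_inc(&my_hits);
+ *	}
+ *
+ *	static struct ftrace_ops my_ops __read_mostly = {
+ *		.func = my_trace_func,
+ *	};
+ *
+ *	register_ftrace_function(&my_ops);
+ *	...
+ *	unregister_ftrace_function(&my_ops);
+ */
+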
+enum trace_file_type { + TRACE_FILE_LAT_FMT = 1, +}; + +static struct trace_entry * +trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, + struct trace_iterator *iter, int cpu) +{ + struct page *page; + struct trace_entry *array; + + if (iter->next_idx[cpu] >= tr->entries || + iter->next_idx[cpu] >= data->trace_idx || + (data->trace_head == data->trace_tail && + data->trace_head_idx == data->trace_tail_idx)) + return NULL; + + if (!iter->next_page[cpu]) { + /* Initialize the iterator for this cpu trace buffer */ + WARN_ON(!data->trace_tail); + page = virt_to_page(data->trace_tail); + iter->next_page[cpu] = &page->lru; + iter->next_page_idx[cpu] = data->trace_tail_idx; + } + + page = list_entry(iter->next_page[cpu], struct page, lru); + BUG_ON(&data->trace_pages == &page->lru); + + array = page_address(page); + + WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE); + return &array[iter->next_page_idx[cpu]]; +} + +static struct trace_entry * +find_next_entry(struct trace_iterator *iter, int *ent_cpu) +{ + struct trace_array *tr = iter->tr; + struct trace_entry *ent, *next = NULL; + int next_cpu = -1; + int cpu; + + for_each_tracing_cpu(cpu) { + if (!head_page(tr->data[cpu])) + continue; + ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); + /* + * Pick the entry with the smallest timestamp: + */ + if (ent && (!next || ent->t < next->t)) { + next = ent; + next_cpu = cpu; + } + } + + if (ent_cpu) + *ent_cpu = next_cpu; + + return next; +} + +static void trace_iterator_increment(struct trace_iterator *iter) +{ + iter->idx++; + iter->next_idx[iter->cpu]++; + iter->next_page_idx[iter->cpu]++; + + if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) { + struct trace_array_cpu *data = iter->tr->data[iter->cpu]; + + iter->next_page_idx[iter->cpu] = 0; + iter->next_page[iter->cpu] = + trace_next_list(data, iter->next_page[iter->cpu]); + } +} + +static void trace_consume(struct trace_iterator *iter) +{ + struct trace_array_cpu *data = iter->tr->data[iter->cpu]; + + data->trace_tail_idx++; + if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { + data->trace_tail = trace_next_page(data, data->trace_tail); + data->trace_tail_idx = 0; + } + + /* Check if we empty it, then reset the index */ + if (data->trace_head == data->trace_tail && + data->trace_head_idx == data->trace_tail_idx) + data->trace_idx = 0; +} + +static void *find_next_entry_inc(struct trace_iterator *iter) +{ + struct trace_entry *next; + int next_cpu = -1; + + next = find_next_entry(iter, &next_cpu); + + iter->prev_ent = iter->ent; + iter->prev_cpu = iter->cpu; + + iter->ent = next; + iter->cpu = next_cpu; + + if (next) + trace_iterator_increment(iter); + + return next ? 
iter : NULL; +} + +static void *s_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + void *last_ent = iter->ent; + int i = (int)*pos; + void *ent; + + (*pos)++; + + /* can't go backwards */ + if (iter->idx > i) + return NULL; + + if (iter->idx < 0) + ent = find_next_entry_inc(iter); + else + ent = iter; + + while (ent && iter->idx < i) + ent = find_next_entry_inc(iter); + + iter->pos = *pos; + + if (last_ent && !ent) + seq_puts(m, "\n\nvim:ft=help\n"); + + return ent; +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + void *p = NULL; + loff_t l = 0; + int i; + + mutex_lock(&trace_types_lock); + + if (!current_trace || current_trace != iter->trace) { + mutex_unlock(&trace_types_lock); + return NULL; + } + + atomic_inc(&trace_record_cmdline_disabled); + + /* let the tracer grab locks here if needed */ + if (current_trace->start) + current_trace->start(iter); + + if (*pos != iter->pos) { + iter->ent = NULL; + iter->cpu = 0; + iter->idx = -1; + iter->prev_ent = NULL; + iter->prev_cpu = -1; + + for_each_tracing_cpu(i) { + iter->next_idx[i] = 0; + iter->next_page[i] = NULL; + } + + for (p = iter; p && l < *pos; p = s_next(m, p, &l)) + ; + + } else { + l = *pos - 1; + p = s_next(m, p, &l); + } + + return p; +} + +static void s_stop(struct seq_file *m, void *p) +{ + struct trace_iterator *iter = m->private; + + atomic_dec(&trace_record_cmdline_disabled); + + /* let the tracer release locks here if needed */ + if (current_trace && current_trace == iter->trace && iter->trace->stop) + iter->trace->stop(iter); + + mutex_unlock(&trace_types_lock); +} + +static int +seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + + kallsyms_lookup(address, NULL, NULL, NULL, str); + + return trace_seq_printf(s, fmt, str); +#endif + return 1; +} + +static int +seq_print_sym_offset(struct trace_seq *s, const char *fmt, + unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + + sprint_symbol(str, address); + return trace_seq_printf(s, fmt, str); +#endif + return 1; +} + +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + +static int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) +{ + int ret; + + if (!ip) + return trace_seq_printf(s, "0"); + + if (sym_flags & TRACE_ITER_SYM_OFFSET) + ret = seq_print_sym_offset(s, "%s", ip); + else + ret = seq_print_sym_short(s, "%s", ip); + + if (!ret) + return 0; + + if (sym_flags & TRACE_ITER_SYM_ADDR) + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + return ret; +} + +static void print_lat_help_header(struct seq_file *m) +{ + seq_puts(m, "# _------=> CPU# \n"); + seq_puts(m, "# / _-----=> irqs-off \n"); + seq_puts(m, "# | / _----=> need-resched \n"); + seq_puts(m, "# || / _---=> hardirq/softirq \n"); + seq_puts(m, "# ||| / _--=> preempt-depth \n"); + seq_puts(m, "# |||| / \n"); + seq_puts(m, "# ||||| delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); +} + +static void print_func_help_header(struct seq_file *m) +{ + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | | |\n"); +} + + +static void +print_trace_header(struct seq_file *m, struct trace_iterator *iter) +{ + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_array *tr = iter->tr; + struct trace_array_cpu *data = tr->data[tr->cpu]; + 
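+	/*
+	 * The "#%lu/%lu" pair in the banner below is entries/total:
+	 * total counts everything recorded since the last reset, while
+	 * entries is clamped to the per-cpu buffer size and is what can
+	 * still be printed from the ring buffer.
+	 */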
struct tracer *type = current_trace; + unsigned long total = 0; + unsigned long entries = 0; + int cpu; + const char *name = "preemption"; + + if (type) + name = type->name; + + for_each_tracing_cpu(cpu) { + if (head_page(tr->data[cpu])) { + total += tr->data[cpu]->trace_idx; + if (tr->data[cpu]->trace_idx > tr->entries) + entries += tr->entries; + else + entries += tr->data[cpu]->trace_idx; + } + } + + seq_printf(m, "%s latency trace v1.1.5 on %s\n", + name, UTS_RELEASE); + seq_puts(m, "-----------------------------------" + "---------------------------------\n"); + seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |" + " (M:%s VP:%d, KP:%d, SP:%d HP:%d", + nsecs_to_usecs(data->saved_latency), + entries, + total, + tr->cpu, +#if defined(CONFIG_PREEMPT_NONE) + "server", +#elif defined(CONFIG_PREEMPT_VOLUNTARY) + "desktop", +#elif defined(CONFIG_PREEMPT_DESKTOP) + "preempt", +#else + "unknown", +#endif + /* These are reserved for later use */ + 0, 0, 0, 0); +#ifdef CONFIG_SMP + seq_printf(m, " #P:%d)\n", num_online_cpus()); +#else + seq_puts(m, ")\n"); +#endif + seq_puts(m, " -----------------\n"); + seq_printf(m, " | task: %.16s-%d " + "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", + data->comm, data->pid, data->uid, data->nice, + data->policy, data->rt_priority); + seq_puts(m, " -----------------\n"); + + if (data->critical_start) { + seq_puts(m, " => started at: "); + seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); + trace_print_seq(m, &iter->seq); + seq_puts(m, "\n => ended at: "); + seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); + trace_print_seq(m, &iter->seq); + seq_puts(m, "\n"); + } + + seq_puts(m, "\n"); +} + +static void +lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +{ + int hardirq, softirq; + char *comm; + + comm = trace_find_cmdline(entry->pid); + + trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); + trace_seq_printf(s, "%d", cpu); + trace_seq_printf(s, "%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) { + trace_seq_putc(s, 'H'); + } else { + if (hardirq) { + trace_seq_putc(s, 'h'); + } else { + if (softirq) + trace_seq_putc(s, 's'); + else + trace_seq_putc(s, '.'); + } + } + + if (entry->preempt_count) + trace_seq_printf(s, "%x", entry->preempt_count); + else + trace_seq_puts(s, "."); +} + +unsigned long preempt_mark_thresh = 100; + +static void +lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, + unsigned long rel_usecs) +{ + trace_seq_printf(s, " %4lldus", abs_usecs); + if (rel_usecs > preempt_mark_thresh) + trace_seq_puts(s, "!: "); + else if (rel_usecs > 1) + trace_seq_puts(s, "+: "); + else + trace_seq_puts(s, " : "); +} + +static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; + +static int task_state_char(unsigned long state) +{ + int bit = state ? __ffs(state) + 1 : 0; + + return bit < sizeof(state_to_char) - 1 ? 
state_to_char[bit] : '?'; +} + +extern unsigned long sys_call_table[NR_syscalls]; + +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86) +extern unsigned long ia32_sys_call_table[], ia32_syscall_end[]; +# define IA32_NR_syscalls (ia32_syscall_end - ia32_sys_call_table) +#endif + +static void trace_print_ktime(struct trace_seq *s, ktime_t t) +{ + struct timespec ts = ktime_to_timespec(t); + + trace_seq_printf(s, " (%ld.%09ld)", ts.tv_sec, ts.tv_nsec); +} + +static int +print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) +{ + struct trace_seq *s = &iter->seq; + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_entry *next_entry = find_next_entry(iter, NULL); + unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); + struct trace_entry *entry = iter->ent; + unsigned long abs_usecs; + unsigned long rel_usecs; + unsigned long nr; + char *comm; + int S, T; + int i; + + if (!next_entry) + next_entry = entry; + rel_usecs = ns2usecs(next_entry->t - entry->t); + abs_usecs = ns2usecs(entry->t - iter->tr->time_start); + + if (verbose) { + comm = trace_find_cmdline(entry->pid); + trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]" + " %ld.%03ldms (+%ld.%03ldms): ", + comm, + entry->pid, cpu, entry->flags, + entry->preempt_count, trace_idx, + ns2usecs(entry->t), + abs_usecs/1000, + abs_usecs % 1000, rel_usecs/1000, + rel_usecs % 1000); + } else { + lat_print_generic(s, entry, cpu); + lat_print_timestamp(s, abs_usecs, rel_usecs); + } + switch (entry->type) { + case TRACE_FN: + seq_print_ip_sym(s, entry->fn.ip, sym_flags); + trace_seq_puts(s, " ("); + seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags); + trace_seq_puts(s, ")\n"); + break; + case TRACE_CTX: + case TRACE_WAKE: + T = task_state_char(entry->ctx.next_state); + S = task_state_char(entry->ctx.prev_state); + comm = trace_find_cmdline(entry->ctx.next_pid); + trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, entry->type == TRACE_CTX ? 
"==>" : " +", + entry->ctx.next_pid, + entry->ctx.next_prio, + T, comm); + break; + case TRACE_SPECIAL: + trace_seq_printf(s, "# %ld %ld %ld\n", + entry->special.arg1, + entry->special.arg2, + entry->special.arg3); + break; + case TRACE_STACK: + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (i) + trace_seq_puts(s, " <= "); + seq_print_ip_sym(s, entry->stack.caller[i], sym_flags); + } + trace_seq_puts(s, "\n"); + break; + case TRACE_IRQ: + seq_print_ip_sym(s, entry->irq.ip, sym_flags); + if (entry->irq.irq >= 0) + trace_seq_printf(s, " %d ", entry->irq.irq); + if (entry->irq.usermode) + trace_seq_puts(s, " (usermode)\n "); + else { + trace_seq_puts(s, " ("); + seq_print_ip_sym(s, entry->irq.ret_ip, sym_flags); + trace_seq_puts(s, ")\n"); + } + break; + case TRACE_FAULT: + seq_print_ip_sym(s, entry->fault.ip, sym_flags); + trace_seq_printf(s, " %lx ", entry->fault.errorcode); + trace_seq_puts(s, " ("); + seq_print_ip_sym(s, entry->fault.ret_ip, sym_flags); + trace_seq_puts(s, ")"); + trace_seq_printf(s, " [%lx]\n", entry->fault.address); + break; + case TRACE_TIMER_SET: + seq_print_ip_sym(s, entry->timer.ip, sym_flags); + trace_print_ktime(s, entry->timer.expire); + trace_seq_printf(s, " (%p)\n", entry->timer.timer); + break; + case TRACE_TIMER_TRIG: + seq_print_ip_sym(s, entry->timer.ip, sym_flags); + trace_print_ktime(s, entry->timer.expire); + trace_seq_printf(s, " (%p)\n", entry->timer.timer); + break; + case TRACE_TIMESTAMP: + seq_print_ip_sym(s, entry->timestamp.ip, sym_flags); + trace_print_ktime(s, entry->timestamp.now); + trace_seq_puts(s, "\n"); + break; + case TRACE_PROGRAM_EVENT: + seq_print_ip_sym(s, entry->program.ip, sym_flags); + trace_print_ktime(s, entry->program.expire); + trace_seq_printf(s, " (%Ld)\n", entry->program.delta); + break; + case TRACE_TASK_ACT: + seq_print_ip_sym(s, entry->task.ip, sym_flags); + comm = trace_find_cmdline(entry->task.pid); + trace_seq_printf(s, " %s %d %d [%d]\n", + comm, entry->task.pid, + entry->task.prio, entry->task.cpu); + break; + case TRACE_TASK_DEACT: + seq_print_ip_sym(s, entry->task.ip, sym_flags); + comm = trace_find_cmdline(entry->task.pid); + trace_seq_printf(s, " %s %d %d [%d]\n", + comm, entry->task.pid, + entry->task.prio, entry->task.cpu); + break; + case TRACE_SYSCALL: + seq_print_ip_sym(s, entry->syscall.ip, sym_flags); + nr = entry->syscall.nr; + trace_seq_putc(s, ' '); +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86) + if (nr & 0x80000000) { + nr &= ~0x80000000; + if (nr < IA32_NR_syscalls) + seq_print_ip_sym(s, ia32_sys_call_table[nr], 0); + else + trace_seq_printf(s, "", nr); + } else +#endif + if (nr < NR_syscalls) + seq_print_ip_sym(s, sys_call_table[nr], 0); + else + trace_seq_printf(s, "", nr); + + trace_seq_printf(s, " (%lx %lx %lx)\n", + entry->syscall.p1, + entry->syscall.p2, + entry->syscall.p3); + break; + case TRACE_SYSRET: + seq_print_ip_sym(s, entry->sysret.ip, sym_flags); + trace_seq_printf(s, " < (%ld)\n", + entry->sysret.ret); + break; + default: + trace_seq_printf(s, "Unknown type %d\n", entry->type); + } + return 1; +} + +static int print_trace_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_entry *entry; + unsigned long usec_rem; + unsigned long long t; + unsigned long secs; + long nr; + char *comm; + int ret; + int S, T; + int i; + + entry = iter->ent; + + comm = trace_find_cmdline(iter->ent->pid); + + t = ns2usecs(entry->t); + usec_rem = do_div(t, 1000000ULL); + secs = (unsigned long)t; + + 
ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + if (!ret) + return 0; + ret = trace_seq_printf(s, "[%02d] ", iter->cpu); + if (!ret) + return 0; + + ret = trace_seq_printf(s, "%c%c %2d ", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'), + entry->preempt_count); + if (!ret) + return 0; + + ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); + if (!ret) + return 0; + + switch (entry->type) { + case TRACE_FN: + ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags); + if (!ret) + return 0; + if ((sym_flags & TRACE_ITER_PRINT_PARENT) && + entry->fn.parent_ip) { + ret = trace_seq_printf(s, " <-"); + if (!ret) + return 0; + ret = seq_print_ip_sym(s, entry->fn.parent_ip, + sym_flags); + if (!ret) + return 0; + } + ret = trace_seq_printf(s, "\n"); + if (!ret) + return 0; + break; + case TRACE_CTX: + case TRACE_WAKE: + T = task_state_char(entry->ctx.next_state); + S = task_state_char(entry->ctx.prev_state); + ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, + entry->type == TRACE_CTX ? "==>" : " +", + entry->ctx.next_pid, + entry->ctx.next_prio, + T); + if (!ret) + return 0; + break; + case TRACE_SPECIAL: + ret = trace_seq_printf(s, "# %ld %ld %ld\n", + entry->special.arg1, + entry->special.arg2, + entry->special.arg3); + if (!ret) + return 0; + break; + case TRACE_STACK: + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (i) { + ret = trace_seq_puts(s, " <= "); + if (!ret) + return 0; + } + ret = seq_print_ip_sym(s, entry->stack.caller[i], + sym_flags); + if (!ret) + return 0; + } + ret = trace_seq_puts(s, "\n"); + if (!ret) + return 0; + break; + case TRACE_IRQ: + seq_print_ip_sym(s, entry->irq.ip, sym_flags); + if (entry->irq.irq >= 0) + trace_seq_printf(s, " %d ", entry->irq.irq); + if (entry->irq.usermode) + trace_seq_puts(s, " (usermode)\n "); + else { + trace_seq_puts(s, " ("); + seq_print_ip_sym(s, entry->irq.ret_ip, sym_flags); + trace_seq_puts(s, ")\n"); + } + break; + case TRACE_FAULT: + seq_print_ip_sym(s, entry->fault.ip, sym_flags); + trace_seq_printf(s, " %lx ", entry->fault.errorcode); + trace_seq_puts(s, " ("); + seq_print_ip_sym(s, entry->fault.ret_ip, sym_flags); + trace_seq_puts(s, ")"); + trace_seq_printf(s, " [%lx]\n", entry->fault.address); + break; + case TRACE_TIMER_SET: + seq_print_ip_sym(s, entry->timer.ip, sym_flags); + trace_print_ktime(s, entry->timer.expire); + trace_seq_printf(s, " (%p)\n", entry->timer.timer); + break; + case TRACE_TIMER_TRIG: + seq_print_ip_sym(s, entry->timer.ip, sym_flags); + trace_print_ktime(s, entry->timer.expire); + trace_seq_printf(s, " (%p)\n", entry->timer.timer); + break; + case TRACE_TIMESTAMP: + seq_print_ip_sym(s, entry->timestamp.ip, sym_flags); + trace_print_ktime(s, entry->timestamp.now); + trace_seq_puts(s, "\n"); + break; + case TRACE_PROGRAM_EVENT: + seq_print_ip_sym(s, entry->program.ip, sym_flags); + trace_print_ktime(s, entry->program.expire); + trace_seq_printf(s, " (%Ld)\n", entry->program.delta); + break; + case TRACE_TASK_ACT: + seq_print_ip_sym(s, entry->task.ip, sym_flags); + comm = trace_find_cmdline(entry->task.pid); + trace_seq_printf(s, " %s %d %d [%d]\n", + comm, entry->task.pid, + entry->task.prio, entry->task.cpu); + break; + case TRACE_TASK_DEACT: + seq_print_ip_sym(s, entry->task.ip, sym_flags); + comm = trace_find_cmdline(entry->task.pid); + trace_seq_printf(s, " %s %d %d [%d]\n", + comm, entry->task.pid, + entry->task.prio, entry->task.cpu); + break; + case TRACE_SYSCALL: + 
+		seq_print_ip_sym(s, entry->syscall.ip, sym_flags);
+		nr = entry->syscall.nr;
+		trace_seq_putc(s, ' ');
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86)
+		if (nr & 0x80000000) {
+			nr &= ~0x80000000;
+			if (nr < IA32_NR_syscalls)
+				seq_print_ip_sym(s, ia32_sys_call_table[nr], 0);
+			else
+				trace_seq_printf(s, "<%ld>", nr);
+		} else
+#endif
+		if (nr < NR_syscalls)
+			seq_print_ip_sym(s, sys_call_table[nr], 0);
+		else
+			trace_seq_printf(s, "<%ld>", nr);
+
+		trace_seq_printf(s, " (%lx %lx %lx)\n",
+			entry->syscall.p1,
+			entry->syscall.p2,
+			entry->syscall.p3);
+		break;
+	case TRACE_SYSRET:
+		seq_print_ip_sym(s, entry->sysret.ip, sym_flags);
+		trace_seq_printf(s, "< (%ld)\n",
+			entry->sysret.ret);
+		break;
+	default:
+		trace_seq_printf(s, "Unknown type %d\n", entry->type);
+	}
+	return 1;
+}
+
+static int print_raw_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry;
+	int ret;
+	int S, T;
+
+	entry = iter->ent;
+
+	ret = trace_seq_printf(s, "%d %d %llu ",
+		entry->pid, iter->cpu, entry->t);
+	if (!ret)
+		return 0;
+
+	switch (entry->type) {
+	case TRACE_FN:
+		ret = trace_seq_printf(s, "%lx %lx\n",
+			entry->fn.ip, entry->fn.parent_ip);
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_CTX:
+	case TRACE_WAKE:
+		T = task_state_char(entry->ctx.next_state);
+		S = entry->type == TRACE_WAKE ? '+' :
+			task_state_char(entry->ctx.prev_state);
+
+		ret = trace_seq_printf(s, "%d %d %c %d %d %c\n",
+			entry->ctx.prev_pid,
+			entry->ctx.prev_prio,
+			S,
+			entry->ctx.next_pid,
+			entry->ctx.next_prio,
+			T);
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_SPECIAL:
+	case TRACE_STACK:
+		ret = trace_seq_printf(s, "# %ld %ld %ld\n",
+			entry->special.arg1,
+			entry->special.arg2,
+			entry->special.arg3);
+		if (!ret)
+			return 0;
+		break;
+	}
+	return 1;
+}
+
+#define SEQ_PUT_FIELD_RET(s, x)				\
+do {							\
+	if (!trace_seq_putmem(s, &(x), sizeof(x)))	\
+		return 0;				\
+} while (0)
+
+#define SEQ_PUT_HEX_FIELD_RET(s, x)			\
+do {							\
+	if (!trace_seq_putmem_hex(s, &(x), sizeof(x)))	\
+		return 0;				\
+} while (0)
+
+static int print_hex_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	unsigned char newline = '\n';
+	struct trace_entry *entry;
+	int S, T;
+
+	entry = iter->ent;
+
+	SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
+	SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
+	SEQ_PUT_HEX_FIELD_RET(s, entry->t);
+
+	switch (entry->type) {
+	case TRACE_FN:
+		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
+		break;
+	case TRACE_CTX:
+	case TRACE_WAKE:
+		T = task_state_char(entry->ctx.next_state);
+		S = entry->type == TRACE_WAKE ? '+' :
+			task_state_char(entry->ctx.prev_state);
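+		/*
+		 * Wakeup records reuse the ctx layout; S is forced to
+		 * '+' so the two record kinds can be told apart in the
+		 * hex stream.
+		 */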
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio);
+		SEQ_PUT_HEX_FIELD_RET(s, S);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio);
+		SEQ_PUT_HEX_FIELD_RET(s, T);
+		break;
+	case TRACE_SPECIAL:
+	case TRACE_STACK:
+		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3);
+		break;
+	}
+	SEQ_PUT_FIELD_RET(s, newline);
+
+	return 1;
+}
+
+static int print_bin_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry;
+
+	entry = iter->ent;
+
+	SEQ_PUT_FIELD_RET(s, entry->pid);
+	SEQ_PUT_FIELD_RET(s, entry->cpu);
+	SEQ_PUT_FIELD_RET(s, entry->t);
+
+	switch (entry->type) {
+	case TRACE_FN:
+		SEQ_PUT_FIELD_RET(s, entry->fn.ip);
+		SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip);
+		break;
+	case TRACE_CTX:
+		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.next_state);
+		break;
+	case TRACE_SPECIAL:
+	case TRACE_STACK:
+		SEQ_PUT_FIELD_RET(s, entry->special.arg1);
+		SEQ_PUT_FIELD_RET(s, entry->special.arg2);
+		SEQ_PUT_FIELD_RET(s, entry->special.arg3);
+		break;
+	}
+	return 1;
+}
+
+static int trace_empty(struct trace_iterator *iter)
+{
+	struct trace_array_cpu *data;
+	int cpu;
+
+	for_each_tracing_cpu(cpu) {
+		data = iter->tr->data[cpu];
+
+		if (head_page(data) && data->trace_idx &&
+		    (data->trace_tail != data->trace_head ||
+		     data->trace_tail_idx != data->trace_head_idx))
+			return 0;
+	}
+	return 1;
+}
+
+static int print_trace_line(struct trace_iterator *iter)
+{
+	if (iter->trace && iter->trace->print_line)
+		return iter->trace->print_line(iter);
+
+	if (trace_flags & TRACE_ITER_BIN)
+		return print_bin_fmt(iter);
+
+	if (trace_flags & TRACE_ITER_HEX)
+		return print_hex_fmt(iter);
+
+	if (trace_flags & TRACE_ITER_RAW)
+		return print_raw_fmt(iter);
+
+	if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+		return print_lat_fmt(iter, iter->idx, iter->cpu);
+
+	return print_trace_fmt(iter);
+}
+
+static int s_show(struct seq_file *m, void *v)
+{
+	struct trace_iterator *iter = v;
+
+	if (iter->ent == NULL) {
+		if (iter->tr) {
+			seq_printf(m, "# tracer: %s\n", iter->trace->name);
+			seq_puts(m, "#\n");
+		}
+		if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+			/* print nothing if the buffers are empty */
+			if (trace_empty(iter))
+				return 0;
+			print_trace_header(m, iter);
+			if (!(trace_flags & TRACE_ITER_VERBOSE))
+				print_lat_help_header(m);
+		} else {
+			if (!(trace_flags & TRACE_ITER_VERBOSE))
+				print_func_help_header(m);
+		}
+	} else {
+		print_trace_line(iter);
+		trace_print_seq(m, &iter->seq);
+	}
+
+	return 0;
+}
+
+static struct seq_operations tracer_seq_ops = {
+	.start = s_start,
+	.next = s_next,
+	.stop = s_stop,
+	.show = s_show,
+};
+
+static struct trace_iterator *
+__tracing_open(struct inode *inode, struct file *file, int *ret)
+{
+	struct trace_iterator *iter;
+
+	if (tracing_disabled) {
+		*ret = -ENODEV;
+		return NULL;
+	}
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter) {
+		*ret = -ENOMEM;
+		goto out;
+	}
+
+	mutex_lock(&trace_types_lock);
+	if (current_trace && current_trace->print_max)
+		iter->tr = &max_tr;
+	else
+		iter->tr = inode->i_private;
+	iter->trace =
current_trace; + iter->pos = -1; + + /* TODO stop tracer */ + *ret = seq_open(file, &tracer_seq_ops); + if (!*ret) { + struct seq_file *m = file->private_data; + m->private = iter; + + /* stop the trace while dumping */ + if (iter->tr->ctrl) + tracer_enabled = 0; + + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + } else { + kfree(iter); + iter = NULL; + } + mutex_unlock(&trace_types_lock); + + out: + return iter; +} + +int tracing_open_generic(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + filp->private_data = inode->i_private; + return 0; +} + +int tracing_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct trace_iterator *iter = m->private; + + mutex_lock(&trace_types_lock); + if (iter->trace && iter->trace->close) + iter->trace->close(iter); + + /* reenable tracing if it was previously enabled */ + if (iter->tr->ctrl) + tracer_enabled = 1; + mutex_unlock(&trace_types_lock); + + seq_release(inode, file); + kfree(iter); + return 0; +} + +static int tracing_open(struct inode *inode, struct file *file) +{ + int ret; + + __tracing_open(inode, file, &ret); + + return ret; +} + +static int tracing_lt_open(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter; + int ret; + + iter = __tracing_open(inode, file, &ret); + + if (!ret) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + + return ret; +} + + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct tracer *t = m->private; + + (*pos)++; + + if (t) + t = t->next; + + m->private = t; + + return t; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct tracer *t = m->private; + loff_t l = 0; + + mutex_lock(&trace_types_lock); + for (; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void t_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&trace_types_lock); +} + +static int t_show(struct seq_file *m, void *v) +{ + struct tracer *t = v; + + if (!t) + return 0; + + seq_printf(m, "%s", t->name); + if (t->next) + seq_putc(m, ' '); + else + seq_putc(m, '\n'); + + return 0; +} + +static struct seq_operations show_traces_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int show_traces_open(struct inode *inode, struct file *file) +{ + int ret; + + if (tracing_disabled) + return -ENODEV; + + ret = seq_open(file, &show_traces_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = trace_types; + } + + return ret; +} + +static struct file_operations tracing_fops = { + .open = tracing_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_release, +}; + +static struct file_operations tracing_lt_fops = { + .open = tracing_lt_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_release, +}; + +static struct file_operations show_traces_fops = { + .open = show_traces_open, + .read = seq_read, + .release = seq_release, +}; + +/* + * Only trace on a CPU if the bitmask is set: + */ +static cpumask_t tracing_cpumask = CPU_MASK_ALL; + +/* + * When tracing/tracing_cpu_mask is modified then this holds + * the new bitmask we are about to install: + */ +static cpumask_t tracing_cpumask_new; + +/* + * The tracer itself will not take this lock, but still we want + * to provide a consistent cpumask to user-space: + */ +static DEFINE_MUTEX(tracing_cpumask_update_lock); + +/* + * Temporary storage for the character representation of the + * CPU bitmask (and one 
more byte for the newline):
+ */
+static char mask_str[NR_CPUS + 1];
+
+static ssize_t
+tracing_cpumask_read(struct file *filp, char __user *ubuf,
+		     size_t count, loff_t *ppos)
+{
+	int len;
+
+	mutex_lock(&tracing_cpumask_update_lock);
+
+	len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
+	if (count - len < 2) {
+		count = -EINVAL;
+		goto out_err;
+	}
+	len += sprintf(mask_str + len, "\n");
+	count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
+
+out_err:
+	mutex_unlock(&tracing_cpumask_update_lock);
+
+	return count;
+}
+
+static ssize_t
+tracing_cpumask_write(struct file *filp, const char __user *ubuf,
+		      size_t count, loff_t *ppos)
+{
+	int err, cpu;
+
+	mutex_lock(&tracing_cpumask_update_lock);
+	err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
+	if (err)
+		goto err_unlock;
+
+	raw_local_irq_disable();
+	__raw_spin_lock(&ftrace_max_lock);
+	for_each_tracing_cpu(cpu) {
+		/*
+		 * Increase/decrease the disabled counter if we are
+		 * about to flip a bit in the cpumask:
+		 */
+		if (cpu_isset(cpu, tracing_cpumask) &&
+		    !cpu_isset(cpu, tracing_cpumask_new)) {
+			atomic_inc(&global_trace.data[cpu]->disabled);
+		}
+		if (!cpu_isset(cpu, tracing_cpumask) &&
+		    cpu_isset(cpu, tracing_cpumask_new)) {
+			atomic_dec(&global_trace.data[cpu]->disabled);
+		}
+	}
+	__raw_spin_unlock(&ftrace_max_lock);
+	raw_local_irq_enable();
+
+	tracing_cpumask = tracing_cpumask_new;
+
+	mutex_unlock(&tracing_cpumask_update_lock);
+
+	return count;
+
+err_unlock:
+	mutex_unlock(&tracing_cpumask_update_lock);
+
+	return err;
+}
+
+static struct file_operations tracing_cpumask_fops = {
+	.open = tracing_open_generic,
+	.read = tracing_cpumask_read,
+	.write = tracing_cpumask_write,
+};
+
+static ssize_t
+tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	char *buf;
+	int r = 0;
+	int len = 0;
+	int i;
+
+	/* calculate max size */
+	for (i = 0; trace_options[i]; i++) {
+		len += strlen(trace_options[i]);
+		len += 3; /* "no" and space */
+	}
+
+	/* +2 for \n and \0 */
+	buf = kmalloc(len + 2, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	for (i = 0; trace_options[i]; i++) {
+		if (trace_flags & (1 << i))
+			r += sprintf(buf + r, "%s ", trace_options[i]);
+		else
+			r += sprintf(buf + r, "no%s ", trace_options[i]);
+	}
+
+	r += sprintf(buf + r, "\n");
+	WARN_ON(r >= len + 2);
+
+	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+
+	kfree(buf);
+
+	return r;
+}
+
+static ssize_t
+tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
+			size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	char *cmp = buf;
+	int neg = 0;
+	int i;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	if (strncmp(buf, "no", 2) == 0) {
+		neg = 1;
+		cmp += 2;
+	}
+
+	for (i = 0; trace_options[i]; i++) {
+		int len = strlen(trace_options[i]);
+
+		if (strncmp(cmp, trace_options[i], len) == 0) {
+			if (neg)
+				trace_flags &= ~(1 << i);
+			else
+				trace_flags |= (1 << i);
+			break;
+		}
+	}
+	/*
+	 * If no option could be set, return an error:
+	 */
+	if (!trace_options[i])
+		return -EINVAL;
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static struct file_operations tracing_iter_fops = {
+	.open = tracing_open_generic,
+	.read = tracing_iter_ctrl_read,
+	.write = tracing_iter_ctrl_write,
+};
+
+static const char readme_msg[] =
+	"tracing mini-HOWTO:\n\n"
+	"# mkdir /debug\n"
+	"# mount -t debugfs nodev /debug\n\n"
+	"# cat /debug/tracing/available_tracers\n"
+	"wakeup preemptirqsoff preemptoff irqsoff
ftrace sched_switch none\n\n" + "# cat /debug/tracing/current_tracer\n" + "none\n" + "# echo sched_switch > /debug/tracing/current_tracer\n" + "# cat /debug/tracing/current_tracer\n" + "sched_switch\n" + "# cat /debug/tracing/iter_ctrl\n" + "noprint-parent nosym-offset nosym-addr noverbose\n" + "# echo print-parent > /debug/tracing/iter_ctrl\n" + "# echo 1 > /debug/tracing/tracing_enabled\n" + "# cat /debug/tracing/trace > /tmp/trace.txt\n" + "echo 0 > /debug/tracing/tracing_enabled\n" +; + +static ssize_t +tracing_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static struct file_operations tracing_readme_fops = { + .open = tracing_open_generic, + .read = tracing_readme_read, +}; + +static ssize_t +tracing_ctrl_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = sprintf(buf, "%ld\n", tr->ctrl); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_ctrl_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + + mutex_lock(&trace_types_lock); + if (tr->ctrl ^ val) { + if (val) + tracer_enabled = 1; + else + tracer_enabled = 0; + + tr->ctrl = val; + + if (current_trace && current_trace->ctrl_update) + current_trace->ctrl_update(tr); + } + mutex_unlock(&trace_types_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static ssize_t +tracing_set_trace_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[max_tracer_type_len+2]; + int r; + + mutex_lock(&trace_types_lock); + if (current_trace) + r = sprintf(buf, "%s\n", current_trace->name); + else + r = sprintf(buf, "\n"); + mutex_unlock(&trace_types_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_set_trace_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = &global_trace; + struct tracer *t; + char buf[max_tracer_type_len+1]; + int i; + + if (cnt > max_tracer_type_len) + cnt = max_tracer_type_len; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* strip ending whitespace. */ + for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) + buf[i] = 0; + + mutex_lock(&trace_types_lock); + for (t = trace_types; t; t = t->next) { + if (strcmp(t->name, buf) == 0) + break; + } + if (!t || t == current_trace) + goto out; + + if (current_trace && current_trace->reset) + current_trace->reset(tr); + + current_trace = t; + if (t->init) + t->init(tr); + + out: + mutex_unlock(&trace_types_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static ssize_t +tracing_max_lat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long *ptr = filp->private_data; + char buf[64]; + int r; + + r = snprintf(buf, sizeof(buf), "%ld\n", + *ptr == (unsigned long)-1 ? 
-1 : nsecs_to_usecs(*ptr));
+	if (r > sizeof(buf))
+		r = sizeof(buf);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_max_lat_write(struct file *filp, const char __user *ubuf,
+		      size_t cnt, loff_t *ppos)
+{
+	long *ptr = filp->private_data;
+	char buf[64];
+	long val;
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	*ptr = val * 1000;
+
+	return cnt;
+}
+
+static atomic_t tracing_reader;
+
+static int tracing_open_pipe(struct inode *inode, struct file *filp)
+{
+	struct trace_iterator *iter;
+
+	if (tracing_disabled)
+		return -ENODEV;
+
+	/* We only allow one reader of the pipe */
+	if (atomic_inc_return(&tracing_reader) != 1) {
+		atomic_dec(&tracing_reader);
+		return -EBUSY;
+	}
+
+	/* create a buffer to store the information to pass to userspace */
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+
+	mutex_lock(&trace_types_lock);
+	iter->tr = &global_trace;
+	iter->trace = current_trace;
+	filp->private_data = iter;
+
+	if (iter->trace->pipe_open)
+		iter->trace->pipe_open(iter);
+	mutex_unlock(&trace_types_lock);
+
+	return 0;
+}
+
+static int tracing_release_pipe(struct inode *inode, struct file *file)
+{
+	struct trace_iterator *iter = file->private_data;
+
+	kfree(iter);
+	atomic_dec(&tracing_reader);
+
+	return 0;
+}
+
+static unsigned int
+tracing_poll_pipe(struct file *filp, poll_table *poll_table)
+{
+	struct trace_iterator *iter = filp->private_data;
+
+	if (trace_flags & TRACE_ITER_BLOCK) {
+		/*
+		 * Always select as readable when in blocking mode
+		 */
+		return POLLIN | POLLRDNORM;
+	} else {
+		if (!trace_empty(iter))
+			return POLLIN | POLLRDNORM;
+		poll_wait(filp, &trace_wait, poll_table);
+		if (!trace_empty(iter))
+			return POLLIN | POLLRDNORM;
+
+		return 0;
+	}
+}
+
+/*
+ * Consumer reader.
+ */
+static ssize_t
+tracing_read_pipe(struct file *filp, char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	struct trace_iterator *iter = filp->private_data;
+	struct trace_array_cpu *data;
+	static cpumask_t mask;
+	unsigned long flags;
+#ifdef CONFIG_FTRACE
+	int ftrace_save;
+#endif
+	int cpu;
+	ssize_t sret;
+
+	/* return any leftover data */
+	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+	if (sret != -EBUSY)
+		return sret;
+	sret = 0;
+
+	trace_seq_reset(&iter->seq);
+
+	mutex_lock(&trace_types_lock);
+	if (iter->trace->read) {
+		sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
+		if (sret)
+			goto out;
+	}
+
+	while (trace_empty(iter)) {
+
+		if ((filp->f_flags & O_NONBLOCK)) {
+			sret = -EAGAIN;
+			goto out;
+		}
+
+		/*
+		 * This is a make-shift waitqueue. The reason we don't use
+		 * an actual wait queue is because:
+		 *  1) we only ever have one waiter
+		 *  2) the tracer traces all functions, and we don't want
+		 *     the overhead of calling wake_up and friends
+		 *     (and tracing them too)
+		 * Anyway, this is a very primitive wakeup.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		iter->tr->waiter = current;
+
+		mutex_unlock(&trace_types_lock);
+
+		/* sleep for 100 msecs, and try again. */
+		schedule_timeout(HZ/10);
+
+		mutex_lock(&trace_types_lock);
+
+		iter->tr->waiter = NULL;
+
+		if (signal_pending(current)) {
+			sret = -EINTR;
+			goto out;
+		}
+
+		if (iter->trace != current_trace)
+			goto out;
+
+		/*
+		 * We block until we read something and tracing is disabled.
+		 * We still block if tracing is disabled, but we have never
+		 * read anything. This allows a user to cat this file, and
+		 * then enable tracing. But after we have read something,
+		 * we give an EOF when tracing is again disabled.
+		 *
+		 * iter->pos will be 0 if we haven't read anything.
+		 */
+		if (!tracer_enabled && iter->pos)
+			break;
+
+		continue;
+	}
+
+	/* stop when tracing is finished */
+	if (trace_empty(iter))
+		goto out;
+
+	if (cnt >= PAGE_SIZE)
+		cnt = PAGE_SIZE - 1;
+
+	/* reset all but tr, trace, and overruns */
+	memset(&iter->seq, 0,
+	       sizeof(struct trace_iterator) -
+	       offsetof(struct trace_iterator, seq));
+	iter->pos = -1;
+
+	/*
+	 * We need to stop all tracing on all CPUS to read
+	 * the next buffer. This is a bit expensive, but is
+	 * not done often. We fill all that we can read,
+	 * and then release the locks again.
+	 */
+
+	cpus_clear(mask);
+	local_irq_save(flags);
+#ifdef CONFIG_FTRACE
+	ftrace_save = ftrace_enabled;
+	ftrace_enabled = 0;
+#endif
+	smp_wmb();
+	for_each_tracing_cpu(cpu) {
+		data = iter->tr->data[cpu];
+
+		if (!head_page(data) || !data->trace_idx)
+			continue;
+
+		atomic_inc(&data->disabled);
+		cpu_set(cpu, mask);
+	}
+
+	for_each_cpu_mask_nr(cpu, mask) {
+		data = iter->tr->data[cpu];
+		__raw_spin_lock(&data->lock);
+
+		if (data->overrun > iter->last_overrun[cpu])
+			iter->overrun[cpu] +=
+				data->overrun - iter->last_overrun[cpu];
+		iter->last_overrun[cpu] = data->overrun;
+	}
+
+	while (find_next_entry_inc(iter) != NULL) {
+		int ret;
+		int len = iter->seq.len;
+
+		ret = print_trace_line(iter);
+		if (!ret) {
+			/* don't print partial lines */
+			iter->seq.len = len;
+			break;
+		}
+
+		trace_consume(iter);
+
+		if (iter->seq.len >= cnt)
+			break;
+	}
+
+	for_each_cpu_mask_nr(cpu, mask) {
+		data = iter->tr->data[cpu];
+		__raw_spin_unlock(&data->lock);
+	}
+
+	for_each_cpu_mask_nr(cpu, mask) {
+		data = iter->tr->data[cpu];
+		atomic_dec(&data->disabled);
+	}
+#ifdef CONFIG_FTRACE
+	ftrace_enabled = ftrace_save;
+#endif
+	local_irq_restore(flags);
+
+	/* Now copy what we have to the user */
+	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+	if (iter->seq.readpos >= iter->seq.len)
+		trace_seq_reset(&iter->seq);
+	if (sret == -EBUSY)
+		sret = 0;
+
+out:
+	mutex_unlock(&trace_types_lock);
+
+	return sret;
+}
+
+static ssize_t
+tracing_entries_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%lu\n", tr->entries);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_entries_write(struct file *filp, const char __user *ubuf,
+		      size_t cnt, loff_t *ppos)
+{
+	unsigned long val;
+	char buf[64];
+	int i, ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	/* must have at least 1 entry */
+	if (!val)
+		return -EINVAL;
+
+	mutex_lock(&trace_types_lock);
+
+	if (current_trace != &no_tracer) {
+		cnt = -EBUSY;
+		pr_info("ftrace: set current_tracer to none"
+			" before modifying buffer size\n");
+		goto out;
+	}
+
+	if (val > global_trace.entries) {
+		long pages_requested;
+		unsigned long freeable_pages;
+
+		/* make sure we have enough memory before mapping */
+		pages_requested =
+			(val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE;
+
+		/* account for each buffer (and max_tr) */
+		pages_requested *= tracing_nr_buffers * 2;
+
+		/* Check for overflow */
+		if (pages_requested < 0) {
+			cnt = -ENOMEM;
+			goto out;
+		}
+
+		freeable_pages = determine_dirtyable_memory();
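+		/*
+		 * pages_requested works out to
+		 * ceil(val / ENTRIES_PER_PAGE) * tracing_nr_buffers * 2:
+		 * one set of pages per cpu buffer, doubled for the
+		 * max_tr copy. The check below refuses to take more
+		 * than a quarter of what the VM considers freeable
+		 * (counting pages the tracer already holds).
+		 */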
+		/* we only allow requests of up to 1/4 of usable memory */
+		if (pages_requested >
+		    ((freeable_pages + tracing_pages_allocated) / 4)) {
+			cnt = -ENOMEM;
+			goto out;
+		}
+
+		while (global_trace.entries < val) {
+			if (trace_alloc_page()) {
+				cnt = -ENOMEM;
+				goto out;
+			}
+			/* double check that we don't go over the known pages */
+			if (tracing_pages_allocated > pages_requested)
+				break;
+		}
+
+	} else {
+		/* include the number of entries in val (inc of page entries) */
+		while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1))
+			trace_free_page();
+	}
+
+	/* check integrity */
+	for_each_tracing_cpu(i)
+		check_pages(global_trace.data[i]);
+
+	filp->f_pos += cnt;
+
+	/* If check pages failed, return ENOMEM */
+	if (tracing_disabled)
+		cnt = -ENOMEM;
+ out:
+	max_tr.entries = global_trace.entries;
+	mutex_unlock(&trace_types_lock);
+
+	return cnt;
+}
+
+static struct file_operations tracing_max_lat_fops = {
+	.open = tracing_open_generic,
+	.read = tracing_max_lat_read,
+	.write = tracing_max_lat_write,
+};
+
+static struct file_operations tracing_ctrl_fops = {
+	.open = tracing_open_generic,
+	.read = tracing_ctrl_read,
+	.write = tracing_ctrl_write,
+};
+
+static struct file_operations set_tracer_fops = {
+	.open = tracing_open_generic,
+	.read = tracing_set_trace_read,
+	.write = tracing_set_trace_write,
+};
+
+static struct file_operations tracing_pipe_fops = {
+	.open = tracing_open_pipe,
+	.poll = tracing_poll_pipe,
+	.read = tracing_read_pipe,
+	.release = tracing_release_pipe,
+};
+
+static struct file_operations tracing_entries_fops = {
+	.open = tracing_open_generic,
+	.read = tracing_entries_read,
+	.write = tracing_entries_write,
+};
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static ssize_t
+tracing_read_long(struct file *filp, char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	unsigned long *p = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%ld\n", *p);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static struct file_operations tracing_read_long_fops = {
+	.open = tracing_open_generic,
+	.read = tracing_read_long,
+};
+#endif
+
+static struct dentry *d_tracer;
+
+struct dentry *tracing_init_dentry(void)
+{
+	static int once;
+
+	if (d_tracer)
+		return d_tracer;
+
+	d_tracer = debugfs_create_dir("tracing", NULL);
+
+	if (!d_tracer && !once) {
+		once = 1;
+		pr_warning("Could not create debugfs directory 'tracing'\n");
+		return NULL;
+	}
+
+	return d_tracer;
+}
+
+#ifdef CONFIG_FTRACE_SELFTEST
+/* Let selftest have access to static functions in this file */
+#include "trace_selftest.c"
+#endif
+
+static __init void tracer_init_debugfs(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+
+	entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
+				    &global_trace, &tracing_ctrl_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
+
+	entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
+				    NULL, &tracing_iter_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
+
+	entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
+				    NULL, &tracing_cpumask_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
+
+	entry = debugfs_create_file("latency_trace", 0444, d_tracer,
+				    &global_trace, &tracing_lt_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'latency_trace' entry\n");
+
+	entry = debugfs_create_file("trace", 0444, d_tracer,
+				    &global_trace, &tracing_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'trace' entry\n");
+
+	entry = debugfs_create_file("available_tracers", 0444, d_tracer,
+				    &global_trace, &show_traces_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'available_tracers' entry\n");
+
+	entry = debugfs_create_file("current_tracer", 0444, d_tracer,
+				    &global_trace, &set_tracer_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'current_tracer' entry\n");
+
+	entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
+				    &tracing_max_latency,
+				    &tracing_max_lat_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_max_latency' entry\n");
+
+	entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
+				    &tracing_thresh, &tracing_max_lat_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_thresh' entry\n");
+	entry = debugfs_create_file("README", 0644, d_tracer,
+				    NULL, &tracing_readme_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'README' entry\n");
+
+	entry = debugfs_create_file("trace_pipe", 0644, d_tracer,
+				    NULL, &tracing_pipe_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'trace_pipe' entry\n");
+
+	entry = debugfs_create_file("trace_entries", 0644, d_tracer,
+				    &global_trace, &tracing_entries_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'trace_entries' entry\n");
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+				    &ftrace_update_tot_cnt,
+				    &tracing_read_long_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'dyn_ftrace_total_info' entry\n");
+#endif
+#ifdef CONFIG_SYSPROF_TRACER
+	init_tracer_sysprof_debugfs(d_tracer);
+#endif
+}
+
+/**
+ * ftrace_stop - called when we need to drastically disable the tracer.
+ */
+void ftrace_stop(void)
+{
+	struct tracer *saved_tracer = current_trace;
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	int i;
+
+	__ftrace_kill();
+	for_each_tracing_cpu(i) {
+		data = tr->data[i];
+		atomic_inc(&data->disabled);
+	}
+	tracer_enabled = 0;
+
+	/*
+	 * TODO: make a safe method to ctrl_update.
+	 * ctrl_update may schedule, but currently only
+	 * does when ftrace is enabled.
+	 */
+	if (tr->ctrl) {
+		tr->ctrl = 0;
+		if (saved_tracer && saved_tracer->ctrl_update)
+			saved_tracer->ctrl_update(tr);
+	}
+}
+
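+/*
+ * Grow every per-cpu buffer by one page. This is done in two passes
+ * on purpose: first gather one page per cpu (two with max_tr) on a
+ * private list, and only when all allocations succeeded splice the
+ * pages into the live trace_pages lists; a failure half way through
+ * then only has to free the private list.
+ */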
+static int trace_alloc_page(void)
+{
+	struct trace_array_cpu *data;
+	struct page *page, *tmp;
+	LIST_HEAD(pages);
+	void *array;
+	unsigned pages_allocated = 0;
+	int i;
+
+	/* first allocate a page for each CPU */
+	for_each_tracing_cpu(i) {
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page "
+			       "for trace buffer!\n");
+			goto free_pages;
+		}
+
+		pages_allocated++;
+		page = virt_to_page(array);
+		list_add(&page->lru, &pages);
+
+/* Only allocate if we are actually using the max trace */
+#ifdef CONFIG_TRACER_MAX_TRACE
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page "
+			       "for trace buffer!\n");
+			goto free_pages;
+		}
+		pages_allocated++;
+		page = virt_to_page(array);
+		list_add(&page->lru, &pages);
+#endif
+	}
+
+	/* Now that we successfully allocated a page per CPU, add them */
+	for_each_tracing_cpu(i) {
+		data = global_trace.data[i];
+		page = list_entry(pages.next, struct page, lru);
+		list_del_init(&page->lru);
+		list_add_tail(&page->lru, &data->trace_pages);
+		ClearPageLRU(page);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		page = list_entry(pages.next, struct page, lru);
+		list_del_init(&page->lru);
+		list_add_tail(&page->lru, &data->trace_pages);
+		SetPageLRU(page);
+#endif
+	}
+	tracing_pages_allocated += pages_allocated;
+	global_trace.entries += ENTRIES_PER_PAGE;
+
+	return 0;
+
+ free_pages:
+	list_for_each_entry_safe(page, tmp, &pages, lru) {
+		list_del_init(&page->lru);
+		__free_page(page);
+	}
+	return -ENOMEM;
+}
+
+static int trace_free_page(void)
+{
+	struct trace_array_cpu *data;
+	struct page *page;
+	struct list_head *p;
+	int i;
+	int ret = 0;
+
+	/* free one page from each buffer */
+	for_each_tracing_cpu(i) {
+		data = global_trace.data[i];
+		p = data->trace_pages.next;
+		if (p == &data->trace_pages) {
+			/* should never happen */
+			WARN_ON(1);
+			tracing_disabled = 1;
+			ret = -1;
+			break;
+		}
+		page = list_entry(p, struct page, lru);
+		ClearPageLRU(page);
+		list_del(&page->lru);
+		tracing_pages_allocated--;
+		tracing_pages_allocated--;
+		__free_page(page);
+
+		tracing_reset(data);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		p = data->trace_pages.next;
+		if (p == &data->trace_pages) {
+			/* should never happen */
+			WARN_ON(1);
+			tracing_disabled = 1;
+			ret = -1;
+			break;
+		}
+		page = list_entry(p, struct page, lru);
+		ClearPageLRU(page);
+		list_del(&page->lru);
+		__free_page(page);
+
+		tracing_reset(data);
+#endif
+	}
+	global_trace.entries -= ENTRIES_PER_PAGE;
+
+	return ret;
+}
+
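+/*
+ * A buffer is a list of pages chained through page->lru, with the
+ * entries stored as an array inside each page. trace_head/trace_tail
+ * and trace_head_idx/trace_tail_idx (see trace.h) walk that list as
+ * a ring: the producer advances head, the consumer advances tail.
+ * The LRU page flag marks which pages belong to max_tr.
+ */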
+__init static int tracer_alloc_buffers(void)
+{
+	struct trace_array_cpu *data;
+	void *array;
+	struct page *page;
+	int pages = 0;
+	int ret = -ENOMEM;
+	int i;
+
+	/* TODO: make the number of buffers hot pluggable with CPUS */
+	tracing_nr_buffers = num_possible_cpus();
+	tracing_buffer_mask = cpu_possible_map;
+
+	/* Allocate the first page for all buffers */
+	for_each_tracing_cpu(i) {
+		data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
+		max_tr.data[i] = &per_cpu(max_data, i);
+
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page "
+			       "for trace buffer!\n");
+			goto free_buffers;
+		}
+
+		/* set the array to the list */
+		INIT_LIST_HEAD(&data->trace_pages);
+		page = virt_to_page(array);
+		list_add(&page->lru, &data->trace_pages);
+		/* use the LRU flag to differentiate the two buffers */
+		ClearPageLRU(page);
+
+		data->lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+		max_tr.data[i]->lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+/* Only allocate if we are actually using the max trace */
+#ifdef CONFIG_TRACER_MAX_TRACE
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page "
+			       "for trace buffer!\n");
+			goto free_buffers;
+		}
+
+		INIT_LIST_HEAD(&max_tr.data[i]->trace_pages);
+		page = virt_to_page(array);
+		list_add(&page->lru, &max_tr.data[i]->trace_pages);
+		SetPageLRU(page);
+#endif
+	}
+
+	/*
+	 * Since we allocate by orders of pages, we may be able to
+	 * round up a bit.
+	 */
+	global_trace.entries = ENTRIES_PER_PAGE;
+	pages++;
+
+	while (global_trace.entries < trace_nr_entries) {
+		if (trace_alloc_page())
+			break;
+		pages++;
+	}
+	max_tr.entries = global_trace.entries;
+
+	pr_info("tracer: %d pages allocated for %ld",
+		pages, trace_nr_entries);
+	pr_info(" entries of %ld bytes\n", (long)TRACE_ENTRY_SIZE);
+	pr_info(" actual entries %ld\n", global_trace.entries);
+
+	tracer_init_debugfs();
+
+	trace_init_cmdlines();
+
+	register_tracer(&no_tracer);
+	current_trace = &no_tracer;
+
+	/* All seems OK, enable tracing */
+	global_trace.ctrl = tracer_enabled;
+	tracing_disabled = 0;
+
+	return 0;
+
+ free_buffers:
+	for (i-- ; i >= 0; i--) {
+		struct page *page, *tmp;
+		struct trace_array_cpu *data = global_trace.data[i];
+
+		if (data) {
+			list_for_each_entry_safe(page, tmp,
+						 &data->trace_pages, lru) {
+				list_del_init(&page->lru);
+				__free_page(page);
+			}
+		}
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		if (data) {
+			list_for_each_entry_safe(page, tmp,
+						 &data->trace_pages, lru) {
+				list_del_init(&page->lru);
+				__free_page(page);
+			}
+		}
+#endif
+	}
+	return ret;
+}
+fs_initcall(tracer_alloc_buffers);
Index: linux-2.6.24.7-rt27/kernel/trace/trace.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.24.7-rt27/kernel/trace/trace.h	2009-02-08 00:05:24.000000000 -0500
@@ -0,0 +1,527 @@
+#ifndef _LINUX_KERNEL_TRACE_H
+#define _LINUX_KERNEL_TRACE_H
+
+#include <linux/fs.h>
+#include <asm/atomic.h>
+#include <linux/sched.h>
+#include <linux/clocksource.h>
+#include <linux/mmiotrace.h>
+
+#ifdef CONFIG_X86_64
+#include
+#endif
+
+enum trace_type {
+	__TRACE_FIRST_TYPE = 0,
+
+	TRACE_FN,
+	TRACE_CTX,
+	TRACE_WAKE,
+	TRACE_STACK,
+	TRACE_SPECIAL,
+	TRACE_MMIO_RW,
+	TRACE_MMIO_MAP,
+	TRACE_IRQ,
+	TRACE_FAULT,
+	TRACE_TIMER_SET,
+	TRACE_TIMER_TRIG,
+	TRACE_TIMESTAMP,
+	TRACE_PROGRAM_EVENT,
+	TRACE_TASK_ACT,
+	TRACE_TASK_DEACT,
+	TRACE_SYSCALL,
+	TRACE_SYSRET,
+
+	__TRACE_LAST_TYPE
+};
+
+/*
+ * Function trace entry - function address and parent function address:
+ */
+struct ftrace_entry {
+	unsigned long ip;
+	unsigned long parent_ip;
+};
+
+/*
+ * Context switch trace entry - which task (and prio) we switched from/to:
+ */
+struct ctx_switch_entry {
+	unsigned int prev_pid;
+	unsigned char prev_prio;
+	unsigned char prev_state;
+	unsigned int next_pid;
+	unsigned char next_prio;
+	unsigned char next_state;
+};
+
+/*
+ * Special (free-form) trace entry:
+ */
+struct special_entry {
+	unsigned long arg1;
+	unsigned long arg2;
+	unsigned long arg3;
+};
+
+struct irq_entry {
+	unsigned long ip;
+	unsigned long ret_ip;
+	unsigned irq;
+	unsigned usermode;
+};
+
+struct fault_entry {
+	unsigned long ip;
+	unsigned long ret_ip;
+	unsigned long errorcode;
+	unsigned long address;
+};
+
+struct timer_entry {
+	unsigned long ip;
ktime_t expire; + void *timer; +}; + +struct program_entry { + unsigned long ip; + ktime_t expire; + int64_t delta; +}; + +struct timestamp_entry { + unsigned long ip; + ktime_t now; +}; + +struct task_entry { + unsigned long ip; + pid_t pid; + unsigned prio; + int cpu; +}; + +struct wakeup_entry { + unsigned long ip; + pid_t pid; + unsigned prio; + unsigned curr_prio; +}; + +struct syscall_entry { + unsigned long ip; + unsigned long nr; + unsigned long p1; + unsigned long p2; + unsigned long p3; +}; + +struct sysret_entry { + unsigned long ip; + unsigned long ret; +}; + +/* + * Stack-trace entry: + */ + +#define FTRACE_STACK_ENTRIES 8 + +struct stack_entry { + unsigned long caller[FTRACE_STACK_ENTRIES]; +}; + +/* + * The trace entry - the most basic unit of tracing. This is what + * is printed in the end as a single line in the trace output, such as: + * + * bash-15816 [01] 235.197585: idle_cpu <- irq_enter + */ +struct trace_entry { + char type; + char cpu; + char flags; + char preempt_count; + int pid; + cycle_t t; + union { + struct ftrace_entry fn; + struct ctx_switch_entry ctx; + struct special_entry special; + struct stack_entry stack; + struct mmiotrace_rw mmiorw; + struct mmiotrace_map mmiomap; + struct irq_entry irq; + struct fault_entry fault; + struct timer_entry timer; + struct timestamp_entry timestamp; + struct program_entry program; + struct task_entry task; + struct wakeup_entry wakeup; + struct syscall_entry syscall; + struct sysret_entry sysret; + }; +}; + +#define TRACE_ENTRY_SIZE sizeof(struct trace_entry) + +/* + * The CPU trace array - it consists of thousands of trace entries + * plus some other descriptor data: (for example which task started + * the trace, etc.) + */ +struct trace_array_cpu { + struct list_head trace_pages; + atomic_t disabled; + __raw_spinlock_t lock; + struct lock_class_key lock_key; + + /* these fields get copied into max-trace: */ + unsigned trace_head_idx; + unsigned trace_tail_idx; + void *trace_head; /* producer */ + void *trace_tail; /* consumer */ + unsigned long trace_idx; + unsigned long overrun; + unsigned long saved_latency; + unsigned long critical_start; + unsigned long critical_end; + unsigned long critical_sequence; + unsigned long nice; + unsigned long policy; + unsigned long rt_priority; + cycle_t preempt_timestamp; + pid_t pid; + uid_t uid; + char comm[TASK_COMM_LEN]; +}; + +struct trace_iterator; + +/* + * The trace array - an array of per-CPU trace arrays. This is the + * highest level data structure that individual tracers deal with. 
+ * They have on/off state as well: + */ +struct trace_array { + unsigned long entries; + long ctrl; + int cpu; + cycle_t time_start; + struct task_struct *waiter; + struct trace_array_cpu *data[NR_CPUS]; +}; + +/* + * A specific tracer, represented by methods that operate on a trace array: + */ +struct tracer { + const char *name; + void (*init)(struct trace_array *tr); + void (*reset)(struct trace_array *tr); + void (*open)(struct trace_iterator *iter); + void (*pipe_open)(struct trace_iterator *iter); + void (*close)(struct trace_iterator *iter); + void (*start)(struct trace_iterator *iter); + void (*stop)(struct trace_iterator *iter); + ssize_t (*read)(struct trace_iterator *iter, + struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); + void (*ctrl_update)(struct trace_array *tr); +#ifdef CONFIG_FTRACE_STARTUP_TEST + int (*selftest)(struct tracer *trace, + struct trace_array *tr); +#endif + int (*print_line)(struct trace_iterator *iter); + struct tracer *next; + int print_max; +}; + +struct trace_seq { + unsigned char buffer[PAGE_SIZE]; + unsigned int len; + unsigned int readpos; +}; + +/* + * Trace iterator - used by printout routines who present trace + * results to users and which routines might sleep, etc: + */ +struct trace_iterator { + struct trace_array *tr; + struct tracer *trace; + void *private; + long last_overrun[NR_CPUS]; + long overrun[NR_CPUS]; + + /* The below is zeroed out in pipe_read */ + struct trace_seq seq; + struct trace_entry *ent; + int cpu; + + struct trace_entry *prev_ent; + int prev_cpu; + + unsigned long iter_flags; + loff_t pos; + unsigned long next_idx[NR_CPUS]; + struct list_head *next_page[NR_CPUS]; + unsigned next_page_idx[NR_CPUS]; + long idx; +}; + +void tracing_reset(struct trace_array_cpu *data); +int tracing_open_generic(struct inode *inode, struct file *filp); +struct dentry *tracing_init_dentry(void); +void init_tracer_sysprof_debugfs(struct dentry *d_tracer); + +void ftrace(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags); +void tracing_sched_switch_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags); +void tracing_record_cmdline(struct task_struct *tsk); + +void tracing_sched_wakeup_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *wakee, + struct task_struct *cur, + unsigned long flags); +void trace_special(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3); +void trace_function(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags, + unsigned long pc); +void tracing_event_irq(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + int irq, int usermode, + unsigned long retip); +void tracing_event_fault(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + unsigned long retip, + unsigned long error_code, + unsigned long address); +void tracing_event_timer_set(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + ktime_t *expires, void *timer); +void tracing_event_timer_triggered(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + ktime_t *expired, void *timer); +void tracing_event_timestamp(struct trace_array 
*tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + ktime_t *now); +void tracing_event_task_activate(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + struct task_struct *p, + int cpu); +void tracing_event_task_deactivate(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + struct task_struct *p, + int cpu); +void tracing_event_program_event(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + ktime_t *expires, int64_t *delta); +void tracing_event_wakeup(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + pid_t pid, int prio, + int curr_prio); +void tracing_event_syscall(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + unsigned long nr, + unsigned long p1, + unsigned long p2, + unsigned long p3); +void tracing_event_sysret(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + unsigned long ip, + unsigned long ret); + +void tracing_start_cmdline_record(void); +void tracing_stop_cmdline_record(void); +int register_tracer(struct tracer *type); +void unregister_tracer(struct tracer *type); + +extern unsigned long nsecs_to_usecs(unsigned long nsecs); + +extern unsigned long tracing_max_latency; +extern unsigned long tracing_thresh; + +void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); +void update_max_tr_single(struct trace_array *tr, + struct task_struct *tsk, int cpu); + +extern cycle_t ftrace_now(int cpu); + +#ifdef CONFIG_FTRACE +void tracing_start_function_trace(void); +void tracing_stop_function_trace(void); +#else +# define tracing_start_function_trace() do { } while (0) +# define tracing_stop_function_trace() do { } while (0) +#endif + +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +typedef void +(*tracer_switch_func_t)(void *private, + void *__rq, + struct task_struct *prev, + struct task_struct *next); + +struct tracer_switch_ops { + tracer_switch_func_t func; + void *private; + struct tracer_switch_ops *next; +}; + +#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ + +#ifdef CONFIG_DYNAMIC_FTRACE +extern unsigned long ftrace_update_tot_cnt; +#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func +extern int DYN_FTRACE_TEST_NAME(void); +#endif + +#ifdef CONFIG_MMIOTRACE +extern void __trace_mmiotrace_rw(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_rw *rw); +extern void __trace_mmiotrace_map(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_map *map); +#endif + +#ifdef CONFIG_FTRACE_STARTUP_TEST +#ifdef CONFIG_FTRACE +extern int trace_selftest_startup_function(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_IRQSOFF_TRACER +extern int trace_selftest_startup_irqsoff(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_PREEMPT_TRACER +extern int trace_selftest_startup_preemptoff(struct tracer *trace, + struct trace_array *tr); +#endif +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) +extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_SCHED_TRACER +extern int trace_selftest_startup_wakeup(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +extern int trace_selftest_startup_sched_switch(struct tracer *trace, + struct trace_array *tr); +#endif 
+#ifdef CONFIG_SYSPROF_TRACER +extern int trace_selftest_startup_sysprof(struct tracer *trace, + struct trace_array *tr); +#endif +#endif /* CONFIG_FTRACE_STARTUP_TEST */ + +extern void *head_page(struct trace_array_cpu *data); +extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, + size_t cnt); +extern long ns2usecs(cycle_t nsec); + +extern unsigned long trace_flags; + +/* + * trace_iterator_flags is an enumeration that defines bit + * positions into trace_flags that control the output. + * + * NOTE: These bits must match the trace_options array in + * trace.c. + */ +enum trace_iterator_flags { + TRACE_ITER_PRINT_PARENT = 0x01, + TRACE_ITER_SYM_OFFSET = 0x02, + TRACE_ITER_SYM_ADDR = 0x04, + TRACE_ITER_VERBOSE = 0x08, + TRACE_ITER_RAW = 0x10, + TRACE_ITER_HEX = 0x20, + TRACE_ITER_BIN = 0x40, + TRACE_ITER_BLOCK = 0x80, + TRACE_ITER_STACKTRACE = 0x100, + TRACE_ITER_SCHED_TREE = 0x200, +}; + +/* COMPAT FOR 2.6.24 */ +#define define_strict_strtoux(type, valtype) \ +static inline int strict_strtou##type(const char *cp, unsigned int base, valtype *res)\ +{ \ + char *tail; \ + valtype val; \ + size_t len; \ + \ + *res = 0; \ + len = strlen(cp); \ + if (len == 0) \ + return -EINVAL; \ + \ + val = simple_strtoul(cp, &tail, base); \ + if ((*tail == '\0') || \ + ((len == (size_t)(tail - cp) + 1) && (*tail == '\n'))) {\ + *res = val; \ + return 0; \ + } \ + \ + return -EINVAL; \ +} \ + +#define define_strict_strtox(type, valtype) \ +static inline int strict_strto##type(const char *cp, unsigned int base, valtype *res) \ +{ \ + int ret; \ + if (*cp == '-') { \ + ret = strict_strtou##type(cp+1, base, res); \ + if (!ret) \ + *res = -(*res); \ + } else \ + ret = strict_strtou##type(cp, base, res); \ + \ + return ret; \ +} \ + +define_strict_strtoux(l, unsigned long) +define_strict_strtox(l, long) +define_strict_strtoux(ll, unsigned long long) +define_strict_strtox(ll, long long) + +#endif /* _LINUX_KERNEL_TRACE_H */ Index: linux-2.6.24.7-rt27/kernel/trace/trace_functions.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/trace/trace_functions.c 2009-02-08 00:01:09.000000000 -0500 @@ -0,0 +1,78 @@ +/* + * ring buffer based function tracer + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include + +#include "trace.h" + +static void function_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} + +static void start_function_trace(struct trace_array *tr) +{ + function_reset(tr); + tracing_start_cmdline_record(); + tracing_start_function_trace(); +} + +static void stop_function_trace(struct trace_array *tr) +{ + tracing_stop_function_trace(); + tracing_stop_cmdline_record(); +} + +static void function_trace_init(struct trace_array *tr) +{ + if (tr->ctrl) + start_function_trace(tr); +} + +static void function_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_function_trace(tr); +} + +static void function_trace_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_function_trace(tr); + else + stop_function_trace(tr); +} + +static struct tracer function_trace __read_mostly = +{ + .name = "ftrace",
+ .init = function_trace_init, + .reset = function_trace_reset, + .ctrl_update = function_trace_ctrl_update, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_function, +#endif +}; + +static __init int init_function_trace(void) +{ + return register_tracer(&function_trace); +} + +device_initcall(init_function_trace); Index: linux-2.6.24.7-rt27/kernel/trace/trace_irqsoff.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/trace/trace_irqsoff.c 2009-02-08 00:05:24.000000000 -0500 @@ -0,0 +1,513 @@ +/* + * trace irqs off critical timings + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * From code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include + +#include "trace.h" +#include "trace_hist.h" + +static struct trace_array *irqsoff_trace __read_mostly; +static int tracer_enabled __read_mostly; + +static DEFINE_PER_CPU(int, tracing_cpu); + +static DEFINE_RAW_SPINLOCK(max_trace_lock); + +enum { + TRACER_IRQS_OFF = (1 << 1), + TRACER_PREEMPT_OFF = (1 << 2), +}; + +static int trace_type __read_mostly; + +#ifdef CONFIG_PREEMPT_TRACER +static inline int +preempt_trace(void) +{ + return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count()); +} +#else +# define preempt_trace() (0) +#endif + +#ifdef CONFIG_IRQSOFF_TRACER +static inline int +irq_trace(void) +{ + return ((trace_type & TRACER_IRQS_OFF) && + irqs_disabled()); +} +#else +# define irq_trace() (0) +#endif + +/* + * Sequence count - we record it when starting a measurement and + * skip the latency if the sequence has changed - some other section + * did a maximum and could disturb our measurement with serial console + * printouts, etc. Truly coinciding maximum latencies should be rare + * and what happens together happens separately as well, so this doesn't + * decrease the validity of the maximum found: + */ +static __cacheline_aligned_in_smp unsigned long max_sequence; + +#ifdef CONFIG_FTRACE +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (unlikely(!ftrace_enabled)) + return; + + /* + * Does not matter if we preempt. We test the flags + * afterward, to see if irqs are disabled or not. + * If we preempt and get a false positive, the flags + * test will fail. + */ + cpu = raw_smp_processor_id(); + if (likely(!per_cpu(tracing_cpu, cpu))) + return; + + local_save_flags(flags); + /* slight chance to get a false positive on tracing_cpu */ + if (!irqs_disabled_flags(flags)) + return; + + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + trace_function(tr, data, ip, parent_ip, flags, + preempt_count()); + + atomic_dec(&data->disabled); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = irqsoff_tracer_call, +}; +#endif /* CONFIG_FTRACE */ + +/* + * Should this new latency be reported/recorded?
+ */ +static int report_latency(cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tracing_max_latency) + return 0; + } + return 1; +} + +static void +check_critical_timing(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long parent_ip, + int cpu) +{ + unsigned long latency, t0, t1; + cycle_t T0, T1, delta; + unsigned long flags; + + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = data->preempt_timestamp; + T1 = ftrace_now(cpu); + delta = T1-T0; + + local_save_flags(flags); + + if (!report_latency(delta)) + goto out; + + spin_lock_irqsave(&max_trace_lock, flags); + + /* check if we are still the max latency */ + if (!report_latency(delta)) + goto out_unlock; + + trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, + preempt_count()); + + latency = nsecs_to_usecs(delta); + + if (data->critical_sequence != max_sequence) + goto out_unlock; + + tracing_max_latency = delta; + t0 = nsecs_to_usecs(T0); + t1 = nsecs_to_usecs(T1); + + data->critical_end = parent_ip; + + update_max_tr_single(tr, current, cpu); + + max_sequence++; + +out_unlock: + spin_unlock_irqrestore(&max_trace_lock, flags); + +out: + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); + tracing_reset(data); + trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, + preempt_count()); +} + +static inline void +start_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (likely(!tracer_enabled)) + return; + + cpu = raw_smp_processor_id(); + + if (per_cpu(tracing_cpu, cpu)) + return; + + data = tr->data[cpu]; + + if (unlikely(!data) || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); + data->critical_start = parent_ip ? : ip; + tracing_reset(data); + + local_save_flags(flags); + + trace_function(tr, data, ip, parent_ip, flags, preempt_count()); + + per_cpu(tracing_cpu, cpu) = 1; + + atomic_dec(&data->disabled); +} + +static inline void +stop_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + cpu = raw_smp_processor_id(); + /* Always clear the tracing cpu on stopping the trace */ + if (unlikely(per_cpu(tracing_cpu, cpu))) + per_cpu(tracing_cpu, cpu) = 0; + else + return; + + if (!tracer_enabled) + return; + + data = tr->data[cpu]; + + if (unlikely(!data) || unlikely(!head_page(data)) || + !data->critical_start || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + + local_save_flags(flags); + trace_function(tr, data, ip, parent_ip, flags, preempt_count()); + check_critical_timing(tr, data, parent_ip ? 
: ip, cpu); + data->critical_start = 0; + atomic_dec(&data->disabled); +} + +/* start and stop critical timings, used for stoppage (in idle) */ +void start_critical_timings(void) +{ + if (preempt_trace() || irq_trace()) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + + tracing_hist_preempt_start(); +} + +void stop_critical_timings(void) +{ + tracing_hist_preempt_stop(TRACE_STOP); + + if (preempt_trace() || irq_trace()) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +#ifdef CONFIG_IRQSOFF_TRACER +#ifdef CONFIG_PROVE_LOCKING +void time_hardirqs_on(unsigned long a0, unsigned long a1) +{ + tracing_hist_preempt_stop(1); + + if (!preempt_trace() && irq_trace()) + stop_critical_timing(a0, a1); +} + +void time_hardirqs_off(unsigned long a0, unsigned long a1) +{ + if (!preempt_trace() && irq_trace()) + start_critical_timing(a0, a1); + + tracing_hist_preempt_start(); +} + +#else /* !CONFIG_PROVE_LOCKING */ + +/* + * Stubs: + */ + +void early_boot_irqs_off(void) +{ +} + +void early_boot_irqs_on(void) +{ +} + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} + +/* + * We are only interested in hardirq on/off events: + */ +void trace_hardirqs_on(void) +{ + tracing_hist_preempt_stop(1); + + if (!preempt_trace() && irq_trace()) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void trace_hardirqs_off(void) +{ + if (!preempt_trace() && irq_trace()) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + + tracing_hist_preempt_start(); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +void trace_hardirqs_on_caller(unsigned long caller_addr) +{ + tracing_hist_preempt_stop(1); + + if (!preempt_trace() && irq_trace()) + stop_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +void trace_hardirqs_off_caller(unsigned long caller_addr) +{ + if (!preempt_trace() && irq_trace()) + start_critical_timing(CALLER_ADDR0, caller_addr); + + tracing_hist_preempt_start(); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); + +#endif /* CONFIG_PROVE_LOCKING */ +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +void trace_preempt_on(unsigned long a0, unsigned long a1) +{ + tracing_hist_preempt_stop(0); + if (preempt_trace()) + stop_critical_timing(a0, a1); +} + +void trace_preempt_off(unsigned long a0, unsigned long a1) +{ + tracing_hist_preempt_start(); + if (preempt_trace()) + start_critical_timing(a0, a1); +} +#endif /* CONFIG_PREEMPT_TRACER */ + +static void start_irqsoff_tracer(struct trace_array *tr) +{ + register_ftrace_function(&trace_ops); + tracer_enabled = 1; +} + +static void stop_irqsoff_tracer(struct trace_array *tr) +{ + tracer_enabled = 0; + unregister_ftrace_function(&trace_ops); +} + +static void __irqsoff_tracer_init(struct trace_array *tr) +{ + irqsoff_trace = tr; + /* make sure that the tracer is visible */ + smp_wmb(); + + if (tr->ctrl) + start_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_irqsoff_tracer(tr); + else + stop_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_open(struct trace_iterator *iter) +{ + /* stop the trace while dumping */ + if (iter->tr->ctrl) + stop_irqsoff_tracer(iter->tr); +} + +static void irqsoff_tracer_close(struct trace_iterator *iter) +{ + if (iter->tr->ctrl) +
start_irqsoff_tracer(iter->tr); +} + +#ifdef CONFIG_IRQSOFF_TRACER +static void irqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF; + + __irqsoff_tracer_init(tr); +} +static struct tracer irqsoff_tracer __read_mostly = +{ + .name = "irqsoff", + .init = irqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_irqsoff, +#endif +}; +# define register_irqsoff(trace) register_tracer(&trace) +#else +# define register_irqsoff(trace) do { } while (0) +#endif + +#ifdef CONFIG_PREEMPT_TRACER +static void preemptoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptoff_tracer __read_mostly = +{ + .name = "preemptoff", + .init = preemptoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_preemptoff, +#endif +}; +# define register_preemptoff(trace) register_tracer(&trace) +#else +# define register_preemptoff(trace) do { } while (0) +#endif + +#if defined(CONFIG_IRQSOFF_TRACER) && \ + defined(CONFIG_PREEMPT_TRACER) + +static void preemptirqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptirqsoff_tracer __read_mostly = +{ + .name = "preemptirqsoff", + .init = preemptirqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_preemptirqsoff, +#endif +}; + +# define register_preemptirqsoff(trace) register_tracer(&trace) +#else +# define register_preemptirqsoff(trace) do { } while (0) +#endif + +__init static int init_irqsoff_tracer(void) +{ + register_irqsoff(irqsoff_tracer); + register_preemptoff(preemptoff_tracer); + register_preemptirqsoff(preemptirqsoff_tracer); + + return 0; +} +device_initcall(init_irqsoff_tracer); Index: linux-2.6.24.7-rt27/kernel/trace/trace_mmiotrace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/trace/trace_mmiotrace.c 2009-02-08 00:01:09.000000000 -0500 @@ -0,0 +1,295 @@ +/* + * Memory mapped I/O tracing + * + * Copyright (C) 2008 Pekka Paalanen + */ + +#define DEBUG 1 + +#include +#include +#include + +#include "trace.h" + +struct header_iter { + struct pci_dev *dev; +}; + +static struct trace_array *mmio_trace_array; +static bool overrun_detected; + +static void mmio_reset_data(struct trace_array *tr) +{ + int cpu; + + overrun_detected = false; + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} + +static void mmio_trace_init(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + mmio_trace_array = tr; + if (tr->ctrl) { + mmio_reset_data(tr); + enable_mmiotrace(); + } +} + +static void mmio_trace_reset(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + if (tr->ctrl) + disable_mmiotrace(); + mmio_reset_data(tr); + mmio_trace_array = NULL; +} + +static void mmio_trace_ctrl_update(struct trace_array *tr) +{ + pr_debug("in 
%s\n", __func__); + if (tr->ctrl) { + mmio_reset_data(tr); + enable_mmiotrace(); + } else { + disable_mmiotrace(); + } +} + +static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) +{ + int ret = 0; + int i; + resource_size_t start, end; + const struct pci_driver *drv = pci_dev_driver(dev); + + /* XXX: incomplete checks for trace_seq_printf() return value */ + ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", + dev->bus->number, dev->devfn, + dev->vendor, dev->device, dev->irq); + /* + * XXX: is pci_resource_to_user() appropriate, since we are + * supposed to interpret the __ioremap() phys_addr argument based on + * these printed values? + */ + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + (unsigned long long)(start | + (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); + } + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + dev->resource[i].start < dev->resource[i].end ? + (unsigned long long)(end - start) + 1 : 0); + } + if (drv) + ret += trace_seq_printf(s, " %s\n", drv->name); + else + ret += trace_seq_printf(s, " \n"); + return ret; +} + +static void destroy_header_iter(struct header_iter *hiter) +{ + if (!hiter) + return; + pci_dev_put(hiter->dev); + kfree(hiter); +} + +static void mmio_pipe_open(struct trace_iterator *iter) +{ + struct header_iter *hiter; + struct trace_seq *s = &iter->seq; + + trace_seq_printf(s, "VERSION 20070824\n"); + + hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); + if (!hiter) + return; + + hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL); + iter->private = hiter; +} + +/* XXX: This is not called when the pipe is closed! */ +static void mmio_close(struct trace_iterator *iter) +{ + struct header_iter *hiter = iter->private; + destroy_header_iter(hiter); + iter->private = NULL; +} + +static unsigned long count_overruns(struct trace_iterator *iter) +{ + int cpu; + unsigned long cnt = 0; + for_each_online_cpu(cpu) { + cnt += iter->overrun[cpu]; + iter->overrun[cpu] = 0; + } + return cnt; +} + +static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, + char __user *ubuf, size_t cnt, loff_t *ppos) +{ + ssize_t ret; + struct header_iter *hiter = iter->private; + struct trace_seq *s = &iter->seq; + unsigned long n; + + n = count_overruns(iter); + if (n) { + /* XXX: This is later than where events were lost. */ + trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n); + if (!overrun_detected) + pr_warning("mmiotrace has lost events.\n"); + overrun_detected = true; + goto print_out; + } + + if (!hiter) + return 0; + + mmio_print_pcidev(s, hiter->dev); + hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev); + + if (!hiter->dev) { + destroy_header_iter(hiter); + iter->private = NULL; + } + +print_out: + ret = trace_seq_to_user(s, ubuf, cnt); + return (ret == -EBUSY) ? 
0 : ret; +} + +static int mmio_print_rw(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct mmiotrace_rw *rw = &entry->mmiorw; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(entry->t); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret = 1; + + switch (entry->mmiorw.opcode) { + case MMIO_READ: + ret = trace_seq_printf(s, + "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + rw->value, rw->pc, 0); + break; + case MMIO_WRITE: + ret = trace_seq_printf(s, + "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + rw->value, rw->pc, 0); + break; + case MMIO_UNKNOWN_OP: + ret = trace_seq_printf(s, + "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n", + secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, + (rw->value >> 0) & 0xff, rw->pc, 0); + break; + default: + ret = trace_seq_printf(s, "rw what?\n"); + break; + } + if (ret) + return 1; + return 0; +} + +static int mmio_print_map(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct mmiotrace_map *m = &entry->mmiomap; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(entry->t); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret = 1; + + switch (entry->mmiorw.opcode) { + case MMIO_PROBE: + ret = trace_seq_printf(s, + "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", + secs, usec_rem, m->map_id, + (unsigned long long)m->phys, m->virt, m->len, + 0UL, 0); + break; + case MMIO_UNPROBE: + ret = trace_seq_printf(s, + "UNMAP %lu.%06lu %d 0x%lx %d\n", + secs, usec_rem, m->map_id, 0UL, 0); + break; + default: + ret = trace_seq_printf(s, "map what?\n"); + break; + } + if (ret) + return 1; + return 0; +} + +/* return 0 to abort printing without consuming current entry in pipe mode */ +static int mmio_print_line(struct trace_iterator *iter) +{ + switch (iter->ent->type) { + case TRACE_MMIO_RW: + return mmio_print_rw(iter); + case TRACE_MMIO_MAP: + return mmio_print_map(iter); + default: + return 1; /* ignore unknown entries */ + } +} + +static struct tracer mmio_tracer __read_mostly = +{ + .name = "mmiotrace", + .init = mmio_trace_init, + .reset = mmio_trace_reset, + .pipe_open = mmio_pipe_open, + .close = mmio_close, + .read = mmio_read, + .ctrl_update = mmio_trace_ctrl_update, + .print_line = mmio_print_line, +}; + +__init static int init_mmio_trace(void) +{ + return register_tracer(&mmio_tracer); +} +device_initcall(init_mmio_trace); + +void mmio_trace_rw(struct mmiotrace_rw *rw) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data = tr->data[smp_processor_id()]; + __trace_mmiotrace_rw(tr, data, rw); +} + +void mmio_trace_mapping(struct mmiotrace_map *map) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data; + + preempt_disable(); + data = tr->data[smp_processor_id()]; + __trace_mmiotrace_map(tr, data, map); + preempt_enable(); +} Index: linux-2.6.24.7-rt27/kernel/trace/trace_sched_switch.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/trace/trace_sched_switch.c 2009-02-08 00:01:09.000000000 -0500 @@ -0,0 +1,196 @@ +/* + * trace context switch + * + * Copyright (C) 2007 Steven Rostedt + * + */ +#include +#include 
+#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *ctx_trace; +static int __read_mostly tracer_enabled; +static atomic_t sched_ref; + +static void +sched_switch_func(void *private, void *__rq, struct task_struct *prev, + struct task_struct *next) +{ + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + tracing_record_cmdline(prev); + tracing_record_cmdline(next); + + if (!tracer_enabled) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + tracing_sched_switch_trace(tr, data, prev, next, flags); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +static notrace void +sched_switch_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *prev; + struct task_struct *next; + struct rq *__rq; + + if (!atomic_read(&sched_ref)) + return; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, int); + (void)va_arg(*args, long); + __rq = va_arg(*args, typeof(__rq)); + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + /* + * If tracer_switch_func only points to the local + * switch func, it still needs the ptr passed to it. + */ + sched_switch_func(probe_data, __rq, prev, next); +} + +static void sched_switch_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} + +static int tracing_sched_register(void) +{ + int ret; + + ret = marker_probe_register("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + sched_switch_callback, + &ctx_trace); + if (ret) + pr_info("sched trace: Couldn't add marker" + " probe to kernel_sched_schedule\n"); + + return ret; +} + +static void tracing_sched_unregister(void) +{ + marker_probe_unregister("kernel_sched_schedule", + sched_switch_callback, + &ctx_trace); +} + +void tracing_start_sched_switch(void) +{ + long ref; + + ref = atomic_inc_return(&sched_ref); + if (ref == 1) + tracing_sched_register(); +} + +void tracing_stop_sched_switch(void) +{ + long ref; + + ref = atomic_dec_and_test(&sched_ref); + if (ref) + tracing_sched_unregister(); +} + +void tracing_start_cmdline_record(void) +{ + tracing_start_sched_switch(); +} + +void tracing_stop_cmdline_record(void) +{ + tracing_stop_sched_switch(); +} + +static void start_sched_trace(struct trace_array *tr) +{ + sched_switch_reset(tr); + tracer_enabled = 1; + tracing_start_cmdline_record(); +} + +static void stop_sched_trace(struct trace_array *tr) +{ + tracing_stop_cmdline_record(); + tracer_enabled = 0; +} + +static void sched_switch_trace_init(struct trace_array *tr) +{ + ctx_trace = tr; + + if (tr->ctrl) + start_sched_trace(tr); +} + +static void sched_switch_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_sched_trace(tr); +} + +static void sched_switch_trace_ctrl_update(struct trace_array *tr) +{ + /* When starting a new trace, reset the buffers */ + if (tr->ctrl) + start_sched_trace(tr); + else + stop_sched_trace(tr); +} + +static struct tracer sched_switch_trace __read_mostly = +{ + .name = "sched_switch", + .init = sched_switch_trace_init, + .reset = sched_switch_trace_reset, + .ctrl_update = sched_switch_trace_ctrl_update, +#ifdef 
CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_sched_switch, +#endif +}; + +__init static int init_sched_switch_trace(void) +{ + int ret = 0; + + if (atomic_read(&sched_ref)) + ret = tracing_sched_register(); + if (ret) { + pr_info("error registering scheduler trace\n"); + return ret; + } + return register_tracer(&sched_switch_trace); +} +device_initcall(init_sched_switch_trace); Index: linux-2.6.24.7-rt27/kernel/trace/trace_sched_wakeup.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/trace/trace_sched_wakeup.c 2009-02-08 00:05:24.000000000 -0500 @@ -0,0 +1,455 @@ +/* + * trace task wakeup timings + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *wakeup_trace; +static int __read_mostly tracer_enabled; + +static struct task_struct *wakeup_task; +static int wakeup_cpu; +static unsigned wakeup_prio = -1; + +static __raw_spinlock_t wakeup_lock = + (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +static void __wakeup_reset(struct trace_array *tr); + +#ifdef CONFIG_FTRACE +/* + * wakeup uses its own tracer function to keep the overhead down: + */ +static void +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int resched; + int cpu; + unsigned long pc; + + if (likely(!wakeup_task) || !ftrace_enabled) + return; + + pc = preempt_count(); + resched = need_resched(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (unlikely(disabled != 1)) + goto out; + + raw_local_irq_save(flags); + __raw_spin_lock(&wakeup_lock); + + if (unlikely(!wakeup_task)) + goto unlock; + + /* + * The task can't disappear because it needs to + * wake up first, and we have the wakeup_lock. + */ + if (task_cpu(wakeup_task) != cpu) + goto unlock; + + trace_function(tr, data, ip, parent_ip, flags, pc); + + unlock: + __raw_spin_unlock(&wakeup_lock); + raw_local_irq_restore(flags); + + out: + atomic_dec(&data->disabled); + + /* + * To prevent recursion from the scheduler, if the + * resched flag was set before we entered, then + * don't reschedule. + */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = wakeup_tracer_call, +}; +#endif /* CONFIG_FTRACE */ + +/* + * Should this new latency be reported/recorded? + */ +static int report_latency(cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tracing_max_latency) + return 0; + } + return 1; +} + +static void notrace +wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, + struct task_struct *next) +{ + unsigned long latency = 0, t0 = 0, t1 = 0; + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; + struct trace_array_cpu *data; + cycle_t T0, T1, delta; + unsigned long flags; + long disabled; + int cpu; + + if (unlikely(!tracer_enabled)) + return; + + /* + * When we start a new trace, we set wakeup_task to NULL + * and then set tracer_enabled = 1.
We want to make sure + * that another CPU does not see the tracer_enabled = 1 + * and the wakeup_task with an older task, that might + * actually be the same as next. + */ + smp_rmb(); + + if (next != wakeup_task) + return; + + /* The task we are waiting for is waking up */ + data = tr->data[wakeup_cpu]; + + /* disable local data, not wakeup_cpu data */ + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&tr->data[cpu]->disabled); + if (likely(disabled != 1)) + goto out; + + local_irq_save(flags); + __raw_spin_lock(&wakeup_lock); + + /* We could race with grabbing wakeup_lock */ + if (unlikely(!tracer_enabled || next != wakeup_task)) + goto out_unlock; + + trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags, + preempt_count()); + + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = data->preempt_timestamp; + T1 = ftrace_now(cpu); + delta = T1-T0; + + if (!report_latency(delta)) + goto out_unlock; + + latency = nsecs_to_usecs(delta); + + tracing_max_latency = delta; + t0 = nsecs_to_usecs(T0); + t1 = nsecs_to_usecs(T1); + + update_max_tr(tr, wakeup_task, wakeup_cpu); + +out_unlock: + __wakeup_reset(tr); + __raw_spin_unlock(&wakeup_lock); + local_irq_restore(flags); +out: + atomic_dec(&tr->data[cpu]->disabled); +} + +static notrace void +sched_switch_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *prev; + struct task_struct *next; + struct rq *__rq; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, int); + (void)va_arg(*args, long); + __rq = va_arg(*args, typeof(__rq)); + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + tracing_record_cmdline(prev); + + /* + * If tracer_switch_func only points to the local + * switch func, it still needs the ptr passed to it. + */ + wakeup_sched_switch(probe_data, __rq, prev, next); +} + +static void __wakeup_reset(struct trace_array *tr) +{ + struct trace_array_cpu *data; + int cpu; + + for_each_possible_cpu(cpu) { + data = tr->data[cpu]; + tracing_reset(data); + } + + wakeup_cpu = -1; + wakeup_prio = -1; + + if (wakeup_task) + put_task_struct(wakeup_task); + + wakeup_task = NULL; +} + +static void wakeup_reset(struct trace_array *tr) +{ + unsigned long flags; + + local_irq_save(flags); + __raw_spin_lock(&wakeup_lock); + __wakeup_reset(tr); + __raw_spin_unlock(&wakeup_lock); + local_irq_restore(flags); +} + +static void +wakeup_check_start(struct trace_array *tr, struct task_struct *p, + struct task_struct *curr) +{ + int cpu = smp_processor_id(); + unsigned long flags; + long disabled; + + if (likely(!rt_task(p)) || + p->prio >= wakeup_prio || + p->prio >= curr->prio) + return; + + disabled = atomic_inc_return(&tr->data[cpu]->disabled); + if (unlikely(disabled != 1)) + goto out; + + /* interrupts should be off from try_to_wake_up */ + __raw_spin_lock(&wakeup_lock); + + /* check for races. 
*/ + if (!tracer_enabled || p->prio >= wakeup_prio) + goto out_locked; + + /* reset the trace */ + __wakeup_reset(tr); + + wakeup_cpu = task_cpu(p); + wakeup_prio = p->prio; + + wakeup_task = p; + get_task_struct(wakeup_task); + + local_save_flags(flags); + + tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); + trace_function(tr, tr->data[wakeup_cpu], + CALLER_ADDR1, CALLER_ADDR2, flags, preempt_count()); + +out_locked: + __raw_spin_unlock(&wakeup_lock); +out: + atomic_dec(&tr->data[cpu]->disabled); +} + +static notrace void +wake_up_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array **ptr = probe_data; + struct trace_array *tr = *ptr; + struct task_struct *curr; + struct task_struct *task; + struct rq *__rq; + + if (likely(!tracer_enabled)) + return; + + /* Skip pid %d state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, long); + /* now get the meat: "rq %p task %p rq->curr %p" */ + __rq = va_arg(*args, typeof(__rq)); + task = va_arg(*args, typeof(task)); + curr = va_arg(*args, typeof(curr)); + + tracing_record_cmdline(task); + tracing_record_cmdline(curr); + + wakeup_check_start(tr, task, curr); +} + +static void start_wakeup_tracer(struct trace_array *tr) +{ + int ret; + + ret = marker_probe_register("kernel_sched_wakeup", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &wakeup_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup\n"); + return; + } + + ret = marker_probe_register("kernel_sched_wakeup_new", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &wakeup_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = marker_probe_register("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + sched_switch_callback, + &wakeup_trace); + if (ret) { + pr_info("sched trace: Couldn't add marker" + " probe to kernel_sched_schedule\n"); + goto fail_deprobe_wake_new; + } + + wakeup_reset(tr); + + /* + * Don't let the tracer_enabled = 1 show up before + * the wakeup_task is reset. This may be overkill since + * wakeup_reset does a spin_unlock after setting the + * wakeup_task to NULL, but I want to be safe. + * This is a slow path anyway. 
+ */ + smp_wmb(); + + tracer_enabled = 1; + register_ftrace_function(&trace_ops); + + return; +fail_deprobe_wake_new: + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &wakeup_trace); +fail_deprobe: + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &wakeup_trace); +} + +static void stop_wakeup_tracer(struct trace_array *tr) +{ + tracer_enabled = 0; + unregister_ftrace_function(&trace_ops); + marker_probe_unregister("kernel_sched_schedule", + sched_switch_callback, + &wakeup_trace); + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &wakeup_trace); + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &wakeup_trace); +} + +static void wakeup_tracer_init(struct trace_array *tr) +{ + wakeup_trace = tr; + + if (tr->ctrl) + start_wakeup_tracer(tr); +} + +static void wakeup_tracer_reset(struct trace_array *tr) +{ + if (tr->ctrl) { + stop_wakeup_tracer(tr); + /* make sure we put back any tasks we are tracing */ + wakeup_reset(tr); + } +} + +static void wakeup_tracer_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_wakeup_tracer(tr); + else + stop_wakeup_tracer(tr); +} + +static void wakeup_tracer_open(struct trace_iterator *iter) +{ + /* stop the trace while dumping */ + if (iter->tr->ctrl) + stop_wakeup_tracer(iter->tr); +} + +static void wakeup_tracer_close(struct trace_iterator *iter) +{ + /* forget about any processes we were recording */ + if (iter->tr->ctrl) + start_wakeup_tracer(iter->tr); +} + +static struct tracer wakeup_tracer __read_mostly = +{ + .name = "wakeup", + .init = wakeup_tracer_init, + .reset = wakeup_tracer_reset, + .open = wakeup_tracer_open, + .close = wakeup_tracer_close, + .ctrl_update = wakeup_tracer_ctrl_update, + .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif +}; + +__init static int init_wakeup_tracer(void) +{ + int ret; + + ret = register_tracer(&wakeup_tracer); + if (ret) + return ret; + + return 0; +} +device_initcall(init_wakeup_tracer); Index: linux-2.6.24.7-rt27/kernel/trace/trace_selftest.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/trace/trace_selftest.c 2009-02-08 00:01:15.000000000 -0500 @@ -0,0 +1,572 @@ +/* Include in trace.c */ + +#include +#include + +static inline int trace_valid_entry(struct trace_entry *entry) +{ + switch (entry->type) { + case TRACE_FN: + case TRACE_CTX: + case TRACE_WAKE: + case TRACE_STACK: + case TRACE_SPECIAL: + case TRACE_IRQ: + case TRACE_FAULT: + case TRACE_TIMER_SET: + case TRACE_TIMER_TRIG: + case TRACE_TIMESTAMP: + case TRACE_TASK_ACT: + case TRACE_TASK_DEACT: + case TRACE_SYSCALL: + case TRACE_SYSRET: + return 1; + } + return 0; +} + +static int +trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) +{ + struct trace_entry *entries; + struct page *page; + int idx = 0; + int i; + + BUG_ON(list_empty(&data->trace_pages)); + page = list_entry(data->trace_pages.next, struct page, lru); + entries = page_address(page); + + check_pages(data); + if (head_page(data) != entries) + goto failed; + + /* + * The starting trace buffer always has valid elements, + * if any element exists. + */ + entries = head_page(data); + + for (i = 0; i < tr->entries; i++) { + + if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) { + printk(KERN_CONT ".. 
invalid entry %d ", + entries[idx].type); + goto failed; + } + + idx++; + if (idx >= ENTRIES_PER_PAGE) { + page = virt_to_page(entries); + if (page->lru.next == &data->trace_pages) { + if (i != tr->entries - 1) { + printk(KERN_CONT ".. entries buffer mismatch"); + goto failed; + } + } else { + page = list_entry(page->lru.next, struct page, lru); + entries = page_address(page); + } + idx = 0; + } + } + + page = virt_to_page(entries); + if (page->lru.next != &data->trace_pages) { + printk(KERN_CONT ".. too many entries"); + goto failed; + } + + return 0; + + failed: + /* disable tracing */ + tracing_disabled = 1; + printk(KERN_CONT ".. corrupted trace buffer .. "); + return -1; +} + +/* + * Test the trace buffer to see if all the elements + * are still sane. + */ +static int trace_test_buffer(struct trace_array *tr, unsigned long *count) +{ + unsigned long flags, cnt = 0; + int cpu, ret = 0; + + /* Don't allow flipping of max traces now */ + raw_local_irq_save(flags); + __raw_spin_lock(&ftrace_max_lock); + for_each_possible_cpu(cpu) { + if (!head_page(tr->data[cpu])) + continue; + + cnt += tr->data[cpu]->trace_idx; + + ret = trace_test_buffer_cpu(tr, tr->data[cpu]); + if (ret) + break; + } + __raw_spin_unlock(&ftrace_max_lock); + raw_local_irq_restore(flags); + + if (count) + *count = cnt; + + return ret; +} + +#ifdef CONFIG_FTRACE + +#ifdef CONFIG_DYNAMIC_FTRACE + +#define __STR(x) #x +#define STR(x) __STR(x) + +/* Test dynamic code modification and ftrace filters */ +int trace_selftest_startup_dynamic_tracing(struct tracer *trace, + struct trace_array *tr, + int (*func)(void)) +{ + unsigned long count; + int ret; + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + char *func_name; + + /* The ftrace test PASSED */ + printk(KERN_CONT "PASSED\n"); + pr_info("Testing dynamic ftrace: "); + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + tracer_enabled = 1; + + /* passed in by parameter to keep gcc from optimizing it away */ + func(); + + /* update the records */ + ret = ftrace_force_update(); + if (ret) { + printk(KERN_CONT ".. ftraced failed .. "); + return ret; + } + + /* + * Some archs *cough*PowerPC*cough* add characters to the + * start of the function names. We simply put a '*' to + * accommodate them. + */ + func_name = "*" STR(DYN_FTRACE_TEST_NAME); + + /* filter only on our function */ + ftrace_set_filter(func_name, strlen(func_name), 1); + + /* enable tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for 1/10 of a second */ + msleep(100); + + /* we should have nothing in the buffer */ + ret = trace_test_buffer(tr, &count); + if (ret) + goto out; + + if (count) { + ret = -1; + printk(KERN_CONT ".. filter did not filter .. "); + goto out; + } + + /* call our function again */ + func(); + + /* sleep again */ + msleep(100); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + ftrace_enabled = 0; + + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + /* we should only have one item */ + if (!ret && count != 1) { + printk(KERN_CONT ".. filter failed count=%ld ..", count); + ret = -1; + goto out; + } + out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + /* Enable tracing on all functions again */ + ftrace_set_filter(NULL, 0, 1); + + return ret; +} +#else +# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) +#endif /* CONFIG_DYNAMIC_FTRACE */ +/* + * Simple verification test of ftrace function tracer.
+ * Enable ftrace, sleep 1/10 second, and then read the trace + * buffer to see if all is in order. + */ +int +trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + + /* make sure msleep has been recorded */ + msleep(1); + + /* force the recorded functions to be traced */ + ret = ftrace_force_update(); + if (ret) { + printk(KERN_CONT ".. ftraced failed .. "); + return ret; + } + + /* start the tracing */ + ftrace_enabled = 1; + tracer_enabled = 1; + + tr->ctrl = 1; + trace->init(tr); + /* Sleep for 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + ftrace_enabled = 0; + + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + ret = trace_selftest_startup_dynamic_tracing(trace, tr, + DYN_FTRACE_TEST_NAME); + + out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + /* kill ftrace totally if we failed */ + if (ret) + ftrace_kill(); + + return ret; +} +#endif /* CONFIG_FTRACE */ + +#ifdef CONFIG_IRQSOFF_TRACER +int +trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* reset the max latency */ + tracing_max_latency = 0; + /* disable interrupts for a bit */ + local_irq_disable(); + udelay(100); + local_irq_enable(); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +int +trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* reset the max latency */ + tracing_max_latency = 0; + /* disable preemption for a bit */ + preempt_disable(); + udelay(100); + preempt_enable(); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_PREEMPT_TRACER */ + +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) +int +trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + + /* reset the max latency */ + tracing_max_latency = 0; + + /* disable preemption and interrupts for a bit */ + preempt_disable(); + local_irq_disable(); + udelay(100); + preempt_enable(); + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + + /* stop the tracing.
*/ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (ret) + goto out; + + ret = trace_test_buffer(&max_tr, &count); + if (ret) + goto out; + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + /* do the test by disabling interrupts first this time */ + tracing_max_latency = 0; + tr->ctrl = 1; + trace->ctrl_update(tr); + preempt_disable(); + local_irq_disable(); + udelay(100); + preempt_enable(); + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (ret) + goto out; + + ret = trace_test_buffer(&max_tr, &count); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + out: + trace->reset(tr); + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */ + +#ifdef CONFIG_SCHED_TRACER +static int trace_wakeup_test_thread(void *data) +{ + /* Make this an RT thread, doesn't need to be too high */ + struct sched_param param = { .sched_priority = 5 }; + struct completion *x = data; + + sched_setscheduler(current, SCHED_FIFO, &param); + + /* Let the test know we have a new prio */ + complete(x); + + /* now go to sleep and let the test wake us up */ + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + + /* we are awake, now wait to disappear */ + while (!kthread_should_stop()) { + /* + * This is an RT task, do short sleeps to let + * others run. + */ + msleep(100); + } + + return 0; +} + +int +trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + struct task_struct *p; + struct completion isrt; + unsigned long count; + int ret; + + init_completion(&isrt); + + /* create a high prio thread */ + p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); + if (IS_ERR(p)) { + printk(KERN_CONT "Failed to create ftrace wakeup test thread "); + return -1; + } + + /* make sure the thread is running at an RT prio */ + wait_for_completion(&isrt); + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* reset the max latency */ + tracing_max_latency = 0; + + /* sleep to let the RT thread sleep too */ + msleep(100); + + /* + * Yes this is slightly racy. It is possible that for some + * strange reason the RT thread we created did not + * call schedule for 100ms after doing the completion, + * and we do a wakeup on a task that already is awake. + * But that is extremely unlikely, and the worst thing that + * happens in such a case is that we disable tracing. + * Honestly, if this race does happen something is horribly + * wrong with the system. + */ + + wake_up_process(p); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + + + trace->reset(tr); + + tracing_max_latency = save_max; + + /* kill the thread */ + kthread_stop(p); + + if (!ret && !count) { + printk(KERN_CONT "..
no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_SCHED_TRACER */ + +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +int +trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ + +#ifdef CONFIG_SYSPROF_TRACER +int +trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + return ret; +} +#endif /* CONFIG_SYSPROF_TRACER */ Index: linux-2.6.24.7-rt27/kernel/trace/trace_selftest_dynamic.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/trace/trace_selftest_dynamic.c 2009-02-08 00:01:09.000000000 -0500 @@ -0,0 +1,7 @@ +#include "trace.h" + +int DYN_FTRACE_TEST_NAME(void) +{ + /* used to call mcount */ + return 0; +} Index: linux-2.6.24.7-rt27/lib/Kconfig.debug =================================================================== --- linux-2.6.24.7-rt27.orig/lib/Kconfig.debug 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/lib/Kconfig.debug 2009-02-08 00:05:09.000000000 -0500 @@ -189,6 +189,21 @@ config DEBUG_RT_MUTEXES help This allows rt mutex semantics violations and rt mutex related deadlocks (lockups) to be detected and reported automatically. + When realtime preemption is enabled, this includes spinlocks, + rwlocks, mutexes and (rw)semaphores. + +config RTMUTEX_CHECK + bool "RT Mutex integrity checker" + depends on PREEMPT_RT + default y + help + When PREEMPT_RT is configured, most spinlocks and semaphores + are converted into mutexes. There still exist true spin locks + and old-style semaphores. There are places in the kernel that + pass the lock via pointer and typecast it back. This + can circumvent the compiler conversions. This option will add + a magic number to all converted locks and check to make sure + the lock is appropriate for the function being used. config DEBUG_PI_LIST bool @@ -212,7 +227,7 @@ config DEBUG_SPINLOCK config DEBUG_MUTEXES bool "Mutex debugging: basic checks" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && !PREEMPT_RT help This feature allows mutex semantics violations to be detected and reported. @@ -227,6 +242,17 @@ config DEBUG_SEMAPHORE verbose debugging messages. If you suspect a semaphore problem or a kernel hacker asks for this option then say Y. Otherwise say N. +config RWLOCK_TORTURE_TEST + tristate "torture tests for Priority Inheritance RW locks" + depends on DEBUG_KERNEL + depends on m + default n + help + This option provides a kernel module that runs a torture test + of several threads that try to grab mutexes, rwlocks and rwsems. + + Say N if you are unsure.
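As a rough illustration of the kind of load such a torture test generates, here is a minimal, self-contained sketch. It is hedged: the lock and thread names are hypothetical, and this is not the module that the option above actually builds; it merely shows the contend-hold-release pattern the help text describes.

#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/delay.h>

/* hypothetical stand-in for one of the PI-converted lock types */
static DEFINE_MUTEX(torture_mutex);

static int torture_thread(void *unused)
{
	while (!kthread_should_stop()) {
		/* several such threads contend on the same lock */
		mutex_lock(&torture_mutex);
		msleep(1);	/* hold it briefly to force PI boosting */
		mutex_unlock(&torture_mutex);
		msleep(1);	/* let the other threads win sometimes */
	}
	return 0;
}

/* each worker would be started along the lines of:
 *	kthread_run(torture_thread, NULL, "lock-torture/%d", i);
 */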
+ config DEBUG_LOCK_ALLOC bool "Lock debugging: detect incorrect freeing of live locks" depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT @@ -517,4 +543,6 @@ config FAULT_INJECTION_STACKTRACE_FILTER help Provide stacktrace filter for fault-injection capabilities +source kernel/trace/Kconfig + source "samples/Kconfig" Index: linux-2.6.24.7-rt27/lib/Makefile =================================================================== --- linux-2.6.24.7-rt27.orig/lib/Makefile 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/lib/Makefile 2009-02-08 00:03:02.000000000 -0500 @@ -3,11 +3,20 @@ # lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o dump_stack.o \ + rbtree.o radix-tree.o dump_stack.o lock_list.o \ idr.o int_sqrt.o extable.o prio_tree.o \ sha1.o irq_regs.o reciprocal_div.o argv_split.o \ proportions.o prio_heap.o +ifdef CONFIG_FTRACE +# Do not profile string.o, since it may be used in early boot or vdso +CFLAGS_REMOVE_string.o = -pg +# Also do not profile any debug utilities +CFLAGS_REMOVE_spinlock_debug.o = -pg +CFLAGS_REMOVE_list_debug.o = -pg +CFLAGS_REMOVE_debugobjects.o = -pg +endif + lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o @@ -27,7 +36,8 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o -lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o +obj-$(CONFIG_PREEMPT_RT) += plist.o +obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o Index: linux-2.6.24.7-rt27/mm/page-writeback.c =================================================================== --- linux-2.6.24.7-rt27.orig/mm/page-writeback.c 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/mm/page-writeback.c 2009-02-08 00:03:09.000000000 -0500 @@ -120,8 +120,6 @@ static void background_writeout(unsigned static struct prop_descriptor vm_completions; static struct prop_descriptor vm_dirties; -static unsigned long determine_dirtyable_memory(void); - /* * couple the period to the dirty_ratio: * @@ -280,7 +278,13 @@ static unsigned long highmem_dirtyable_m #endif } -static unsigned long determine_dirtyable_memory(void) +/** + * determine_dirtyable_memory - amount of memory that may be used + * + * Returns the number of pages that can currently be freed and used + * by the kernel for direct mappings. + */ +unsigned long determine_dirtyable_memory(void) { unsigned long x; @@ -1004,9 +1008,11 @@ int __set_page_dirty_nobuffers(struct pa if (!mapping) return 1; - write_lock_irq(&mapping->tree_lock); + lock_page_ref_irq(page); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate?
*/ + DEFINE_RADIX_TREE_CONTEXT(ctx, &mapping->page_tree); + BUG_ON(mapping2 != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); if (mapping_cap_account_dirty(mapping)) { @@ -1015,10 +1021,12 @@ int __set_page_dirty_nobuffers(struct pa BDI_RECLAIMABLE); task_io_account_write(PAGE_CACHE_SIZE); } - radix_tree_tag_set(&mapping->page_tree, + radix_tree_lock(&ctx); + radix_tree_tag_set(ctx.tree, page_index(page), PAGECACHE_TAG_DIRTY); + radix_tree_unlock(&ctx); } - write_unlock_irq(&mapping->tree_lock); + unlock_page_ref_irq(page); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -1174,18 +1182,21 @@ int test_clear_page_writeback(struct pag struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; - write_lock_irqsave(&mapping->tree_lock, flags); + lock_page_ref_irqsave(page, flags); ret = TestClearPageWriteback(page); if (ret) { - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + DEFINE_RADIX_TREE_CONTEXT(ctx, &mapping->page_tree); + + radix_tree_lock(&ctx); + radix_tree_tag_clear(ctx.tree, page_index(page), PAGECACHE_TAG_WRITEBACK); + radix_tree_unlock(&ctx); if (bdi_cap_writeback_dirty(bdi)) { __dec_bdi_stat(bdi, BDI_WRITEBACK); __bdi_writeout_inc(bdi); } } - write_unlock_irqrestore(&mapping->tree_lock, flags); + unlock_page_ref_irqrestore(page, flags); } else { ret = TestClearPageWriteback(page); } @@ -1202,21 +1213,25 @@ int test_set_page_writeback(struct page if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; + DEFINE_RADIX_TREE_CONTEXT(ctx, &mapping->page_tree); - write_lock_irqsave(&mapping->tree_lock, flags); + lock_page_ref_irqsave(page, flags); ret = TestSetPageWriteback(page); if (!ret) { - radix_tree_tag_set(&mapping->page_tree, - page_index(page), + radix_tree_lock(&ctx); + radix_tree_tag_set(ctx.tree, page_index(page), PAGECACHE_TAG_WRITEBACK); + radix_tree_unlock(&ctx); if (bdi_cap_writeback_dirty(bdi)) __inc_bdi_stat(bdi, BDI_WRITEBACK); } - if (!PageDirty(page)) - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + if (!PageDirty(page)) { + radix_tree_lock(&ctx); + radix_tree_tag_clear(ctx.tree, page_index(page), PAGECACHE_TAG_DIRTY); - write_unlock_irqrestore(&mapping->tree_lock, flags); + radix_tree_unlock(&ctx); + } + unlock_page_ref_irqrestore(page, flags); } else { ret = TestSetPageWriteback(page); } Index: linux-2.6.24.7-rt27/scripts/Makefile.lib =================================================================== --- linux-2.6.24.7-rt27.orig/scripts/Makefile.lib 2009-02-08 00:00:27.000000000 -0500 +++ linux-2.6.24.7-rt27/scripts/Makefile.lib 2009-02-08 00:01:09.000000000 -0500 @@ -90,7 +90,8 @@ basename_flags = -D"KBUILD_BASENAME=KBUI modname_flags = $(if $(filter 1,$(words $(modname))),\ -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))") -_c_flags = $(KBUILD_CFLAGS) $(ccflags-y) $(CFLAGS_$(basetarget).o) +orig_c_flags = $(KBUILD_CFLAGS) $(ccflags-y) $(CFLAGS_$(basetarget).o) +_c_flags = $(filter-out $(CFLAGS_REMOVE_$(basetarget).o), $(orig_c_flags)) _a_flags = $(KBUILD_AFLAGS) $(asflags-y) $(AFLAGS_$(basetarget).o) _cpp_flags = $(KBUILD_CPPFLAGS) $(cppflags-y) $(CPPFLAGS_$(@F)) Index: linux-2.6.24.7-rt27/arch/x86/kernel/apic_32.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/apic_32.c 2009-02-08 00:00:26.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/apic_32.c 2009-02-08 00:01:32.000000000 -0500 @@ -45,6 +45,8 @@ #include 
"io_ports.h" +#include + /* * Sanity check */ @@ -581,6 +583,7 @@ void fastcall smp_apic_timer_interrupt(s { struct pt_regs *old_regs = set_irq_regs(regs); + ftrace_event_irq(-1, user_mode(regs), regs->eip); /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. @@ -1308,6 +1311,7 @@ void smp_error_interrupt(struct pt_regs */ printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); + dump_stack(); irq_exit(); } Index: linux-2.6.24.7-rt27/arch/x86/kernel/irq_32.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/irq_32.c 2009-02-08 00:00:26.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/irq_32.c 2009-02-08 00:02:55.000000000 -0500 @@ -16,6 +16,8 @@ #include #include +#include + #include #include @@ -77,6 +79,10 @@ fastcall unsigned int do_IRQ(struct pt_r u32 *isp; #endif +#ifdef CONFIG_X86_LOCAL_APIC + irq_show_regs_callback(smp_processor_id(), regs); +#endif + if (unlikely((unsigned)irq >= NR_IRQS)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", __FUNCTION__, irq); @@ -85,6 +91,7 @@ fastcall unsigned int do_IRQ(struct pt_r old_regs = set_irq_regs(regs); irq_enter(); + ftrace_event_irq(irq, user_mode(regs), regs->eip); #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ { @@ -93,7 +100,7 @@ fastcall unsigned int do_IRQ(struct pt_r __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (THREAD_SIZE - 1)); if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { - printk("do_IRQ: stack overflow: %ld\n", + printk("BUG: do_IRQ: stack overflow: %ld\n", esp - sizeof(struct thread_info)); dump_stack(); } Index: linux-2.6.24.7-rt27/arch/x86/kernel/irq_64.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/irq_64.c 2009-02-08 00:00:26.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/irq_64.c 2009-02-08 00:01:32.000000000 -0500 @@ -18,6 +18,8 @@ #include #include +#include + atomic_t irq_err_count; #ifdef CONFIG_DEBUG_STACKOVERFLOW @@ -145,10 +147,14 @@ asmlinkage unsigned int do_IRQ(struct pt unsigned vector = ~regs->orig_rax; unsigned irq; + irq_show_regs_callback(smp_processor_id(), regs); + exit_idle(); irq_enter(); irq = __get_cpu_var(vector_irq)[vector]; + ftrace_event_irq(irq, user_mode(regs), regs->rip); + #ifdef CONFIG_DEBUG_STACKOVERFLOW stack_overflow_check(regs); #endif Index: linux-2.6.24.7-rt27/arch/x86/kernel/traps_32.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/traps_32.c 2009-02-08 00:00:26.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/traps_32.c 2009-02-08 00:04:52.000000000 -0500 @@ -30,6 +30,8 @@ #include #include +#include + #ifdef CONFIG_EISA #include #include @@ -237,6 +239,7 @@ show_trace_log_lvl(struct task_struct *t { dump_trace(task, regs, stack, &print_trace_ops, log_lvl); printk("%s =======================\n", log_lvl); + print_preempt_trace(task); } void show_trace(struct task_struct *task, struct pt_regs *regs, @@ -266,8 +269,15 @@ static void show_stack_log_lvl(struct ta printk("\n%s ", log_lvl); printk("%08lx ", *stack++); } + + pause_on_oops_head(); + printk("\n%sCall Trace:\n", log_lvl); show_trace_log_lvl(task, regs, esp, log_lvl); + + pause_on_oops_tail(); + + debug_show_held_locks(task); } void show_stack(struct task_struct *task, unsigned long *esp) @@ -293,6 +303,12 @@ void dump_stack(void) EXPORT_SYMBOL(dump_stack); 
+#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_EVENT_TRACE) +extern unsigned long worst_stack_left; +#else +# define worst_stack_left -1L +#endif + void show_registers(struct pt_regs *regs) { int i; @@ -362,19 +378,21 @@ void die(const char * str, struct pt_reg u32 lock_owner; int lock_owner_depth; } die = { - .lock = __RAW_SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED(die.lock), .lock_owner = -1, .lock_owner_depth = 0 }; static int die_counter; unsigned long flags; + ftrace_stop(); + oops_enter(); if (die.lock_owner != raw_smp_processor_id()) { console_verbose(); raw_local_irq_save(flags); - __raw_spin_lock(&die.lock); + spin_lock(&die.lock); die.lock_owner = smp_processor_id(); die.lock_owner_depth = 0; bust_spinlocks(1); @@ -423,7 +441,7 @@ void die(const char * str, struct pt_reg bust_spinlocks(0); die.lock_owner = -1; add_taint(TAINT_DIE); - __raw_spin_unlock(&die.lock); + spin_unlock(&die.lock); raw_local_irq_restore(flags); if (!regs) @@ -463,6 +481,11 @@ static void __kprobes do_trap(int trapnr if (!user_mode(regs)) goto kernel_trap; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + trap_signal: { /* * We want error_code and trap_no set for userspace faults and @@ -720,6 +743,7 @@ void __kprobes die_nmi(struct pt_regs *r crash_kexec(regs); } + nmi_exit(); do_exit(SIGSEGV); } @@ -769,6 +793,8 @@ fastcall __kprobes void do_nmi(struct pt nmi_enter(); + ftrace_event_irq(-1, user_mode(regs), regs->eip); + cpu = smp_processor_id(); ++nmi_count(cpu); Index: linux-2.6.24.7-rt27/arch/x86/kernel/traps_64.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/traps_64.c 2009-02-08 00:00:26.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/traps_64.c 2009-02-08 00:05:05.000000000 -0500 @@ -33,6 +33,8 @@ #include #include +#include + #if defined(CONFIG_EDAC) #include #endif @@ -80,20 +82,22 @@ static inline void conditional_sti(struc local_irq_enable(); } -static inline void preempt_conditional_sti(struct pt_regs *regs) +static inline void preempt_conditional_sti(struct pt_regs *regs, int stack) { - preempt_disable(); + if (stack) + preempt_disable(); if (regs->eflags & X86_EFLAGS_IF) local_irq_enable(); } -static inline void preempt_conditional_cli(struct pt_regs *regs) +static inline void preempt_conditional_cli(struct pt_regs *regs, int stack) { if (regs->eflags & X86_EFLAGS_IF) local_irq_disable(); /* Make sure to not schedule here because we could be running on an exception stack. */ - preempt_enable_no_resched(); + if (stack) + preempt_enable_no_resched(); } int kstack_depth_to_print = 12; @@ -129,10 +133,14 @@ static unsigned long *in_exception_stack unsigned *usedp, char **idp) { static char ids[][8] = { +#if DEBUG_STACK > 0 [DEBUG_STACK - 1] = "#DB", +#endif [NMI_STACK - 1] = "NMI", [DOUBLEFAULT_STACK - 1] = "#DF", +#if STACKFAULT_STACK > 0 [STACKFAULT_STACK - 1] = "#SS", +#endif [MCE_STACK - 1] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ [N_EXCEPTION_STACKS ... 
N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" @@ -218,7 +226,7 @@ void dump_trace(struct task_struct *tsk, unsigned long *stack, const struct stacktrace_ops *ops, void *data) { - const unsigned cpu = get_cpu(); + const unsigned cpu = raw_smp_processor_id(); unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; unsigned used = 0; struct thread_info *tinfo; @@ -309,7 +317,6 @@ void dump_trace(struct task_struct *tsk, tinfo = task_thread_info(tsk); HANDLE_STACK (valid_stack_ptr(tinfo, stack)); #undef HANDLE_STACK - put_cpu(); } EXPORT_SYMBOL(dump_trace); @@ -347,9 +354,13 @@ static const struct stacktrace_ops print void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) { + pause_on_oops_head(); printk("\nCall Trace:\n"); dump_trace(tsk, regs, stack, &print_trace_ops, NULL); printk("\n"); + pause_on_oops_tail(); + debug_show_held_locks(tsk); + print_preempt_trace(tsk); } static void @@ -357,7 +368,7 @@ _show_stack(struct task_struct *tsk, str { unsigned long *stack; int i; - const int cpu = smp_processor_id(); + const int cpu = raw_smp_processor_id(); unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); @@ -469,7 +480,7 @@ void out_of_line_bug(void) EXPORT_SYMBOL(out_of_line_bug); #endif -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static raw_spinlock_t die_lock = RAW_SPIN_LOCK_UNLOCKED(die_lock); static int die_owner = -1; static unsigned int die_nest_count; @@ -483,11 +494,11 @@ unsigned __kprobes long oops_begin(void) /* racy, but better than risking deadlock. */ raw_local_irq_save(flags); cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { + if (!spin_trylock(&die_lock)) { if (cpu == die_owner) /* nested oops. should stop eventually */; else - __raw_spin_lock(&die_lock); + spin_lock(&die_lock); } die_nest_count++; die_owner = cpu; @@ -503,7 +514,7 @@ void __kprobes oops_end(unsigned long fl die_nest_count--; if (!die_nest_count) /* Nest count reaches zero, release the lock. */ - __raw_spin_unlock(&die_lock); + spin_unlock(&die_lock); raw_local_irq_restore(flags); if (panic_on_oops) panic("Fatal exception"); @@ -513,6 +524,9 @@ void __kprobes oops_end(unsigned long fl void __kprobes __die(const char * str, struct pt_regs * regs, long err) { static int die_counter; + + ftrace_stop(); + printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); #ifdef CONFIG_PREEMPT printk("PREEMPT "); @@ -660,9 +674,9 @@ asmlinkage void do_stack_segment(struct if (notify_die(DIE_TRAP, "stack segment", regs, error_code, 12, SIGBUS) == NOTIFY_STOP) return; - preempt_conditional_sti(regs); + preempt_conditional_sti(regs, STACKFAULT_STACK); do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, STACKFAULT_STACK); } asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) @@ -782,6 +796,8 @@ asmlinkage __kprobes void default_do_nmi cpu = smp_processor_id(); + ftrace_event_irq(-1, user_mode(regs), regs->rip); + /* Only the BSP gets external NMIs from the system. 
*/ if (!cpu) reason = get_nmi_reason(); @@ -820,9 +836,9 @@ asmlinkage void __kprobes do_int3(struct if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { return; } - preempt_conditional_sti(regs); + preempt_conditional_sti(regs, DEBUG_STACK); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, DEBUG_STACK); } /* Help handler running on IST stack to switch back to user stack @@ -862,7 +878,7 @@ asmlinkage void __kprobes do_debug(struc SIGTRAP) == NOTIFY_STOP) return; - preempt_conditional_sti(regs); + preempt_conditional_sti(regs, DEBUG_STACK); /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { @@ -907,13 +923,13 @@ asmlinkage void __kprobes do_debug(struc clear_dr7: set_debugreg(0UL, 7); - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, DEBUG_STACK); return; clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->eflags &= ~TF_MASK; - preempt_conditional_cli(regs); + preempt_conditional_cli(regs, DEBUG_STACK); } static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) @@ -1122,6 +1138,7 @@ void __init trap_init(void) /* * Should be a barrier for any external CPU state. */ + allocate_stacks(0); cpu_init(); } Index: linux-2.6.24.7-rt27/arch/x86/mm/fault_32.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/mm/fault_32.c 2009-02-08 00:00:26.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/mm/fault_32.c 2009-02-08 00:03:13.000000000 -0500 @@ -27,6 +27,8 @@ #include #include +#include + #include #include #include @@ -311,6 +313,8 @@ fastcall void __kprobes do_page_fault(st /* get the address */ address = read_cr2(); + ftrace_event_fault(regs->eip, error_code, address); + tsk = current; si_code = SEGV_MAPERR; @@ -354,7 +358,7 @@ fastcall void __kprobes do_page_fault(st * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto bad_area_nosemaphore; /* When running in the kernel we expect faults to occur only to @@ -498,6 +502,7 @@ bad_area_nosemaphore: nr = (address - idt_descr.address) >> 3; if (nr == 6) { + zap_rt_locks(); do_invalid_op(regs, 0); return; } Index: linux-2.6.24.7-rt27/arch/x86/mm/fault_64.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/mm/fault_64.c 2009-02-08 00:00:26.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/mm/fault_64.c 2009-02-08 00:03:13.000000000 -0500 @@ -27,6 +27,8 @@ #include #include +#include + #include #include #include @@ -316,6 +318,8 @@ asmlinkage void __kprobes do_page_fault( /* get the address */ address = read_cr2(); + ftrace_event_fault(regs->rip, error_code, address); + info.si_code = SEGV_MAPERR; @@ -365,7 +369,7 @@ asmlinkage void __kprobes do_page_fault( * If we're in an interrupt or have no user * context, we must not take the fault.. 
*/ - if (unlikely(in_atomic() || !mm)) + if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) goto bad_area_nosemaphore; /* Index: linux-2.6.24.7-rt27/kernel/trace/trace_events.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/trace/trace_events.c 2009-02-08 00:04:45.000000000 -0500 @@ -0,0 +1,702 @@ +/* + * trace task events + * + * Copyright (C) 2007 Steven Rostedt + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array __read_mostly *events_trace; +static int __read_mostly tracer_enabled; +static atomic_t event_ref; + +static void event_reset(struct trace_array *tr) +{ + struct trace_array_cpu *data; + int cpu; + + for_each_possible_cpu(cpu) { + data = tr->data[cpu]; + tracing_reset(data); + } + + tr->time_start = ftrace_now(raw_smp_processor_id()); +} + +/* HACK */ +void notrace +sys_call(unsigned long nr, unsigned long p1, unsigned long p2, unsigned long p3) +{ + struct trace_array *tr; + struct trace_array_cpu *data; + unsigned long flags; + unsigned long ip; + int cpu; + + if (!tracer_enabled) + return; + + tr = events_trace; + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + atomic_inc(&data->disabled); + if (atomic_read(&data->disabled) != 1) + goto out; + + ip = CALLER_ADDR0; + + tracing_event_syscall(tr, data, flags, ip, nr, p1, p2, p3); + + out: + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86) +void notrace +sys_ia32_call(unsigned long nr, unsigned long p1, unsigned long p2, + unsigned long p3) +{ + struct trace_array *tr; + struct trace_array_cpu *data; + unsigned long flags; + unsigned long ip; + int cpu; + + if (!tracer_enabled) + return; + + tr = events_trace; + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + atomic_inc(&data->disabled); + if (atomic_read(&data->disabled) != 1) + goto out; + + ip = CALLER_ADDR0; + tracing_event_syscall(tr, data, flags, ip, nr | 0x80000000, p1, p2, p3); + + out: + atomic_dec(&data->disabled); + local_irq_restore(flags); +} +#endif + +void notrace +sys_ret(unsigned long ret) +{ + struct trace_array *tr; + struct trace_array_cpu *data; + unsigned long flags; + unsigned long ip; + int cpu; + + if (!tracer_enabled) + return; + + tr = events_trace; + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + atomic_inc(&data->disabled); + if (atomic_read(&data->disabled) != 1) + goto out; + + ip = CALLER_ADDR0; + tracing_event_sysret(tr, data, flags, ip, ret); + + out: + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +#define getarg(arg, ap) arg = va_arg(ap, typeof(arg)); + +static void +event_irq_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long ip, flags; + int irq, user, cpu; + long disable; + + if (!tracer_enabled) + return; + + getarg(irq, *args); + getarg(user, *args); + getarg(ip, *args); + + /* interrupts should be off, we are in an interrupt */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (disable != 1) + goto out; + + local_save_flags(flags); + tracing_event_irq(tr, data, 
flags, CALLER_ADDR1, irq, user, ip); + + out: + atomic_dec(&data->disabled); +} + +static void +event_fault_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long ip, flags, error, addr; + long disable; + int cpu; + + if (!tracer_enabled) + return; + + getarg(ip, *args); + getarg(error, *args); + getarg(addr, *args); + + preempt_disable_notrace(); + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (disable != 1) + goto out; + + local_save_flags(flags); + tracing_event_fault(tr, data, flags, CALLER_ADDR1, ip, error, addr); + + out: + atomic_dec(&data->disabled); + preempt_enable_notrace(); +} + +static void +event_timer_set_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long flags; + ktime_t *expires; + void *timer; + long disable; + int cpu; + + if (!tracer_enabled) + return; + + getarg(expires, *args); + getarg(timer, *args); + + /* interrupts should be off, we are in an interrupt */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (disable != 1) + goto out; + + local_save_flags(flags); + tracing_event_timer_set(tr, data, flags, CALLER_ADDR1, expires, timer); + + out: + atomic_dec(&data->disabled); +} + +static void +event_timer_triggered_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long flags; + ktime_t *expired; + void *timer; + long disable; + int cpu; + + if (!tracer_enabled) + return; + + getarg(expired, *args); + getarg(timer, *args); + + /* interrupts should be off, we are in an interrupt */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (disable != 1) + goto out; + + local_save_flags(flags); + tracing_event_timer_triggered(tr, data, flags, CALLER_ADDR1, expired, timer); + + out: + atomic_dec(&data->disabled); +} + +static void +event_hrtimer_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long flags; + ktime_t *now; + long disable; + int cpu; + + if (!tracer_enabled) + return; + + getarg(now, *args); + + /* interrupts should be off, we are in an interrupt */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (disable != 1) + goto out; + + local_save_flags(flags); + tracing_event_timestamp(tr, data, flags, CALLER_ADDR1, now); + + out: + atomic_dec(&data->disabled); +} + +static void +event_program_event_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long flags; + ktime_t *expires; + int64_t *delta; + long disable; + int cpu; + + if (!tracer_enabled) + return; + + getarg(expires, *args); + getarg(delta, *args); + + /* interrupts should be off, we are in an interrupt */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (disable != 1) + goto out; + + local_save_flags(flags); + tracing_event_program_event(tr, data, flags, CALLER_ADDR1, expires, delta); + + out: + atomic_dec(&data->disabled); +} + + +static void 
+event_task_activate_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long flags; + struct task_struct *p; + long disable; + int cpu, rqcpu; + + if (!tracer_enabled) + return; + + getarg(p, *args); + getarg(rqcpu, *args); + + /* interrupts should be off, we are in an interrupt */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (disable != 1) + goto out; + + local_save_flags(flags); + tracing_event_task_activate(tr, data, flags, CALLER_ADDR1, p, rqcpu); + + out: + atomic_dec(&data->disabled); +} + +static void +event_task_deactivate_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long flags; + struct task_struct *p; + long disable; + int cpu, rqcpu; + + if (!tracer_enabled) + return; + + getarg(p, *args); + getarg(rqcpu, *args); + + /* interrupts should be off, we are in an interrupt */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (disable != 1) + goto out; + + local_save_flags(flags); + tracing_event_task_deactivate(tr, data, flags, CALLER_ADDR1, p, rqcpu); + + out: + atomic_dec(&data->disabled); +} + +static void +event_wakeup_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long flags; + struct task_struct *wakee, *curr; + long disable, ignore2; + void *ignore3; + int ignore1; + int cpu; + + if (!tracer_enabled) + return; + + getarg(ignore1, *args); + getarg(ignore2, *args); + getarg(ignore3, *args); + + getarg(wakee, *args); + getarg(curr, *args); + + /* interrupts should be disabled */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (unlikely(disable != 1)) + goto out; + + local_save_flags(flags); + /* record process's command line */ + tracing_record_cmdline(wakee); + tracing_record_cmdline(curr); + + tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); + + out: + atomic_dec(&data->disabled); +} +static void +event_ctx_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct trace_array *tr = probe_data; + struct trace_array_cpu *data; + unsigned long flags; + struct task_struct *prev; + struct task_struct *next; + long disable, ignore2; + void *ignore3; + int ignore1; + int cpu; + + if (!tracer_enabled) + return; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + getarg(ignore1, *args); + getarg(ignore1, *args); + getarg(ignore2, *args); + getarg(ignore3, *args); + + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + tracing_record_cmdline(prev); + tracing_record_cmdline(next); + + /* interrupts should be disabled */ + cpu = smp_processor_id(); + data = tr->data[cpu]; + disable = atomic_inc_return(&data->disabled); + + if (unlikely(disable != 1)) + goto out; + + local_save_flags(flags); + tracing_sched_switch_trace(tr, data, prev, next, flags); + out: + atomic_dec(&data->disabled); +} + +static int event_register_marker(const char *name, const char *format, + marker_probe_func *probe, void *data) +{ + int ret; + + ret = marker_probe_register(name, format, probe, data); + if (ret) { + pr_info("event trace: Couldn't add marker" + " probe to %s\n", name); + return ret; + } + + return 0; +} + 
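event_register_marker() above is a thin wrapper around marker_probe_register() that logs a failure and passes the error back. For orientation, a minimal standalone sketch of the same probe API as it is used throughout this file; the probe signature and the registration call mirror the code above, while example_probe/example_register are illustrative names only:

/*
 * Usage sketch of the marker probe API wrapped by
 * event_register_marker() above. The probe receives its private
 * data plus a va_list holding the arguments described by the
 * marker's format string, here "%d %d %ld".
 */
static void example_probe(void *probe_data, void *call_data,
			  const char *format, va_list *args)
{
	int irq  = va_arg(*args, int);			/* first  %d  */
	int user = va_arg(*args, int);			/* second %d  */
	unsigned long ip = va_arg(*args, unsigned long);/* the   %ld  */

	(void)irq; (void)user; (void)ip;	/* a real probe records these */
}

static int example_register(void *private)
{
	/* name and format must match the trace_mark() call site */
	return marker_probe_register("ftrace_event_irq", "%d %d %ld",
				     example_probe, private);
}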
+static void event_tracer_register(struct trace_array *tr) +{ + int ret; + + ret = event_register_marker("ftrace_event_irq", "%d %d %ld", + event_irq_callback, tr); + if (ret) + return; + + ret = event_register_marker("ftrace_event_fault", "%ld %ld %ld", + event_fault_callback, tr); + if (ret) + goto out1; + + ret = event_register_marker("ftrace_event_timer_set", "%p %p", + event_timer_set_callback, tr); + if (ret) + goto out2; + + ret = event_register_marker("ftrace_event_timer_triggered", "%p %p", + event_timer_triggered_callback, tr); + if (ret) + goto out3; + + ret = event_register_marker("ftrace_event_hrtimer", "%p", + event_hrtimer_callback, tr); + if (ret) + goto out4; + + ret = event_register_marker("ftrace_event_task_activate", "%p %d", + event_task_activate_callback, tr); + if (ret) + goto out5; + + ret = event_register_marker("ftrace_event_task_deactivate", "%p %d", + event_task_deactivate_callback, tr); + if (ret) + goto out6; + + ret = event_register_marker("kernel_sched_wakeup", + "pid %d state %ld ## rq %p task %p rq->curr %p", + event_wakeup_callback, tr); + if (ret) + goto out7; + + ret = event_register_marker("kernel_sched_wakeup_new", + "pid %d state %ld ## rq %p task %p rq->curr %p", + event_wakeup_callback, tr); + if (ret) + goto out8; + + ret = event_register_marker("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + event_ctx_callback, tr); + if (ret) + goto out9; + + ret = event_register_marker("ftrace_event_timer", "%p %p", + event_program_event_callback, tr); + if (ret) + goto out10; + + return; + + out10: + marker_probe_unregister("kernel_sched_schedule", + event_ctx_callback, tr); + out9: + marker_probe_unregister("kernel_sched_wakeup_new", + event_wakeup_callback, tr); + out8: + marker_probe_unregister("kernel_sched_wakeup", + event_wakeup_callback, tr); + out7: + marker_probe_unregister("ftrace_event_task_deactivate", + event_task_deactivate_callback, tr); + out6: + marker_probe_unregister("ftrace_event_task_activate", + event_task_activate_callback, tr); + out5: + marker_probe_unregister("ftrace_event_hrtimer", + event_hrtimer_callback, tr); + out4: + marker_probe_unregister("ftrace_event_timer_triggered", + event_timer_triggered_callback, tr); + out3: + marker_probe_unregister("ftrace_event_timer_set", + event_timer_set_callback, tr); + out2: + marker_probe_unregister("ftrace_event_fault", + event_fault_callback, tr); + out1: + marker_probe_unregister("ftrace_event_irq", + event_irq_callback, tr); +} + +static void event_tracer_unregister(struct trace_array *tr) +{ + marker_probe_unregister("ftrace_event_timer", + event_program_event_callback, tr); + marker_probe_unregister("kernel_sched_schedule", + event_ctx_callback, tr); + marker_probe_unregister("kernel_sched_wakeup_new", + event_wakeup_callback, tr); + marker_probe_unregister("kernel_sched_wakeup", + event_wakeup_callback, tr); + marker_probe_unregister("ftrace_event_task_deactivate", + event_task_deactivate_callback, tr); + marker_probe_unregister("ftrace_event_task_activate", + event_task_activate_callback, tr); + marker_probe_unregister("ftrace_event_hrtimer", + event_hrtimer_callback, tr); + marker_probe_unregister("ftrace_event_timer_triggered", + event_timer_triggered_callback, tr); + marker_probe_unregister("ftrace_event_timer_set", + event_timer_set_callback, tr); + marker_probe_unregister("ftrace_event_fault", + event_fault_callback, tr); + marker_probe_unregister("ftrace_event_irq", + event_irq_callback, tr); +} + +void 
trace_event_register(struct trace_array *tr) +{ + long ref; + + ref = atomic_inc_return(&event_ref); + if (ref == 1) + event_tracer_register(tr); +} + +void trace_event_unregister(struct trace_array *tr) +{ + long ref; + + ref = atomic_dec_and_test(&event_ref); + if (ref) + event_tracer_unregister(tr); +} + +static void start_event_trace(struct trace_array *tr) +{ + event_reset(tr); + trace_event_register(tr); + tracing_start_function_trace(); + tracer_enabled = 1; +} + +static void stop_event_trace(struct trace_array *tr) +{ + tracer_enabled = 0; + tracing_stop_function_trace(); + trace_event_unregister(tr); +} + +static void event_trace_init(struct trace_array *tr) +{ + events_trace = tr; + + if (tr->ctrl) + start_event_trace(tr); +} + +static void event_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_event_trace(tr); +} + +static void event_trace_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_event_trace(tr); + else + stop_event_trace(tr); +} + +static void event_trace_open(struct trace_iterator *iter) +{ + /* stop the trace while dumping */ + if (iter->tr->ctrl) + tracer_enabled = 0; +} + +static void event_trace_close(struct trace_iterator *iter) +{ + if (iter->tr->ctrl) + tracer_enabled = 1; +} + +static struct tracer event_trace __read_mostly = +{ + .name = "events", + .init = event_trace_init, + .reset = event_trace_reset, + .open = event_trace_open, + .close = event_trace_close, + .ctrl_update = event_trace_ctrl_update, +}; + +__init static int init_event_trace(void) +{ + int ret; + + ret = register_tracer(&event_trace); + if (ret) + return ret; + + return 0; +} + +device_initcall(init_event_trace); Index: linux-2.6.24.7-rt27/kernel/trace/trace_hist.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/trace/trace_hist.c 2009-02-08 00:05:26.000000000 -0500 @@ -0,0 +1,657 @@ +/* + * kernel/trace/trace_hist.c + * + * Add support for histograms of preemption-off latency and + * interrupt-off latency and wakeup latency, it depends on + * Real-Time Preemption Support. + * + * Copyright (C) 2005 MontaVista Software, Inc. + * Yi Yang + * + * Converted to work with the new latency tracer. + * Copyright (C) 2008 Red Hat, Inc. 
+ * Steven Rostedt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" +#include "trace_hist.h" + +enum { + INTERRUPT_LATENCY = 0, + PREEMPT_LATENCY, + PREEMPT_INTERRUPT_LATENCY, + WAKEUP_LATENCY, +}; + +#define MAX_ENTRY_NUM 10240 + +struct hist_data { + atomic_t hist_mode; /* 0 log, 1 don't log */ + unsigned long min_lat; + unsigned long max_lat; + unsigned long long beyond_hist_bound_samples; + unsigned long long accumulate_lat; + unsigned long long total_samples; + unsigned long long hist_array[MAX_ENTRY_NUM]; +}; + +static char *latency_hist_dir_root = "latency_hist"; + +#ifdef CONFIG_INTERRUPT_OFF_HIST +static DEFINE_PER_CPU(struct hist_data, interrupt_off_hist); +static char *interrupt_off_hist_dir = "interrupt_off_latency"; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST +static DEFINE_PER_CPU(struct hist_data, preempt_off_hist); +static char *preempt_off_hist_dir = "preempt_off_latency"; +#endif + +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST) +static DEFINE_PER_CPU(struct hist_data, preempt_irqs_off_hist); +static char *preempt_irqs_off_hist_dir = "preempt_interrupts_off_latency"; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist); +static char *wakeup_latency_hist_dir = "wakeup_latency"; +#endif + +void notrace latency_hist(int latency_type, int cpu, unsigned long latency) +{ + struct hist_data *my_hist; + + if ((cpu < 0) || (cpu >= NR_CPUS) || (latency_type < INTERRUPT_LATENCY) + || (latency_type > WAKEUP_LATENCY) || (latency < 0)) + return; + + switch (latency_type) { +#ifdef CONFIG_INTERRUPT_OFF_HIST + case INTERRUPT_LATENCY: + my_hist = &per_cpu(interrupt_off_hist, cpu); + break; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPT_LATENCY: + my_hist = &per_cpu(preempt_off_hist, cpu); + break; +#endif + +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST) + case PREEMPT_INTERRUPT_LATENCY: + my_hist = &per_cpu(preempt_irqs_off_hist, cpu); + break; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + my_hist = &per_cpu(wakeup_latency_hist, cpu); + break; +#endif + default: + return; + } + + if (atomic_read(&my_hist->hist_mode) == 0) + return; + + if (latency >= MAX_ENTRY_NUM) + my_hist->beyond_hist_bound_samples++; + else + my_hist->hist_array[latency]++; + + if (latency < my_hist->min_lat) + my_hist->min_lat = latency; + else if (latency > my_hist->max_lat) + my_hist->max_lat = latency; + + my_hist->total_samples++; + my_hist->accumulate_lat += latency; + return; +} + +static void *l_start(struct seq_file *m, loff_t *pos) +{ + loff_t *index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL); + loff_t index = *pos; + struct hist_data *my_hist = m->private; + + if (!index_ptr) + return NULL; + + if (index == 0) { + char avgstr[32]; + + atomic_dec(&my_hist->hist_mode); + if (likely(my_hist->total_samples)) { + unsigned long avg = (unsigned long) + div64_64(my_hist->accumulate_lat, + my_hist->total_samples); + sprintf(avgstr, "%lu", avg); + } else + strcpy(avgstr, ""); + + seq_printf(m, "#Minimum latency: %lu microseconds.\n" + "#Average latency: %s microseconds.\n" + "#Maximum latency: %lu microseconds.\n" + "#Total samples: %llu\n" + "#There are %llu samples greater or equal" + " than %d microseconds\n" + "#usecs\t%16s\n" + , my_hist->min_lat + , avgstr + , my_hist->max_lat + , my_hist->total_samples + , my_hist->beyond_hist_bound_samples + , MAX_ENTRY_NUM, "samples"); + } + if (index >= MAX_ENTRY_NUM) + 
return NULL; + + *index_ptr = index; + return index_ptr; +} + +static void *l_next(struct seq_file *m, void *p, loff_t *pos) +{ + loff_t *index_ptr = p; + struct hist_data *my_hist = m->private; + + if (++*pos >= MAX_ENTRY_NUM) { + atomic_inc(&my_hist->hist_mode); + return NULL; + } + *index_ptr = *pos; + return index_ptr; +} + +static void l_stop(struct seq_file *m, void *p) +{ + kfree(p); +} + +static int l_show(struct seq_file *m, void *p) +{ + int index = *(loff_t *) p; + struct hist_data *my_hist = m->private; + + seq_printf(m, "%5d\t%16llu\n", index, my_hist->hist_array[index]); + return 0; +} + +static struct seq_operations latency_hist_seq_op = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show +}; + +static int latency_hist_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &latency_hist_seq_op); + if (!ret) { + struct seq_file *seq = file->private_data; + seq->private = inode->i_private; + } + return ret; +} + +static struct file_operations latency_hist_fops = { + .open = latency_hist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void hist_reset(struct hist_data *hist) +{ + atomic_dec(&hist->hist_mode); + + memset(hist->hist_array, 0, sizeof(hist->hist_array)); + hist->beyond_hist_bound_samples = 0ULL; + hist->min_lat = 0xFFFFFFFFUL; + hist->max_lat = 0UL; + hist->total_samples = 0ULL; + hist->accumulate_lat = 0ULL; + + atomic_inc(&hist->hist_mode); +} + +ssize_t latency_hist_reset(struct file *file, const char __user *a, + size_t size, loff_t *off) +{ + int cpu; + struct hist_data *hist; + int latency_type = (long)file->private_data; + + switch (latency_type) { + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + for_each_online_cpu(cpu) { + hist = &per_cpu(wakeup_latency_hist, cpu); + hist_reset(hist); + } + break; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPT_LATENCY: + for_each_online_cpu(cpu) { + hist = &per_cpu(preempt_off_hist, cpu); + hist_reset(hist); + } + break; +#endif + +#ifdef CONFIG_INTERRUPT_OFF_HIST + case INTERRUPT_LATENCY: + for_each_online_cpu(cpu) { + hist = &per_cpu(interrupt_off_hist, cpu); + hist_reset(hist); + } + break; +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + case PREEMPT_INTERRUPT_LATENCY: + for_each_online_cpu(cpu) { + hist = &per_cpu(preempt_irqs_off_hist, cpu); + hist_reset(hist); + } + break; +#endif + } + + return size; +} + +static struct file_operations latency_hist_reset_fops = { + .open = tracing_open_generic, + .write = latency_hist_reset, +}; + +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) +#ifdef CONFIG_INTERRUPT_OFF_HIST +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start); +static DEFINE_PER_CPU(int, hist_irqsoff_tracing); +#endif +#ifdef CONFIG_PREEMPT_OFF_HIST +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start); +static DEFINE_PER_CPU(int, hist_preemptoff_tracing); +#endif +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start); +static DEFINE_PER_CPU(int, hist_preemptirqsoff_tracing); +#endif + +notrace void tracing_hist_preempt_start(void) +{ + cycle_t uninitialized_var(start); + int start_set = 0; + int cpu; + + if (!preempt_count() && !irqs_disabled()) + return; + + /* cpu is only used if we are in atomic */ + cpu = raw_smp_processor_id(); + +#ifdef CONFIG_INTERRUPT_OFF_HIST + if (irqs_disabled() && + !per_cpu(hist_irqsoff_tracing, cpu)) { + 
per_cpu(hist_irqsoff_tracing, cpu) = 1; + start_set++; + start = ftrace_now(cpu); + per_cpu(hist_irqsoff_start, cpu) = start; + } +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + if (preempt_count() && + !per_cpu(hist_preemptoff_tracing, cpu)) { + per_cpu(hist_preemptoff_tracing, cpu) = 1; + if (1 || !(start_set++)) + start = ftrace_now(cpu); + per_cpu(hist_preemptoff_start, cpu) = start; + + } +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + if (!per_cpu(hist_preemptirqsoff_tracing, cpu)) { + per_cpu(hist_preemptirqsoff_tracing, cpu) = 1; + if (1 || !(start_set)) + start = ftrace_now(cpu); + per_cpu(hist_preemptirqsoff_start, cpu) = start; + } +#endif +} + +notrace void tracing_hist_preempt_stop(int irqs_on) +{ + long latency; + cycle_t start; + cycle_t uninitialized_var(stop); + int stop_set = 0; + int cpu; + + /* irqs_on == TRACE_STOP if we must stop tracing. */ + + /* cpu is only used if we are in atomic */ + cpu = raw_smp_processor_id(); + +#ifdef CONFIG_INTERRUPT_OFF_HIST + if (irqs_on && + per_cpu(hist_irqsoff_tracing, cpu)) { + WARN_ON(!irqs_disabled()); + stop = ftrace_now(cpu); + stop_set++; + start = per_cpu(hist_irqsoff_start, cpu); + + if (stop > start) { + latency = (long)nsecs_to_usecs(stop - start); + if (latency > 1000000) { + printk("%d: latency = %ld (%lu)\n", __LINE__, + latency, latency); + printk("%d: start=%Ld stop=%Ld\n", __LINE__, + start, stop); + } + } else + latency = 0; + barrier(); + per_cpu(hist_irqsoff_tracing, cpu) = 0; + latency_hist(INTERRUPT_LATENCY, cpu, latency); + } +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + if ((!irqs_on || irqs_on == TRACE_STOP) && + per_cpu(hist_preemptoff_tracing, cpu)) { + WARN_ON(!preempt_count()); + if (1 || !(stop_set++)) + stop = ftrace_now(cpu); + start = per_cpu(hist_preemptoff_start, cpu); + + if (stop > start) { + latency = (long)nsecs_to_usecs(stop - start); + if (latency > 1000000) { + printk("%d: latency = %ld (%lu)\n", __LINE__, + latency, latency); + printk("%d: start=%Ld stop=%Ld\n", __LINE__, + start, stop); + } + } else + latency = 0; + barrier(); + per_cpu(hist_preemptoff_tracing, cpu) = 0; + latency_hist(PREEMPT_LATENCY, cpu, latency); + } +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + if (((!irqs_on && !irqs_disabled()) || + (irqs_on && !preempt_count()) || + (irqs_on == TRACE_STOP)) && + per_cpu(hist_preemptirqsoff_tracing, cpu)) { + WARN_ON(!preempt_count() && !irqs_disabled()); + if (1 || !stop_set) + stop = ftrace_now(cpu); + start = per_cpu(hist_preemptirqsoff_start, cpu); + + if (stop > start) { + latency = (long)nsecs_to_usecs(stop - start); + if (latency > 1000000) { + printk("%d: latency = %ld (%lu)\n", __LINE__, + latency, latency); + printk("%d: start=%Ld stop=%Ld\n", __LINE__, + start, stop); + } + } else + latency = 0; + + barrier(); + per_cpu(hist_preemptirqsoff_tracing, cpu) = 0; + latency_hist(PREEMPT_INTERRUPT_LATENCY, cpu, latency); + } +#endif +} +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +int tracing_wakeup_hist __read_mostly = 1; + +static unsigned wakeup_prio = (unsigned)-1 ; +static struct task_struct *wakeup_task; +static cycle_t wakeup_start; +static DEFINE_RAW_SPINLOCK(wakeup_lock); + +notrace void tracing_hist_wakeup_start(struct task_struct *p, + struct task_struct *curr) +{ + unsigned long flags; + + if (likely(!rt_task(p)) || + p->prio >= wakeup_prio || + p->prio >= curr->prio) + return; + + spin_lock_irqsave(&wakeup_lock, flags); + if (wakeup_task) + put_task_struct(wakeup_task); + + get_task_struct(p); 
+ wakeup_task = p; + wakeup_prio = p->prio; + wakeup_start = ftrace_now(raw_smp_processor_id()); + spin_unlock_irqrestore(&wakeup_lock, flags); +} + +notrace void tracing_hist_wakeup_stop(struct task_struct *next) +{ + unsigned long flags; + long latency; + cycle_t stop; + + if (next != wakeup_task) + return; + + stop = ftrace_now(raw_smp_processor_id()); + + spin_lock_irqsave(&wakeup_lock, flags); + if (wakeup_task != next) + goto out; + + latency = (long)nsecs_to_usecs(stop - wakeup_start); + + latency_hist(WAKEUP_LATENCY, smp_processor_id(), latency); + + put_task_struct(wakeup_task); + wakeup_task = NULL; + wakeup_prio = (unsigned)-1; + out: + spin_unlock_irqrestore(&wakeup_lock, flags); + +} + +static void +sched_switch_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *prev; + struct task_struct *next; + struct rq *__rq; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, int); + (void)va_arg(*args, long); + __rq = va_arg(*args, typeof(__rq)); + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + tracing_hist_wakeup_stop(next); +} + +static void +wake_up_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *curr; + struct task_struct *task; + struct rq *__rq; + + /* Skip pid %d state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, long); + /* now get the meat: "rq %p task %p rq->curr %p" */ + __rq = va_arg(*args, typeof(__rq)); + task = va_arg(*args, typeof(task)); + curr = va_arg(*args, typeof(curr)); + + tracing_hist_wakeup_start(task, curr); +} + +#endif + +static __init int latency_hist_init(void) +{ + struct dentry *latency_hist_root = NULL; + struct dentry *dentry; + struct dentry *entry; + int i = 0, len = 0; + struct hist_data *my_hist; + char name[64]; + + dentry = tracing_init_dentry(); + + latency_hist_root = + debugfs_create_dir(latency_hist_dir_root, dentry); + +#ifdef CONFIG_INTERRUPT_OFF_HIST + dentry = debugfs_create_dir(interrupt_off_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + len = sprintf(name, "CPU%d", i); + name[len] = '\0'; + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(interrupt_off_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(interrupt_off_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + } + entry = debugfs_create_file("reset", 0444, dentry, + (void *)INTERRUPT_LATENCY, + &latency_hist_reset_fops); +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + dentry = debugfs_create_dir(preempt_off_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + len = sprintf(name, "CPU%d", i); + name[len] = '\0'; + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(preempt_off_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(preempt_off_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + } + entry = debugfs_create_file("reset", 0444, dentry, + (void *)PREEMPT_LATENCY, + &latency_hist_reset_fops); +#endif + +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST) + dentry = debugfs_create_dir(preempt_irqs_off_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + len = sprintf(name, "CPU%d", i); + name[len] = '\0'; + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(preempt_off_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(preempt_irqs_off_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + } 
+ entry = debugfs_create_file("reset", 0444, dentry, + (void *)PREEMPT_INTERRUPT_LATENCY, + &latency_hist_reset_fops); +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + + i = marker_probe_register("kernel_sched_wakeup", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, NULL); + if (i) { + pr_info("wakeup hist: Couldn't add marker" + " probe to kernel_sched_wakeup\n"); + goto out_wake; + } + + i = marker_probe_register("kernel_sched_wakeup_new", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, NULL); + if (i) { + pr_info("wakeup hist: Couldn't add marker" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + i = marker_probe_register("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + sched_switch_callback, NULL); + if (i) { + pr_info("wakeup hist: Couldn't add marker" + " probe to kernel_sched_schedule\n"); + goto fail_deprobe_wake_new; + } + + dentry = debugfs_create_dir(wakeup_latency_hist_dir, + latency_hist_root); + for_each_possible_cpu(i) { + len = sprintf(name, "CPU%d", i); + name[len] = '\0'; + entry = debugfs_create_file(name, 0444, dentry, + &per_cpu(wakeup_latency_hist, i), + &latency_hist_fops); + my_hist = &per_cpu(wakeup_latency_hist, i); + atomic_set(&my_hist->hist_mode, 1); + my_hist->min_lat = 0xFFFFFFFFUL; + } + entry = debugfs_create_file("reset", 0444, dentry, + (void *)WAKEUP_LATENCY, + &latency_hist_reset_fops); + + goto out_wake; + +fail_deprobe_wake_new: + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, NULL); +fail_deprobe: + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, NULL); + out_wake: +#endif + return 0; + +} + +__initcall(latency_hist_init); Index: linux-2.6.24.7-rt27/kernel/trace/trace_hist.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/trace/trace_hist.h 2009-02-08 00:01:14.000000000 -0500 @@ -0,0 +1,39 @@ +/* + * kernel/trace/trace_hist.h + * + * Add support for histograms of preemption-off latency and + * interrupt-off latency and wakeup latency, it depends on + * Real-Time Preemption Support. + * + * Copyright (C) 2005 MontaVista Software, Inc. + * Yi Yang + * + * Converted to work with the new latency tracer. + * Copyright (C) 2008 Red Hat, Inc. 
+ * Steven Rostedt + * + */ +#ifndef _LIB_TRACING_TRACER_HIST_H_ +#define _LIB_TRACING_TRACER_HIST_H_ + +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST) +# define TRACE_STOP 2 +void tracing_hist_preempt_start(void); +void tracing_hist_preempt_stop(int irqs_on); +#else +# define tracing_hist_preempt_start() do { } while (0) +# define tracing_hist_preempt_stop(irqs_off) do { } while (0) +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +void tracing_hist_wakeup_start(struct task_struct *p, + struct task_struct *curr); +void tracing_hist_wakeup_stop(struct task_struct *next); +extern int tracing_wakeup_hist; +#else +# define tracing_hist_wakeup_start(p, curr) do { } while (0) +# define tracing_hist_wakeup_stop(next) do { } while (0) +# define tracing_wakeup_hist 0 +#endif + +#endif /* ifndef _LIB_TRACING_TRACER_HIST_H_ */ Index: linux-2.6.24.7-rt27/arch/x86/ia32/ia32entry.S =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/ia32/ia32entry.S 2009-02-08 00:00:25.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/ia32/ia32entry.S 2009-02-08 00:01:15.000000000 -0500 @@ -132,7 +132,9 @@ sysenter_do_call: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -237,7 +239,9 @@ cstar_do_call: cmpl $IA32_NR_syscalls-1,%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -328,8 +332,10 @@ ia32_do_syscall: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) jmp int_ret_from_sys_call @@ -400,7 +406,7 @@ END(ia32_ptregs_common) .section .rodata,"a" .align 8 -ia32_sys_call_table: +ENTRY(ia32_sys_call_table) .quad sys_restart_syscall .quad sys_exit .quad stub32_fork @@ -726,4 +732,5 @@ ia32_sys_call_table: .quad compat_sys_timerfd .quad sys_eventfd .quad sys32_fallocate +.globl ia32_syscall_end ia32_syscall_end: Index: linux-2.6.24.7-rt27/include/asm-x86/calling.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-x86/calling.h 2009-02-08 00:00:25.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-x86/calling.h 2009-02-08 00:01:15.000000000 -0500 @@ -160,3 +160,53 @@ .macro icebp .byte 0xf1 .endm + +/* + * latency-tracing helpers: + */ + + .macro TRACE_SYS_CALL + +#ifdef CONFIG_EVENT_TRACER + SAVE_ARGS + + mov %rdx, %rcx + mov %rsi, %rdx + mov %rdi, %rsi + mov %rax, %rdi + + call sys_call + + RESTORE_ARGS +#endif + .endm + + + .macro TRACE_SYS_IA32_CALL + +#ifdef CONFIG_EVENT_TRACER + SAVE_ARGS + + mov %rdx, %rcx + mov %rsi, %rdx + mov %rdi, %rsi + mov %rax, %rdi + + call sys_ia32_call + + RESTORE_ARGS +#endif + .endm + + .macro TRACE_SYS_RET + +#ifdef CONFIG_EVENT_TRACER + SAVE_ARGS + + mov %rax, %rdi + + call sys_ret + + RESTORE_ARGS +#endif + .endm Index: linux-2.6.24.7-rt27/include/asm-x86/unistd_64.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-x86/unistd_64.h 2009-02-08 00:00:25.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-x86/unistd_64.h 2009-02-08 00:01:15.000000000 -0500 @@ -11,6 +11,8 @@ * Note: holes are not allowed. 
*/ +#define NR_syscalls (__NR_syscall_max+1) + /* at least 8 syscall per cacheline */ #define __NR_read 0 __SYSCALL(__NR_read, sys_read) Index: linux-2.6.24.7-rt27/arch/arm/kernel/traps.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/arm/kernel/traps.c 2009-02-08 00:00:25.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/arm/kernel/traps.c 2009-02-08 00:02:19.000000000 -0500 @@ -233,7 +233,7 @@ static void __die(const char *str, int e } } -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); /* * This function is protected against re-entrancy. @@ -276,7 +276,7 @@ void arm_notify_die(const char *str, str } static LIST_HEAD(undef_hook); -static DEFINE_SPINLOCK(undef_lock); +static DEFINE_RAW_SPINLOCK(undef_lock); void register_undef_hook(struct undef_hook *hook) { @@ -354,6 +354,7 @@ asmlinkage void do_unexp_fiq (struct pt_ { printk("Hmm. Unexpected FIQ received, but trying to continue\n"); printk("You may have a hardware problem...\n"); + print_preempt_trace(current); } /* Index: linux-2.6.24.7-rt27/kernel/trace/preempt-trace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/trace/preempt-trace.c 2009-02-08 00:01:16.000000000 -0500 @@ -0,0 +1,30 @@ +#include +#include +#include + +void print_preempt_trace(struct task_struct *task) +{ + unsigned int count; + unsigned int i, lim; + + if (!task) + task = current; + + count = task_thread_info(task)->preempt_count; + lim = count & PREEMPT_MASK; + + if (lim >= MAX_PREEMPT_TRACE) + lim = MAX_PREEMPT_TRACE-1; + printk("---------------------------\n"); + printk("| preempt count: %08x ]\n", count); + printk("| %d-level deep critical section nesting:\n", lim); + printk("----------------------------------------\n"); + for (i = 1; i <= lim; i++) { + printk(".. [<%08lx>] .... ", task->preempt_trace_eip[i]); + print_symbol("%s\n", task->preempt_trace_eip[i]); + printk(".....[<%08lx>] .. ( <= ", + task->preempt_trace_parent_eip[i]); + print_symbol("%s)\n", task->preempt_trace_parent_eip[i]); + } + printk("\n"); +} Index: linux-2.6.24.7-rt27/arch/arm/kernel/irq.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/arm/kernel/irq.c 2009-02-08 00:00:25.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/arm/kernel/irq.c 2009-02-08 00:02:19.000000000 -0500 @@ -37,6 +37,8 @@ #include #include +#include + #include #include @@ -100,7 +102,7 @@ unlock: /* Handle bad interrupts */ static struct irq_desc bad_irq_desc = { .handle_irq = handle_bad_irq, - .lock = SPIN_LOCK_UNLOCKED + .lock = RAW_SPIN_LOCK_UNLOCKED(bad_irq_desc.lock) }; /* @@ -108,11 +110,13 @@ static struct irq_desc bad_irq_desc = { * come via this function. Instead, they should provide their * own 'handler' */ -asmlinkage void __exception asm_do_IRQ(unsigned int irq, struct pt_regs *regs) +asmlinkage void __exception notrace asm_do_IRQ(unsigned int irq, struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc = irq_desc + irq; + ftrace_event_irq(irq, user_mode(regs), instruction_pointer(regs)); + /* * Some hardware gives randomly wrong interrupts. Rather * than crashing, do something sensible. 
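print_preempt_trace() above only dumps the per-task arrays; the recording side is not part of this hunk. A hedged sketch of how those arrays would plausibly be filled, remembering the caller each time the preempt count is raised; the field names follow the printer above, while the helper name and its hook placement (for example from add_preempt_count()) are assumptions:

/*
 * Hypothetical recording counterpart to print_preempt_trace():
 * store the instruction pointer (and its caller) at the current
 * nesting level as the preempt count goes up. Hooking this from
 * add_preempt_count() is an assumption, not shown in this patch.
 */
static inline void preempt_trace_record(unsigned long eip,
					unsigned long parent_eip)
{
	struct task_struct *task = current;
	unsigned int idx = task_thread_info(task)->preempt_count & PREEMPT_MASK;

	if (idx < MAX_PREEMPT_TRACE) {
		task->preempt_trace_eip[idx] = eip;
		task->preempt_trace_parent_eip[idx] = parent_eip;
	}
}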
Index: linux-2.6.24.7-rt27/arch/powerpc/xmon/xmon.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/powerpc/xmon/xmon.c 2009-02-08 00:00:25.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/powerpc/xmon/xmon.c 2009-02-08 00:02:31.000000000 -0500 @@ -340,6 +340,7 @@ static int xmon_core(struct pt_regs *reg unsigned long timeout; #endif + preempt_disable(); local_irq_save(flags); bp = in_breakpoint_table(regs->nip, &offset); @@ -516,6 +517,7 @@ static int xmon_core(struct pt_regs *reg insert_cpu_bpts(); local_irq_restore(flags); + preempt_enable(); return cmd != 'X' && cmd != EOF; } @@ -2129,7 +2131,7 @@ print_address(unsigned long addr) static unsigned long mdest; /* destination address */ static unsigned long msrc; /* source address */ static unsigned long mval; /* byte value to set memory to */ -static unsigned long mcount; /* # bytes to affect */ +static unsigned long xmon_mcount; /* # bytes to affect */ static unsigned long mdiffs; /* max # differences to print */ void @@ -2141,19 +2143,20 @@ memops(int cmd) scanhex((void *)(cmd == 's'? &mval: &msrc)); if( termch != '\n' ) termch = 0; - scanhex((void *)&mcount); + scanhex((void *)&xmon_mcount); switch( cmd ){ case 'm': - memmove((void *)mdest, (void *)msrc, mcount); + memmove((void *)mdest, (void *)msrc, xmon_mcount); break; case 's': - memset((void *)mdest, mval, mcount); + memset((void *)mdest, mval, xmon_mcount); break; case 'd': if( termch != '\n' ) termch = 0; scanhex((void *)&mdiffs); - memdiffs((unsigned char *)mdest, (unsigned char *)msrc, mcount, mdiffs); + memdiffs((unsigned char *)mdest, (unsigned char *)msrc, + xmon_mcount, mdiffs); break; } } Index: linux-2.6.24.7-rt27/arch/powerpc/Kconfig =================================================================== --- linux-2.6.24.7-rt27.orig/arch/powerpc/Kconfig 2009-02-08 00:00:25.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/powerpc/Kconfig 2009-02-08 00:02:05.000000000 -0500 @@ -46,13 +46,6 @@ config IRQ_PER_CPU bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config ARCH_HAS_ILOG2_U32 bool default y @@ -79,6 +72,7 @@ config ARCH_NO_VIRT_TO_BUS config PPC bool default y + select HAVE_FTRACE config EARLY_PRINTK bool @@ -176,6 +170,18 @@ config HIGHMEM source kernel/time/Kconfig source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "fs/Kconfig.binfmt" # We optimistically allocate largepages from the VM, so make the limit Index: linux-2.6.24.7-rt27/arch/powerpc/kernel/Makefile =================================================================== --- linux-2.6.24.7-rt27.orig/arch/powerpc/kernel/Makefile 2009-02-08 00:00:25.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/powerpc/kernel/Makefile 2009-02-08 00:02:05.000000000 -0500 @@ -10,11 +10,24 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif -obj-y := semaphore.o cputable.o ptrace.o syscalls.o \ +ifdef CONFIG_FTRACE +# Do not trace early boot code +CFLAGS_REMOVE_cputable.o = -pg +CFLAGS_REMOVE_prom_init.o = -pg + +ifdef CONFIG_DYNAMIC_FTRACE +# dynamic ftrace setup. 
+CFLAGS_REMOVE_ftrace.o = -pg +endif + +endif + +obj-y := cputable.o ptrace.o syscalls.o \ irq.o align.o signal_32.o pmc.o vdso.o \ init_task.o process.o systbl.o idle.o \ signal.o obj-y += vdso32/ +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-$(CONFIG_PPC64) += setup_64.o binfmt_elf32.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o cpu_setup_ppc970.o \ @@ -75,6 +88,8 @@ obj-$(CONFIG_KEXEC) += machine_kexec.o obj-$(CONFIG_AUDIT) += audit.o obj64-$(CONFIG_AUDIT) += compat_audit.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o + obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o ifneq ($(CONFIG_PPC_INDIRECT_IO),y) Index: linux-2.6.24.7-rt27/arch/powerpc/kernel/entry_32.S =================================================================== --- linux-2.6.24.7-rt27.orig/arch/powerpc/kernel/entry_32.S 2009-02-08 00:00:25.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/powerpc/kernel/entry_32.S 2009-02-08 00:01:54.000000000 -0500 @@ -30,6 +30,7 @@ #include #include #include +#include #undef SHOW_SYSCALLS #undef SHOW_SYSCALLS_TASK @@ -661,7 +662,7 @@ user_exc_return: /* r10 contains MSR_KE /* Check current_thread_info()->flags */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED) + andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne do_work restore_user: @@ -896,7 +897,7 @@ global_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -910,7 +911,7 @@ recheck: MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. 
r0,r9,_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK beq restore_user @@ -1022,3 +1023,129 @@ machine_check_in_rtas: /* XXX load up BATs and panic */ #endif /* CONFIG_PPC_RTAS */ + +#ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE +_GLOBAL(mcount) +_GLOBAL(_mcount) + stwu r1,-48(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 + stw r7, 28(r1) + mfcr r5 + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + stw r3, 44(r1) + stw r5, 8(r1) + subi r3, r3, MCOUNT_INSN_SIZE + .globl mcount_call +mcount_call: + bl ftrace_stub + nop + lwz r6, 8(r1) + lwz r0, 44(r1) + lwz r3, 12(r1) + mtctr r0 + lwz r4, 16(r1) + mtcr r6 + lwz r5, 20(r1) + lwz r6, 24(r1) + lwz r0, 52(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + mtlr r0 + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1, r1, 48 + bctr + +_GLOBAL(ftrace_caller) + /* Based off of objdump output from glibc */ + stwu r1,-48(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 + lwz r4, 52(r1) + mfcr r5 + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + stw r3, 44(r1) + stw r5, 8(r1) + subi r3, r3, MCOUNT_INSN_SIZE +.globl ftrace_call +ftrace_call: + bl ftrace_stub + nop + lwz r6, 8(r1) + lwz r0, 44(r1) + lwz r3, 12(r1) + mtctr r0 + lwz r4, 16(r1) + mtcr r6 + lwz r5, 20(r1) + lwz r6, 24(r1) + lwz r0, 52(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + mtlr r0 + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1, r1, 48 + bctr +#else +_GLOBAL(mcount) +_GLOBAL(_mcount) + stwu r1,-48(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 + lwz r4, 52(r1) + mfcr r5 + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + stw r3, 44(r1) + stw r5, 8(r1) + + subi r3, r3, MCOUNT_INSN_SIZE + LOAD_REG_ADDR(r5, ftrace_trace_function) + lwz r5,0(r5) + + mtctr r5 + bctrl + + nop + + lwz r6, 8(r1) + lwz r0, 44(r1) + lwz r3, 12(r1) + mtctr r0 + lwz r4, 16(r1) + mtcr r6 + lwz r5, 20(r1) + lwz r6, 24(r1) + lwz r0, 52(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + mtlr r0 + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1, r1, 48 + bctr +#endif + +_GLOBAL(ftrace_stub) + blr + +#endif /* CONFIG_FTRACE */ Index: linux-2.6.24.7-rt27/arch/powerpc/kernel/entry_64.S =================================================================== --- linux-2.6.24.7-rt27.orig/arch/powerpc/kernel/entry_64.S 2009-02-08 00:00:25.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/powerpc/kernel/entry_64.S 2009-02-08 00:05:02.000000000 -0500 @@ -29,6 +29,7 @@ #include #include #include +#include /* * System calls. */ @@ -470,7 +471,8 @@ _GLOBAL(ret_from_except_lite) #ifdef CONFIG_PREEMPT clrrdi r9,r1,THREAD_SHIFT /* current_thread_info() */ - li r0,_TIF_NEED_RESCHED /* bits to check */ + li r0,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) + /* bits to check */ ld r3,_MSR(r1) ld r4,TI_FLAGS(r9) /* Move MSR_PR bit in r3 to _TIF_SIGPENDING position in r0 */ @@ -578,31 +580,31 @@ do_work: cmpdi r0,0 crandc eq,cr1*4+eq,eq bne restore - /* here we are preempting the current task */ 1: - li r0,1 - stb r0,PACASOFTIRQEN(r13) - stb r0,PACAHARDIRQEN(r13) - ori r10,r10,MSR_EE - mtmsrd r10,1 /* reenable interrupts */ - bl .preempt_schedule + /* preempt_schedule_irq() expects interrupts disabled. */ + bl .preempt_schedule_irq mfmsr r10 clrrdi r9,r1,THREAD_SHIFT rldicl r10,r10,48,1 /* disable interrupts again */ rotldi r10,r10,16 mtmsrd r10,1 ld r4,TI_FLAGS(r9) - andi. r0,r4,_TIF_NEED_RESCHED
+ andi. r0,r4,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne 1b b restore user_work: #endif + /* here we are preempting the current task */ + li r0,1 + stb r0,PACASOFTIRQEN(r13) + stb r0,PACAHARDIRQEN(r13) + /* Enable interrupts */ ori r10,r10,MSR_EE mtmsrd r10,1 - andi. r0,r4,_TIF_NEED_RESCHED + andi. r0,r4,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq 1f bl .schedule b .ret_from_except_lite @@ -846,3 +848,67 @@ _GLOBAL(enter_prom) ld r0,16(r1) mtlr r0 blr + +#ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE +_GLOBAL(mcount) +_GLOBAL(_mcount) + /* Taken from output of objdump from lib64/glibc */ + mflr r3 + stdu r1, -112(r1) + std r3, 128(r1) + subi r3, r3, MCOUNT_INSN_SIZE + .globl mcount_call +mcount_call: + bl ftrace_stub + nop + ld r0, 128(r1) + mtlr r0 + addi r1, r1, 112 + blr + +_GLOBAL(ftrace_caller) + /* Taken from output of objdump from lib64/glibc */ + mflr r3 + ld r11, 0(r1) + stdu r1, -112(r1) + std r3, 128(r1) + ld r4, 16(r11) + subi r3, r3, MCOUNT_INSN_SIZE +.globl ftrace_call +ftrace_call: + bl ftrace_stub + nop + ld r0, 128(r1) + mtlr r0 + addi r1, r1, 112 +_GLOBAL(ftrace_stub) + blr +#else +_GLOBAL(mcount) + blr + +_GLOBAL(_mcount) + /* Taken from output of objdump from lib64/glibc */ + mflr r3 + ld r11, 0(r1) + stdu r1, -112(r1) + std r3, 128(r1) + ld r4, 16(r11) + + subi r3, r3, MCOUNT_INSN_SIZE + LOAD_REG_ADDR(r5,ftrace_trace_function) + ld r5,0(r5) + ld r5,0(r5) + mtctr r5 + bctrl + + nop + ld r0, 128(r1) + mtlr r0 + addi r1, r1, 112 +_GLOBAL(ftrace_stub) + blr + +#endif +#endif Index: linux-2.6.24.7-rt27/arch/powerpc/kernel/ftrace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/arch/powerpc/kernel/ftrace.c 2009-02-08 00:01:18.000000000 -0500 @@ -0,0 +1,154 @@ +/* + * Code for replacing ftrace calls with jumps. + * + * Copyright (C) 2007-2008 Steven Rostedt + * + * Thanks go out to P.A. Semi, Inc for supplying me with a PPC64 box. + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + + +static unsigned int ftrace_nop = 0x60000000; + +#ifdef CONFIG_PPC32 +# define GET_ADDR(addr) addr +#else +/* PowerPC64 function symbols are descriptors that point to the actual code */ +# define GET_ADDR(addr) *(unsigned long *)addr +#endif + + +static unsigned int notrace ftrace_calc_offset(long ip, long addr) +{ + return (int)(addr - ip); +} + +notrace unsigned char *ftrace_nop_replace(void) +{ + return (char *)&ftrace_nop; +} + +notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + static unsigned int op; + + /* + * It would be nice to just use create_function_call, but that will + * update the code itself. Here we need to just return the + * instruction that is going to be modified, without modifying the + * code. + */ + addr = GET_ADDR(addr); + + /* Set to "bl addr" */ + op = 0x48000001 | (ftrace_calc_offset(ip, addr) & 0x03fffffc); + + /* + * No locking needed, this must be called via kstop_machine + * which in essence is like running on a uniprocessor machine. + */ + return (unsigned char *)&op; +}
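For clarity, the "bl addr" constant built above is a PowerPC I-form branch: primary opcode 18 in the top six bits plus the LK (link) bit gives 0x48000001, and the signed 24-bit word offset occupies bits 2-25, hence the 0x03fffffc mask. A worked example with invented addresses (illustrative only, not from the patch):

/*
 * Illustrative: encode "bl" from ip = 0xc0001000 to addr = 0xc0003000.
 * offset = 0xc0003000 - 0xc0001000 = 0x2000 (word-aligned, fits in the field)
 */
unsigned int op = 0x48000001 | (0x2000 & 0x03fffffc);	/* == 0x48002001 */

A negative offset works the same way: the two's-complement result is truncated to bits 2-25 by the mask and sign-extended by the CPU when the branch executes.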
+ +#ifdef CONFIG_PPC64 +# define _ASM_ALIGN " .align 3 " +# define _ASM_PTR " .llong " +#else +# define _ASM_ALIGN " .align 2 " +# define _ASM_PTR " .long " +#endif + +notrace int +ftrace_modify_code(unsigned long ip, unsigned char *old_code, + unsigned char *new_code) +{ + unsigned replaced; + unsigned old = *(unsigned *)old_code; + unsigned new = *(unsigned *)new_code; + int faulted = 0; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. + * + * No real locking needed, this code is run through + * kstop_machine. + */ + asm volatile ( + "1: lwz %1, 0(%2)\n" + " cmpw %1, %5\n" + " bne 2f\n" + " stwu %3, 0(%2)\n" + "2:\n" + ".section .fixup, \"ax\"\n" + "3: li %0, 1\n" + " b 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + _ASM_ALIGN "\n" + _ASM_PTR "1b, 3b\n" + ".previous" + : "=r"(faulted), "=r"(replaced) + : "r"(ip), "r"(new), + "0"(faulted), "r"(old) + : "memory"); + + if (replaced != old && replaced != new) + faulted = 2; + + if (!faulted) + flush_icache_range(ip, ip + 8); + + return faulted; +} + +notrace int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + unsigned char old[MCOUNT_INSN_SIZE], *new; + int ret; + + memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); + new = ftrace_call_replace(ip, (unsigned long)func); + ret = ftrace_modify_code(ip, old, new); + + return ret; +} + +notrace int ftrace_mcount_set(unsigned long *data) +{ + unsigned long ip = (long)(&mcount_call); + unsigned long *addr = data; + unsigned char old[MCOUNT_INSN_SIZE], *new; + + /* + * Replace the mcount stub with a pointer to the + * ip recorder function. + */ + memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); + new = ftrace_call_replace(ip, *addr); + *addr = ftrace_modify_code(ip, old, new); + + return 0; +} + +int __init ftrace_dyn_arch_init(void *data) +{ + /* This is running in kstop_machine */ + + ftrace_mcount_set(data); + + return 0; +} + Index: linux-2.6.24.7-rt27/arch/powerpc/kernel/io.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/powerpc/kernel/io.c 2009-02-08 00:00:25.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/powerpc/kernel/io.c 2009-02-08 00:01:17.000000000 -0500 @@ -120,7 +120,8 @@ EXPORT_SYMBOL(_outsl_ns); #define IO_CHECK_ALIGN(v,a) ((((unsigned long)(v)) & ((a) - 1)) == 0) -void _memset_io(volatile void __iomem *addr, int c, unsigned long n) +notrace void +_memset_io(volatile void __iomem *addr, int c, unsigned long n) { void *p = (void __force *)addr; u32 lc = c; Index: linux-2.6.24.7-rt27/arch/powerpc/kernel/irq.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/powerpc/kernel/irq.c 2009-02-08 00:00:25.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/powerpc/kernel/irq.c 2009-02-08 00:05:03.000000000 -0500 @@ -94,11 +94,9 @@ extern atomic_t ipi_sent; #endif #ifdef CONFIG_PPC64 -EXPORT_SYMBOL(irq_desc); - int distribute_irqs = 1; -static inline unsigned long get_hard_enabled(void) +static inline notrace unsigned long get_hard_enabled(void) { unsigned long enabled; @@ -108,13 +106,13 @@ static inline unsigned long get_hard_ena return enabled; } -static inline void set_soft_enabled(unsigned long enable) +static inline notrace void set_soft_enabled(unsigned long enable) { __asm__ __volatile__("stb %0,%1(13)" : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); } -void 
local_irq_restore(unsigned long en) +notrace void raw_local_irq_restore(unsigned long en) { /* * get_paca()->soft_enabled = en; @@ -405,9 +403,8 @@ void do_softirq(void) #ifdef CONFIG_PPC_MERGE static LIST_HEAD(irq_hosts); -static DEFINE_SPINLOCK(irq_big_lock); -static DEFINE_PER_CPU(unsigned int, irq_radix_reader); -static unsigned int irq_radix_writer; +static DEFINE_RAW_SPINLOCK(irq_big_lock); +static atomic_t revmap_trees_allocated = ATOMIC_INIT(0); struct irq_map_entry irq_map[NR_IRQS]; static unsigned int irq_virq_count = NR_IRQS; static struct irq_host *irq_default_host; @@ -550,57 +547,6 @@ void irq_set_virq_count(unsigned int cou irq_virq_count = count; } -/* radix tree not lockless safe ! we use a brlock-type mecanism - * for now, until we can use a lockless radix tree - */ -static void irq_radix_wrlock(unsigned long *flags) -{ - unsigned int cpu, ok; - - spin_lock_irqsave(&irq_big_lock, *flags); - irq_radix_writer = 1; - smp_mb(); - do { - barrier(); - ok = 1; - for_each_possible_cpu(cpu) { - if (per_cpu(irq_radix_reader, cpu)) { - ok = 0; - break; - } - } - if (!ok) - cpu_relax(); - } while(!ok); -} - -static void irq_radix_wrunlock(unsigned long flags) -{ - smp_wmb(); - irq_radix_writer = 0; - spin_unlock_irqrestore(&irq_big_lock, flags); -} - -static void irq_radix_rdlock(unsigned long *flags) -{ - local_irq_save(*flags); - __get_cpu_var(irq_radix_reader) = 1; - smp_mb(); - if (likely(irq_radix_writer == 0)) - return; - __get_cpu_var(irq_radix_reader) = 0; - smp_wmb(); - spin_lock(&irq_big_lock); - __get_cpu_var(irq_radix_reader) = 1; - spin_unlock(&irq_big_lock); -} - -static void irq_radix_rdunlock(unsigned long flags) -{ - __get_cpu_var(irq_radix_reader) = 0; - local_irq_restore(flags); -} - static int irq_setup_virq(struct irq_host *host, unsigned int virq, irq_hw_number_t hwirq) { @@ -755,7 +701,6 @@ void irq_dispose_mapping(unsigned int vi { struct irq_host *host; irq_hw_number_t hwirq; - unsigned long flags; if (virq == NO_IRQ) return; @@ -787,15 +732,20 @@ void irq_dispose_mapping(unsigned int vi if (hwirq < host->revmap_data.linear.size) host->revmap_data.linear.revmap[hwirq] = NO_IRQ; break; - case IRQ_HOST_MAP_TREE: + case IRQ_HOST_MAP_TREE: { + DEFINE_RADIX_TREE_CONTEXT(ctx, &host->revmap_data.tree); + /* Check if radix tree allocated yet */ - if (host->revmap_data.tree.gfp_mask == 0) + if (atomic_read(&revmap_trees_allocated) == 0) break; - irq_radix_wrlock(&flags); - radix_tree_delete(&host->revmap_data.tree, hwirq); - irq_radix_wrunlock(flags); + + radix_tree_lock(&ctx); + radix_tree_delete(ctx.tree, hwirq); + radix_tree_unlock(&ctx); + break; } + } /* Destroy map */ smp_mb(); @@ -848,22 +798,20 @@ unsigned int irq_radix_revmap(struct irq struct radix_tree_root *tree; struct irq_map_entry *ptr; unsigned int virq; - unsigned long flags; WARN_ON(host->revmap_type != IRQ_HOST_MAP_TREE); - /* Check if the radix tree exist yet. We test the value of - * the gfp_mask for that. Sneaky but saves another int in the - * structure. If not, we fallback to slow mode - */ - tree = &host->revmap_data.tree; - if (tree->gfp_mask == 0) + /* Check if the radix tree exist yet. */ + if (atomic_read(&revmap_trees_allocated) == 0) return irq_find_mapping(host, hwirq); - /* Now try to resolve */ - irq_radix_rdlock(&flags); + /* + * Now try to resolve + * No rcu_read_lock(ing) needed, the ptr returned can't go under us + * as it's referencing an entry in the static irq_map table. 
+ */ + tree = &host->revmap_data.tree; ptr = radix_tree_lookup(tree, hwirq); - irq_radix_rdunlock(flags); /* Found it, return */ if (ptr) { @@ -874,9 +822,10 @@ unsigned int irq_radix_revmap(struct irq /* If not there, try to insert it */ virq = irq_find_mapping(host, hwirq); if (virq != NO_IRQ) { - irq_radix_wrlock(&flags); - radix_tree_insert(tree, hwirq, &irq_map[virq]); - irq_radix_wrunlock(flags); + DEFINE_RADIX_TREE_CONTEXT(ctx, tree); + radix_tree_lock(&ctx); + radix_tree_insert(ctx.tree, hwirq, &irq_map[virq]); + radix_tree_unlock(&ctx); } return virq; } @@ -987,14 +936,15 @@ void irq_early_init(void) static int irq_late_init(void) { struct irq_host *h; - unsigned long flags; - irq_radix_wrlock(&flags); list_for_each_entry(h, &irq_hosts, link) { if (h->revmap_type == IRQ_HOST_MAP_TREE) INIT_RADIX_TREE(&h->revmap_data.tree, GFP_ATOMIC); } - irq_radix_wrunlock(flags); + + /* Make sure the radix trees inits are visible before setting the flag */ + smp_mb(); + atomic_set(&revmap_trees_allocated, 1); return 0; } Index: linux-2.6.24.7-rt27/arch/powerpc/kernel/setup_32.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/powerpc/kernel/setup_32.c 2009-02-08 00:00:25.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/powerpc/kernel/setup_32.c 2009-02-08 00:04:08.000000000 -0500 @@ -88,7 +88,7 @@ int ucache_bsize; * from the address that it was linked at, so we must use RELOC/PTRRELOC * to access static data (including strings). -- paulus */ -unsigned long __init early_init(unsigned long dt_ptr) +notrace unsigned long __init early_init(unsigned long dt_ptr) { unsigned long offset = reloc_offset(); struct cpu_spec *spec; @@ -118,7 +118,7 @@ unsigned long __init early_init(unsigned * This is called very early on the boot process, after a minimal * MMU environment has been set up but before MMU_init is called. */ -void __init machine_init(unsigned long dt_ptr, unsigned long phys) +notrace void __init machine_init(unsigned long dt_ptr, unsigned long phys) { /* Enable early debugging if any specified (see udbg.h) */ udbg_early_init(); @@ -140,7 +140,7 @@ void __init machine_init(unsigned long d #ifdef CONFIG_BOOKE_WDT /* Checks wdt=x and wdt_period=xx command-line option */ -int __init early_parse_wdt(char *p) +notrace int __init early_parse_wdt(char *p) { if (p && strncmp(p, "0", 1) != 0) booke_wdt_enabled = 1; @@ -296,3 +296,22 @@ void __init setup_arch(char **cmdline_p) paging_init(); } + +#ifdef CONFIG_STACKTRACE +#include +void notrace save_stack_trace(struct stack_trace *trace) +{ +} +#endif /* CONFIG_STACKTRACE */ + +#ifdef CONFIG_EARLY_PRINTK +void notrace early_printk(const char *fmt, ...) 
+{ + BUG(); +} +#endif /* CONFIG_EARLY_PRINTK */ + +#ifdef CONFIG_MCOUNT +extern void _mcount(void); +EXPORT_SYMBOL(_mcount); +#endif /* CONFIG_MCOUNT */ Index: linux-2.6.24.7-rt27/arch/powerpc/platforms/powermac/Makefile =================================================================== --- linux-2.6.24.7-rt27.orig/arch/powerpc/platforms/powermac/Makefile 2009-02-08 00:00:25.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/powerpc/platforms/powermac/Makefile 2009-02-08 00:01:17.000000000 -0500 @@ -1,5 +1,10 @@ CFLAGS_bootx_init.o += -fPIC +ifdef CONFIG_FTRACE +# Do not trace early boot code +CFLAGS_REMOVE_bootx_init.o = -pg +endif + obj-y += pic.o setup.o time.o feature.o pci.o \ sleep.o low_i2c.o cache.o pfunc_core.o \ pfunc_base.o Index: linux-2.6.24.7-rt27/arch/powerpc/kernel/ppc_ksyms.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/powerpc/kernel/ppc_ksyms.c 2009-02-08 00:00:24.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/powerpc/kernel/ppc_ksyms.c 2009-02-08 00:02:30.000000000 -0500 @@ -15,7 +15,6 @@ #include #include -#include #include #include #include @@ -44,9 +43,10 @@ #include #include #include +#include #ifdef CONFIG_PPC64 -EXPORT_SYMBOL(local_irq_restore); +EXPORT_SYMBOL(raw_local_irq_restore); #endif #ifdef CONFIG_PPC32 @@ -72,6 +72,10 @@ EXPORT_SYMBOL(single_step_exception); EXPORT_SYMBOL(sys_sigreturn); #endif +#ifdef CONFIG_FTRACE +EXPORT_SYMBOL(_mcount); +#endif + EXPORT_SYMBOL(strcpy); EXPORT_SYMBOL(strncpy); EXPORT_SYMBOL(strcat); @@ -162,7 +166,6 @@ EXPORT_SYMBOL(screen_info); #ifdef CONFIG_PPC32 EXPORT_SYMBOL(timer_interrupt); -EXPORT_SYMBOL(irq_desc); EXPORT_SYMBOL(tb_ticks_per_jiffy); EXPORT_SYMBOL(console_drivers); EXPORT_SYMBOL(cacheable_memcpy); Index: linux-2.6.24.7-rt27/include/asm-powerpc/ftrace.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/include/asm-powerpc/ftrace.h 2009-02-08 00:01:18.000000000 -0500 @@ -0,0 +1,14 @@ +#ifndef _ASM_POWERPC_FTRACE +#define _ASM_POWERPC_FTRACE + +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ +extern void _mcount(void); +#endif + +#endif + +#endif /* _ASM_POWERPC_FTRACE */ Index: linux-2.6.24.7-rt27/arch/powerpc/kernel/traps.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/powerpc/kernel/traps.c 2009-02-08 00:00:24.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/powerpc/kernel/traps.c 2009-02-08 00:02:28.000000000 -0500 @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -97,11 +98,11 @@ static inline void pmac_backlight_unblan int die(const char *str, struct pt_regs *regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = __SPIN_LOCK_UNLOCKED(die.lock), + .lock = _RAW_SPIN_LOCK_UNLOCKED(die.lock), .lock_owner = -1, .lock_owner_depth = 0 }; @@ -111,6 +112,8 @@ int die(const char *str, struct pt_regs if (debugger(regs)) return 1; + ftrace_stop(); + oops_enter(); if (die.lock_owner != raw_smp_processor_id()) { @@ -188,6 +191,11 @@ void _exception(int signr, struct pt_reg addr, regs->nip, regs->link, code); } +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + memset(&info, 0, sizeof(info)); info.si_signo = signr; info.si_code = code; Index: linux-2.6.24.7-rt27/arch/m68knommu/kernel/process.c 
=================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/kernel/process.c 2009-02-08 00:00:24.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/kernel/process.c 2009-02-08 00:02:07.000000000 -0500 @@ -54,9 +54,9 @@ EXPORT_SYMBOL(pm_power_off); static void default_idle(void) { local_irq_disable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { /* This stop will re-enable interrupts */ - __asm__("stop #0x2000" : : : "cc"); + __asm__("stop #0x2000" : : : "cc"); local_irq_disable(); } local_irq_enable(); @@ -74,10 +74,14 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { + stop_critical_timings(); idle(); - preempt_enable_no_resched(); - schedule(); + start_critical_timings(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } Index: linux-2.6.24.7-rt27/arch/m68knommu/kernel/Makefile =================================================================== --- linux-2.6.24.7-rt27.orig/arch/m68knommu/kernel/Makefile 2009-02-08 00:00:24.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/m68knommu/kernel/Makefile 2009-02-08 00:02:07.000000000 -0500 @@ -5,7 +5,9 @@ extra-y := vmlinux.lds obj-y += dma.o entry.o init_task.o irq.o m68k_ksyms.o process.o ptrace.o \ - semaphore.o setup.o signal.o syscalltable.o sys_m68k.o time.o traps.o + setup.o signal.o syscalltable.o sys_m68k.o time.o traps.o -obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_COMEMPCI) += comempci.o +obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_COMEMPCI) += comempci.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o Index: linux-2.6.24.7-rt27/arch/m68knommu/kernel/stacktrace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/arch/m68knommu/kernel/stacktrace.c 2009-02-08 00:01:20.000000000 -0500 @@ -0,0 +1,69 @@ +/* + * Quick & dirty stacktrace implementation. + */ +#include +#include + +typedef void (save_stack_addr_t)(void *data, unsigned long addr, int reliable); + +static void save_stack_address(void *data, unsigned long addr, int reliable) +{ + struct stack_trace *trace = data; + if (!reliable) + return; + if (trace->skip > 0) { + trace->skip--; + return; + } + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = addr; +} + +static void print_context_stack(unsigned long *stack, + save_stack_addr_t *sstack_func, struct stack_trace *trace) +{ + unsigned long *last_stack; + unsigned long *endstack; + unsigned long addr; + + addr = (unsigned long) stack; + endstack = (unsigned long *) PAGE_ALIGN(addr); + + last_stack = stack - 1; + while (stack <= endstack && stack > last_stack) { + + addr = *(stack + 1); + sstack_func(trace, addr, 1); + + last_stack = stack; + stack = (unsigned long *)*stack; + } +} + +static noinline long *get_current_stack(void) +{ + unsigned long *stack; + + stack = (unsigned long *)&stack; + stack++; + return stack; +} + +static void save_current_stack(save_stack_addr_t *sstack_func, + struct stack_trace *trace) +{ + unsigned long *stack; + + stack = get_current_stack(); + print_context_stack(stack, save_stack_address, trace); +} + +/* + * Save stack-backtrace addresses into a stack_trace buffer. 
+ */ +void save_stack_trace(struct stack_trace *trace) +{ + save_current_stack(save_stack_address, trace); + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} Index: linux-2.6.24.7-rt27/drivers/kvm/kvm_main.c =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/kvm/kvm_main.c 2009-02-08 00:00:23.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/kvm/kvm_main.c 2009-02-08 00:01:21.000000000 -0500 @@ -1987,8 +1987,8 @@ static int __vcpu_run(struct kvm_vcpu *v int r; if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { - printk("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->sipi_vector); + vcpu_printf(vcpu, "vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->sipi_vector); kvm_lapic_reset(vcpu); kvm_x86_ops->vcpu_reset(vcpu); vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; @@ -2003,6 +2003,11 @@ again: if (unlikely(r)) goto out; + if (vcpu->migrate_apic_timer) { + vcpu->migrate_apic_timer = false; + __kvm_migrate_apic_timer(vcpu); + } + preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); @@ -2010,6 +2015,13 @@ again: local_irq_disable(); + if (need_resched() || need_resched_delayed()) { + local_irq_enable(); + preempt_enable(); + r = 1; + goto out; + } + if (signal_pending(current)) { local_irq_enable(); preempt_enable(); Index: linux-2.6.24.7-rt27/drivers/kvm/irq.h =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/kvm/irq.h 2009-02-08 00:00:23.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/kvm/irq.h 2009-02-08 00:01:21.000000000 -0500 @@ -160,6 +160,6 @@ void kvm_apic_timer_intr_post(struct kvm void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); +void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); #endif Index: linux-2.6.24.7-rt27/drivers/kvm/kvm.h =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/kvm/kvm.h 2009-02-08 00:00:23.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/kvm/kvm.h 2009-02-08 00:01:21.000000000 -0500 @@ -325,6 +325,7 @@ struct kvm_vcpu { u64 pdptrs[4]; /* pae */ u64 shadow_efer; u64 apic_base; + bool migrate_apic_timer; struct kvm_lapic *apic; /* kernel irqchip context */ #define VCPU_MP_STATE_RUNNABLE 0 #define VCPU_MP_STATE_UNINITIALIZED 1 @@ -508,6 +509,7 @@ struct kvm_x86_ops { extern struct kvm_x86_ops *kvm_x86_ops; +#ifdef KVM_DEBUG /* The guest did something we don't support. */ #define pr_unimpl(vcpu, fmt, ...) \ do { \ @@ -517,6 +519,11 @@ extern struct kvm_x86_ops *kvm_x86_ops; } while(0) #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) +#else +#define pr_unimpl(vcpu, fmt ...) do { } while(0) +#define kvm_printf(kvm, fmt ...) do { } while(0) +#endif + #define vcpu_printf(vcpu, fmt...) 
kvm_printf(vcpu->kvm, fmt) int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); @@ -775,6 +782,11 @@ static inline u32 get_rdx_init_val(void) return 0x600; /* P6 family */ } +static inline void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) +{ + vcpu->migrate_apic_timer = true; +} + #define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" #define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" #define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" Index: linux-2.6.24.7-rt27/drivers/kvm/lapic.c =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/kvm/lapic.c 2009-02-08 00:00:23.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/kvm/lapic.c 2009-02-08 00:01:21.000000000 -0500 @@ -347,35 +347,35 @@ static int __apic_accept_irq(struct kvm_ break; case APIC_DM_REMRD: - printk(KERN_DEBUG "Ignoring delivery mode 3\n"); + vcpu_printf(vcpu, "Ignoring delivery mode 3\n"); break; case APIC_DM_SMI: - printk(KERN_DEBUG "Ignoring guest SMI\n"); + vcpu_printf(vcpu, "Ignoring guest SMI\n"); break; case APIC_DM_NMI: - printk(KERN_DEBUG "Ignoring guest NMI\n"); + vcpu_printf(vcpu, "Ignoring guest NMI\n"); break; case APIC_DM_INIT: if (level) { if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) - printk(KERN_DEBUG - "INIT on a runnable vcpu %d\n", - vcpu->vcpu_id); + vcpu_printf(vcpu, + "INIT on a runnable vcpu %d\n", + vcpu->vcpu_id); vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED; kvm_vcpu_kick(vcpu); } else { - printk(KERN_DEBUG - "Ignoring de-assert INIT to vcpu %d\n", - vcpu->vcpu_id); + vcpu_printf(vcpu, + "Ignoring de-assert INIT to vcpu %d\n", + vcpu->vcpu_id); } break; case APIC_DM_STARTUP: - printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", - vcpu->vcpu_id, vector); + vcpu_printf(vcpu, "SIPI to vcpu %d vector 0x%02x\n", + vcpu->vcpu_id, vector); if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) { vcpu->sipi_vector = vector; vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED; @@ -1065,7 +1065,7 @@ void kvm_apic_post_state_restore(struct start_apic_timer(apic); } -void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) +void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->apic; struct hrtimer *timer; Index: linux-2.6.24.7-rt27/arch/arm/mach-ep93xx/core.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/arm/mach-ep93xx/core.c 2009-02-08 00:00:23.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/arm/mach-ep93xx/core.c 2009-02-08 00:01:23.000000000 -0500 @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include @@ -50,7 +52,6 @@ #include - /************************************************************************* * Static I/O mappings that are needed for all EP93xx platforms *************************************************************************/ @@ -93,59 +94,125 @@ void __init ep93xx_map_io(void) * to use this timer for something else. We also use timer 4 for keeping * track of lost jiffies. 
*/ -static unsigned int last_jiffy_time; - -#define TIMER4_TICKS_PER_JIFFY ((CLOCK_TICK_RATE + (HZ/2)) / HZ) +static struct clock_event_device clockevent_ep93xx; static int ep93xx_timer_interrupt(int irq, void *dev_id) { - write_seqlock(&xtime_lock); + __raw_writel(EP93XX_TC_CLEAR, EP93XX_TIMER1_CLEAR); - __raw_writel(1, EP93XX_TIMER1_CLEAR); - while ((signed long) - (__raw_readl(EP93XX_TIMER4_VALUE_LOW) - last_jiffy_time) - >= TIMER4_TICKS_PER_JIFFY) { - last_jiffy_time += TIMER4_TICKS_PER_JIFFY; - timer_tick(); - } - - write_sequnlock(&xtime_lock); + clockevent_ep93xx.event_handler(&clockevent_ep93xx); return IRQ_HANDLED; } +static int ep93xx_set_next_event(unsigned long evt, + struct clock_event_device *unused) +{ + u32 tmode = __raw_readl(EP93XX_TIMER1_CONTROL); + + /* stop timer */ + __raw_writel(tmode & ~EP93XX_TC123_ENABLE, EP93XX_TIMER1_CONTROL); + /* program timer */ + __raw_writel(evt, EP93XX_TIMER1_LOAD); + /* start timer */ + __raw_writel(tmode | EP93XX_TC123_ENABLE, EP93XX_TIMER1_CONTROL); + + return 0; +} + +static void ep93xx_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + u32 tmode = EP93XX_TC123_SEL_508KHZ; + + /* Disable timer */ + __raw_writel(tmode, EP93XX_TIMER1_CONTROL); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + /* Set timer period */ + __raw_writel((508469 / HZ) - 1, EP93XX_TIMER1_LOAD); + tmode |= EP93XX_TC123_PERIODIC; + /* fall through: the oneshot path below enables the timer */ + + case CLOCK_EVT_MODE_ONESHOT: + tmode |= EP93XX_TC123_ENABLE; + __raw_writel(tmode, EP93XX_TIMER1_CONTROL); + break; + + case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_RESUME: + return; + } +} + +static struct clock_event_device clockevent_ep93xx = { + .name = "ep93xx-timer1", + .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC, + .shift = 32, + .set_mode = ep93xx_set_mode, + .set_next_event = ep93xx_set_next_event, +}; + + static struct irqaction ep93xx_timer_irq = { .name = "ep93xx timer", .flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL, .handler = ep93xx_timer_interrupt, }; -static void __init ep93xx_timer_init(void) +static void __init ep93xx_clockevent_init(void) { - /* Enable periodic HZ timer. */ - __raw_writel(0x48, EP93XX_TIMER1_CONTROL); - __raw_writel((508469 / HZ) - 1, EP93XX_TIMER1_LOAD); - __raw_writel(0xc8, EP93XX_TIMER1_CONTROL); + setup_irq(IRQ_EP93XX_TIMER1, &ep93xx_timer_irq); - /* Enable lost jiffy timer. */ - __raw_writel(0x100, EP93XX_TIMER4_VALUE_HIGH); + clockevent_ep93xx.mult = div_sc(508469, NSEC_PER_SEC, + clockevent_ep93xx.shift); + clockevent_ep93xx.max_delta_ns = + clockevent_delta2ns(0xfffffffe, &clockevent_ep93xx); + clockevent_ep93xx.min_delta_ns = + clockevent_delta2ns(0xf, &clockevent_ep93xx); + clockevent_ep93xx.cpumask = cpumask_of_cpu(0); + clockevents_register_device(&clockevent_ep93xx); +} - setup_irq(IRQ_EP93XX_TIMER1, &ep93xx_timer_irq); +/* + * timer4 is a 40-bit timer, split across a 32-bit and an 8-bit + * register; EP93XX_TIMER4_VALUE_LOW holds the low 32-bit word. 
The + * control register is in EP93XX_TIMER4_VALUE_HIGH + */ + +cycle_t ep93xx_get_cycles(void) +{ + return __raw_readl(EP93XX_TIMER4_VALUE_LOW); } -static unsigned long ep93xx_gettimeoffset(void) +static struct clocksource clocksource_ep93xx = { + .name = "ep93xx_timer4", + .rating = 200, + .read = ep93xx_get_cycles, + .mask = 0xFFFFFFFF, + .shift = 20, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static void __init ep93xx_clocksource_init(void) { - int offset; + /* Reset time-stamp counter */ + __raw_writel(0x100, EP93XX_TIMER4_VALUE_HIGH); - offset = __raw_readl(EP93XX_TIMER4_VALUE_LOW) - last_jiffy_time; + clocksource_ep93xx.mult = + clocksource_hz2mult(983040, clocksource_ep93xx.shift); + clocksource_register(&clocksource_ep93xx); +} - /* Calculate (1000000 / 983040) * offset. */ - return offset + (53 * offset / 3072); +static void __init ep93xx_timer_init(void) +{ + ep93xx_clocksource_init(); + ep93xx_clockevent_init(); } struct sys_timer ep93xx_timer = { - .init = ep93xx_timer_init, - .offset = ep93xx_gettimeoffset, + .init = ep93xx_timer_init, }; @@ -497,7 +564,6 @@ static struct platform_device ep93xx_ohc .resource = ep93xx_ohci_resources, }; - void __init ep93xx_init_devices(void) { unsigned int v; Index: linux-2.6.24.7-rt27/include/asm-arm/arch-ep93xx/ep93xx-regs.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-arm/arch-ep93xx/ep93xx-regs.h 2009-02-08 00:00:23.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-arm/arch-ep93xx/ep93xx-regs.h 2009-02-08 00:01:23.000000000 -0500 @@ -67,6 +67,12 @@ #define EP93XX_TIMER3_CONTROL EP93XX_TIMER_REG(0x88) #define EP93XX_TIMER3_CLEAR EP93XX_TIMER_REG(0x8c) +#define EP93XX_TC_CLEAR 0x00000001 +#define EP93XX_TC123_ENABLE 0x00000080 +#define EP93XX_TC123_PERIODIC 0x00000040 +#define EP93XX_TC123_SEL_508KHZ 0x00000008 +#define EP93XX_TC4_ENABLE 0x00000100 + #define EP93XX_I2S_BASE (EP93XX_APB_VIRT_BASE + 0x00020000) #define EP93XX_SECURITY_BASE (EP93XX_APB_VIRT_BASE + 0x00030000) Index: linux-2.6.24.7-rt27/arch/arm/kernel/time.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/arm/kernel/time.c 2009-02-08 00:00:23.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/arm/kernel/time.c 2009-02-08 00:01:23.000000000 -0500 @@ -236,6 +236,13 @@ static inline void do_leds(void) #define do_leds() #endif +void arch_tick_leds(void) +{ +#ifdef CONFIG_LEDS_TIMER + do_leds(); +#endif +} + #ifndef CONFIG_GENERIC_TIME void do_gettimeofday(struct timeval *tv) { Index: linux-2.6.24.7-rt27/drivers/net/sungem.c =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/net/sungem.c 2009-02-08 00:00:22.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/net/sungem.c 2009-02-08 00:01:24.000000000 -0500 @@ -1031,10 +1031,8 @@ static int gem_start_xmit(struct sk_buff (csum_stuff_off << 21)); } - local_irq_save(flags); - if (!spin_trylock(&gp->tx_lock)) { + if (!spin_trylock_irqsave(&gp->tx_lock, flags)) { /* Tell upper layer to requeue */ - local_irq_restore(flags); return NETDEV_TX_LOCKED; } /* We raced with gem_do_stop() */ Index: linux-2.6.24.7-rt27/arch/x86/kernel/tsc_sync.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/tsc_sync.c 2009-02-08 00:00:22.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/tsc_sync.c 2009-02-08 00:02:05.000000000 -0500 @@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count * we want to have the fastest, 
inlined, non-debug version * of a critical section, to be able to prove TSC time-warps: */ -static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; +static __cpuinitdata __raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; static __cpuinitdata cycles_t last_tsc; static __cpuinitdata cycles_t max_warp; static __cpuinitdata int nr_warps; @@ -97,6 +97,7 @@ static __cpuinit void check_tsc_warp(voi */ void __cpuinit check_tsc_sync_source(int cpu) { + unsigned long flags; int cpus = 2; /* @@ -117,8 +118,11 @@ void __cpuinit check_tsc_sync_source(int /* * Wait for the target to arrive: */ + local_save_flags(flags); + local_irq_enable(); while (atomic_read(&start_count) != cpus-1) cpu_relax(); + local_irq_restore(flags); /* * Trigger the target to continue into the measurement too: */ Index: linux-2.6.24.7-rt27/drivers/input/keyboard/atkbd.c =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/input/keyboard/atkbd.c 2009-02-08 00:00:22.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/input/keyboard/atkbd.c 2009-02-08 00:01:24.000000000 -0500 @@ -1401,9 +1401,23 @@ static ssize_t atkbd_show_err_count(stru return sprintf(buf, "%lu\n", atkbd->err_count); } +static int __read_mostly noatkbd; + +static int __init noatkbd_setup(char *str) +{ + noatkbd = 1; + printk(KERN_INFO "debug: not setting up AT keyboard.\n"); + + return 1; +} + +__setup("noatkbd", noatkbd_setup); static int __init atkbd_init(void) { + if (noatkbd) + return 0; + return serio_register_driver(&atkbd_drv); } Index: linux-2.6.24.7-rt27/drivers/input/mouse/psmouse-base.c =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/input/mouse/psmouse-base.c 2009-02-08 00:00:22.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/input/mouse/psmouse-base.c 2009-02-08 00:01:24.000000000 -0500 @@ -1598,10 +1598,25 @@ static int psmouse_get_maxproto(char *bu return sprintf(buffer, "%s\n", psmouse_protocol_by_type(type)->name); } +static int __read_mostly nopsmouse; + +static int __init nopsmouse_setup(char *str) +{ + nopsmouse = 1; + printk(KERN_INFO "debug: not setting up psmouse.\n"); + + return 1; +} + +__setup("nopsmouse", nopsmouse_setup); + static int __init psmouse_init(void) { int err; + if (nopsmouse) + return 0; + kpsmoused_wq = create_singlethread_workqueue("kpsmoused"); if (!kpsmoused_wq) { printk(KERN_ERR "psmouse: failed to create kpsmoused workqueue\n"); Index: linux-2.6.24.7-rt27/kernel/rtmutex-debug.h =================================================================== --- linux-2.6.24.7-rt27.orig/kernel/rtmutex-debug.h 2009-02-08 00:00:22.000000000 -0500 +++ linux-2.6.24.7-rt27/kernel/rtmutex-debug.h 2009-02-08 00:01:25.000000000 -0500 @@ -17,17 +17,17 @@ extern void debug_rt_mutex_free_waiter(s extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); extern void debug_rt_mutex_lock(struct rt_mutex *lock); extern void debug_rt_mutex_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, - struct task_struct *powner); +extern void +debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner); extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, struct rt_mutex *lock); extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); -# define debug_rt_mutex_reset_waiter(w) \ +# define debug_rt_mutex_reset_waiter(w) \ do { (w)->deadlock_lock = NULL; 
} while (0) -static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, - int detect) +static inline int +debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, int detect) { - return (waiter != NULL); + return waiter != NULL; } Index: linux-2.6.24.7-rt27/drivers/net/8139too.c =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/net/8139too.c 2009-02-08 00:00:22.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/net/8139too.c 2009-02-08 00:01:25.000000000 -0500 @@ -2199,7 +2199,11 @@ static irqreturn_t rtl8139_interrupt (in */ static void rtl8139_poll_controller(struct net_device *dev) { - disable_irq(dev->irq); + /* + * use _nosync() variant - might be used by netconsole + * from atomic contexts: + */ + disable_irq_nosync(dev->irq); rtl8139_interrupt(dev->irq, dev); enable_irq(dev->irq); } Index: linux-2.6.24.7-rt27/arch/x86/kernel/kprobes_32.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/kprobes_32.c 2009-02-08 00:00:22.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/kprobes_32.c 2009-02-08 00:03:23.000000000 -0500 @@ -332,7 +332,7 @@ ss_probe: /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); regs->eip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); + preempt_enable(); return 1; } #endif @@ -341,7 +341,7 @@ ss_probe: return 1; no_kprobe: - preempt_enable_no_resched(); + preempt_enable(); return ret; } @@ -573,7 +573,7 @@ static int __kprobes post_kprobe_handler } reset_current_kprobe(); out: - preempt_enable_no_resched(); + preempt_enable(); /* * if somebody else is singlestepping across a probe point, eflags @@ -607,7 +607,7 @@ int __kprobes kprobe_fault_handler(struc restore_previous_kprobe(kcb); else reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); break; case KPROBE_HIT_ACTIVE: case KPROBE_HIT_SSDONE: @@ -668,12 +668,11 @@ int __kprobes kprobe_exceptions_notify(s ret = NOTIFY_STOP; break; case DIE_GPF: + // TODO: do this better on PREEMPT_RT /* kprobe_running() needs smp_processor_id() */ - preempt_disable(); - if (kprobe_running() && + if (per_cpu(current_kprobe, raw_smp_processor_id()) && kprobe_fault_handler(args->regs, args->trapnr)) ret = NOTIFY_STOP; - preempt_enable(); break; default: break; @@ -739,7 +738,7 @@ int __kprobes longjmp_break_handler(stru *regs = kcb->jprobe_saved_regs; memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, MIN_STACK_SIZE(stack_addr)); - preempt_enable_no_resched(); + preempt_enable(); return 1; } return 0; Index: linux-2.6.24.7-rt27/arch/x86/mm/highmem_32.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/mm/highmem_32.c 2009-02-08 00:00:22.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/mm/highmem_32.c 2009-02-08 00:03:14.000000000 -0500 @@ -3,9 +3,9 @@ void *kmap(struct page *page) { - might_sleep(); if (!PageHighMem(page)) return page_address(page); + might_sleep(); return kmap_high(page); } @@ -18,6 +18,26 @@ void kunmap(struct page *page) kunmap_high(page); } +void kunmap_virt(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return; + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + kunmap(page); +} + +struct page *kmap_to_page(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return virt_to_page(ptr); + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + 
return page; +} + /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -26,12 +46,12 @@ void kunmap(struct page *page) * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) +void *__kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) @@ -39,19 +59,19 @@ void *kmap_atomic_prot(struct page *page idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - BUG_ON(!pte_none(*(kmap_pte-idx))); + WARN_ON_ONCE(!pte_none(*(kmap_pte-idx))); set_pte(kmap_pte-idx, mk_pte(page, prot)); arch_flush_lazy_mmu_mode(); return (void *)vaddr; } -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page, enum km_type type) { return kmap_atomic_prot(page, type, kmap_prot); } -void kunmap_atomic(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr, enum km_type type) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); @@ -73,16 +93,18 @@ void kunmap_atomic(void *kvaddr, enum km arch_flush_lazy_mmu_mode(); pagefault_enable(); + preempt_enable(); } /* This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; + preempt_disable(); pagefault_disable(); idx = type + KM_TYPE_NR*smp_processor_id(); @@ -93,7 +115,7 @@ void *kmap_atomic_pfn(unsigned long pfn, return (void*) vaddr; } -struct page *kmap_atomic_to_page(void *ptr) +struct page *__kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; @@ -108,6 +130,7 @@ struct page *kmap_atomic_to_page(void *p EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); +EXPORT_SYMBOL(kunmap_virt); +EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(__kmap_atomic_to_page); Index: linux-2.6.24.7-rt27/include/asm-x86/atomic_32.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-x86/atomic_32.h 2009-02-08 00:00:22.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-x86/atomic_32.h 2009-02-08 00:01:26.000000000 -0500 @@ -195,10 +195,10 @@ static __inline__ int atomic_add_return( #ifdef CONFIG_M386 no_xadd: /* Legacy 386 processor */ - local_irq_save(flags); + raw_local_irq_save(flags); __i = atomic_read(v); atomic_set(v, i + __i); - local_irq_restore(flags); + raw_local_irq_restore(flags); return i + __i; #endif } Index: linux-2.6.24.7-rt27/drivers/pci/msi.c =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/pci/msi.c 2009-02-08 00:00:22.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/pci/msi.c 2009-02-08 00:04:20.000000000 -0500 @@ -30,7 +30,7 @@ static void msi_set_enable(struct pci_de int pos; u16 control; - pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + pos = 
pci_find_capability_cached(dev, PCI_CAP_ID_MSI); if (pos) { pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control); control &= ~PCI_MSI_FLAGS_ENABLE; @@ -45,7 +45,7 @@ static void msix_set_enable(struct pci_d int pos; u16 control; - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + pos = pci_find_capability_cached(dev, PCI_CAP_ID_MSIX); if (pos) { pci_read_config_word(dev, pos + PCI_MSIX_FLAGS, &control); control &= ~PCI_MSIX_FLAGS_ENABLE; @@ -241,6 +241,10 @@ static void __pci_restore_msi_state(stru return; entry = get_irq_msi(dev->irq); + if (!entry) { + WARN_ON(1); + return; + } pos = entry->msi_attrib.pos; pci_intx_for_msi(dev, 0); @@ -307,7 +311,7 @@ static int msi_capability_init(struct pc msi_set_enable(dev, 0); /* Ensure msi is disabled as I set it up */ - pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + pos = pci_find_capability_cached(dev, PCI_CAP_ID_MSI); pci_read_config_word(dev, msi_control_reg(pos), &control); /* MSI Entry Initialization */ entry = alloc_msi_entry(); @@ -380,7 +384,7 @@ static int msix_capability_init(struct p msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */ - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + pos = pci_find_capability_cached(dev, PCI_CAP_ID_MSIX); /* Request & Map MSI-X table region */ pci_read_config_word(dev, msi_control_reg(pos), &control); nr_entries = multi_msix_capable(control); @@ -487,7 +491,7 @@ static int pci_msi_check_device(struct p if (ret) return ret; - if (!pci_find_capability(dev, type)) + if (!pci_find_capability_cached(dev, type)) return -EINVAL; return 0; @@ -606,7 +610,7 @@ int pci_enable_msix(struct pci_dev* dev, if (status) return status; - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + pos = pci_find_capability_cached(dev, PCI_CAP_ID_MSIX); pci_read_config_word(dev, msi_control_reg(pos), &control); nr_entries = multi_msix_capable(control); if (nvec > nr_entries) Index: linux-2.6.24.7-rt27/drivers/block/floppy.c =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/block/floppy.c 2009-02-08 00:00:22.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/block/floppy.c 2009-02-08 00:01:27.000000000 -0500 @@ -4149,6 +4149,28 @@ static void floppy_device_release(struct complete(&device_release); } +static int floppy_suspend(struct platform_device *dev, pm_message_t state) +{ + floppy_release_irq_and_dma(); + + return 0; +} + +static int floppy_resume(struct platform_device *dev) +{ + floppy_grab_irq_and_dma(); + + return 0; +} + +static struct platform_driver floppy_driver = { + .suspend = floppy_suspend, + .resume = floppy_resume, + .driver = { + .name = "floppy", + }, +}; + static struct platform_device floppy_device[N_DRIVE]; static struct kobject *floppy_find(dev_t dev, int *part, void *data) @@ -4197,10 +4219,14 @@ static int __init floppy_init(void) if (err) goto out_put_disk; + err = platform_driver_register(&floppy_driver); + if (err) + goto out_unreg_blkdev; + floppy_queue = blk_init_queue(do_fd_request, &floppy_lock); if (!floppy_queue) { err = -ENOMEM; - goto out_unreg_blkdev; + goto out_unreg_driver; } blk_queue_max_sectors(floppy_queue, 64); @@ -4349,6 +4375,8 @@ out_flush_work: out_unreg_region: blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); blk_cleanup_queue(floppy_queue); +out_unreg_driver: + platform_driver_unregister(&floppy_driver); out_unreg_blkdev: unregister_blkdev(FLOPPY_MAJOR, "fd"); out_put_disk: @@ -4544,6 +4572,7 @@ void cleanup_module(void) init_completion(&device_release); blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 
256); unregister_blkdev(FLOPPY_MAJOR, "fd"); + platform_driver_unregister(&floppy_driver); for (drive = 0; drive < N_DRIVE; drive++) { del_timer_sync(&motor_off_timer[drive]); Index: linux-2.6.24.7-rt27/include/linux/hrtimer.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/linux/hrtimer.h 2009-02-08 00:00:22.000000000 -0500 +++ linux-2.6.24.7-rt27/include/linux/hrtimer.h 2009-02-08 00:02:49.000000000 -0500 @@ -191,7 +191,7 @@ struct hrtimer_clock_base { * @nr_events: Total number of timer interrupt events */ struct hrtimer_cpu_base { - spinlock_t lock; + raw_spinlock_t lock; struct lock_class_key lock_key; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; #ifdef CONFIG_HIGH_RES_TIMERS @@ -200,6 +200,9 @@ struct hrtimer_cpu_base { struct list_head cb_pending; unsigned long nr_events; #endif +#ifdef CONFIG_PREEMPT_SOFTIRQS + wait_queue_head_t wait; +#endif }; #ifdef CONFIG_HIGH_RES_TIMERS @@ -270,6 +273,13 @@ static inline int hrtimer_restart(struct return hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); } +/* Softirq preemption could deadlock timer removal */ +#ifdef CONFIG_PREEMPT_SOFTIRQS + extern void hrtimer_wait_for_timer(const struct hrtimer *timer); +#else +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0) +#endif + /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); @@ -297,6 +307,9 @@ static inline int hrtimer_is_queued(stru /* Forward a hrtimer so it expires after now: */ extern unsigned long hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); +/* Overrun count: */ +extern unsigned long +hrtimer_overrun(struct hrtimer *timer, ktime_t now, ktime_t interval); /* Precise sleep: */ extern long hrtimer_nanosleep(struct timespec *rqtp, Index: linux-2.6.24.7-rt27/arch/x86/kernel/io_apic_32.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/io_apic_32.c 2009-02-08 00:00:21.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/io_apic_32.c 2009-02-08 00:02:34.000000000 -0500 @@ -56,8 +56,8 @@ atomic_t irq_mis_count; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(vector_lock); int timer_over_8254 __initdata = 1; @@ -261,14 +261,14 @@ static void __unmask_IO_APIC_irq (unsign __modify_IO_APIC_irq(irq, 0, 0x00010000); } -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq (unsigned int irq) +/* trigger = 0 (edge mode) */ +static void __pcix_mask_IO_APIC_irq (unsigned int irq) { - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); + __modify_IO_APIC_irq(irq, 0, 0x00008000); } -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq (unsigned int irq) +/* mask = 0, trigger = 1 (level mode) */ +static void __pcix_unmask_IO_APIC_irq (unsigned int irq) { __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); } @@ -291,6 +291,24 @@ static void unmask_IO_APIC_irq (unsigned spin_unlock_irqrestore(&ioapic_lock, flags); } +static void pcix_mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __pcix_mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void pcix_unmask_IO_APIC_irq 
(unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __pcix_unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; @@ -1236,23 +1254,28 @@ static int assign_irq_vector(int irq) return vector; } + static struct irq_chip ioapic_chip; +static struct irq_chip pcix_ioapic_chip; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 -static void ioapic_register_intr(int irq, int vector, unsigned long trigger) +static void ioapic_register_intr(int irq, int vector, unsigned long trigger, + int pcix) { + struct irq_chip *chip = pcix ? &pcix_ioapic_chip : &ioapic_chip; + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) { irq_desc[irq].status |= IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); + set_irq_chip_and_handler_name(irq, chip, handle_fasteoi_irq, + pcix ? "pcix-fasteoi" : "fasteoi"); } else { irq_desc[irq].status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); + set_irq_chip_and_handler_name(irq, chip, handle_edge_irq, + pcix ? "pcix-edge" : "edge"); } set_intr_gate(vector, interrupt[irq]); } @@ -1322,7 +1345,8 @@ static void __init setup_IO_APIC_irqs(vo if (IO_APIC_IRQ(irq)) { vector = assign_irq_vector(irq); entry.vector = vector; - ioapic_register_intr(irq, vector, IOAPIC_AUTO); + ioapic_register_intr(irq, vector, IOAPIC_AUTO, + apic > 0); if (!apic && (irq < 16)) disable_8259A_irq(irq); @@ -1493,7 +1517,7 @@ void __init print_IO_APIC(void) return; } -#if 0 +#if 1 static void print_APIC_bitfield (int base) { @@ -1900,7 +1924,7 @@ static int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. 
*/ - if (jiffies - t1 > 4) + if (jiffies - t1 > 4 && jiffies - t1 < 16) return 1; return 0; @@ -1989,8 +2013,10 @@ static void ack_ioapic_quirk_irq(unsigne if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + /* mask = 1, trigger = 0 */ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); + /* mask = 0, trigger = 1 */ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); spin_unlock(&ioapic_lock); } } @@ -2015,6 +2041,18 @@ static struct irq_chip ioapic_chip __rea .retrigger = ioapic_retrigger_irq, }; +static struct irq_chip pcix_ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = pcix_mask_IO_APIC_irq, + .unmask = pcix_unmask_IO_APIC_irq, + .ack = ack_ioapic_irq, + .eoi = ack_ioapic_irq, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; static inline void init_IO_APIC_traps(void) { @@ -2827,7 +2865,7 @@ int io_apic_set_pci_routing (int ioapic, mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, edge_level, active_high_low); - ioapic_register_intr(irq, entry.vector, edge_level); + ioapic_register_intr(irq, entry.vector, edge_level, ioapic > 0); if (!ioapic && (irq < 16)) disable_8259A_irq(irq); Index: linux-2.6.24.7-rt27/arch/x86/pci/Makefile_32 =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/pci/Makefile_32 2009-02-08 00:00:21.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/pci/Makefile_32 2009-02-08 00:01:29.000000000 -0500 @@ -4,8 +4,9 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_32.o direct.o mmconfig-shared.o obj-$(CONFIG_PCI_DIRECT) += direct.o +obj-$(CONFIG_ACPI) += acpi.o + pci-y := fixup.o -pci-$(CONFIG_ACPI) += acpi.o pci-y += legacy.o irq.o pci-$(CONFIG_X86_VISWS) := visws.o fixup.o Index: linux-2.6.24.7-rt27/include/linux/spinlock.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/linux/spinlock.h 2009-02-08 00:00:21.000000000 -0500 +++ linux-2.6.24.7-rt27/include/linux/spinlock.h 2009-02-08 00:04:23.000000000 -0500 @@ -44,6 +44,42 @@ * builds the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. 
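+ *
+ * On PREEMPT_RT one spin_lock() API is backed by two implementations;
+ * the PICK_SPIN_OP()/PICK_RW_OP() machinery further down selects one
+ * at compile time from the static type of the lock argument.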
+ * + * + * Public types and naming conventions: + * ------------------------------------ + * spinlock_t: type: sleep-lock + * raw_spinlock_t: type: spin-lock (debug) + * + * spin_lock([raw_]spinlock_t): API: acquire lock, both types + * + * + * Internal types and naming conventions: + * ------------------------------------- + * __raw_spinlock_t: type: lowlevel spin-lock + * + * _spin_lock(struct rt_mutex): API: acquire sleep-lock + * __spin_lock(raw_spinlock_t): API: acquire spin-lock (highlevel) + * _raw_spin_lock(raw_spinlock_t): API: acquire spin-lock (debug) + * __raw_spin_lock(__raw_spinlock_t): API: acquire spin-lock (lowlevel) + * + * + * spin_lock(raw_spinlock_t) translates into the following chain of + * calls/inlines/macros, if spin-lock debugging is enabled: + * + * spin_lock() [include/linux/spinlock.h] + * -> __spin_lock() [kernel/spinlock.c] + * -> _raw_spin_lock() [lib/spinlock_debug.c] + * -> __raw_spin_lock() [include/asm/spinlock.h] + * + * spin_lock(spinlock_t) translates into the following chain of + * calls/inlines/macros: + * + * spin_lock() [include/linux/spinlock.h] + * -> _spin_lock() [include/linux/spinlock.h] + * -> rt_spin_lock() [kernel/rtmutex.c] + * -> rt_spin_lock_fastlock() [kernel/rtmutex.c] + * -> rt_spin_lock_slowlock() [kernel/rtmutex.c] */ #include @@ -51,29 +87,15 @@ #include #include #include +#include #include #include +#include +#include #include /* - * Must define these before including other files, inline functions need them - */ -#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME - -#define LOCK_SECTION_START(extra) \ - ".subsection 1\n\t" \ - extra \ - ".ifndef " LOCK_SECTION_NAME "\n\t" \ - LOCK_SECTION_NAME ":\n\t" \ - ".endif\n" - -#define LOCK_SECTION_END \ - ".previous\n\t" - -#define __lockfunc fastcall __attribute__((section(".spinlock.text"))) - -/* * Pull the raw_spinlock_t and raw_rwlock_t definitions: */ #include @@ -89,42 +111,10 @@ extern int __lockfunc generic__raw_read_ # include #endif -#ifdef CONFIG_DEBUG_SPINLOCK - extern void __spin_lock_init(spinlock_t *lock, const char *name, - struct lock_class_key *key); -# define spin_lock_init(lock) \ -do { \ - static struct lock_class_key __key; \ - \ - __spin_lock_init((lock), #lock, &__key); \ -} while (0) - -#else -# define spin_lock_init(lock) \ - do { *(lock) = SPIN_LOCK_UNLOCKED; } while (0) -#endif - -#ifdef CONFIG_DEBUG_SPINLOCK - extern void __rwlock_init(rwlock_t *lock, const char *name, - struct lock_class_key *key); -# define rwlock_init(lock) \ -do { \ - static struct lock_class_key __key; \ - \ - __rwlock_init((lock), #lock, &__key); \ -} while (0) -#else -# define rwlock_init(lock) \ - do { *(lock) = RW_LOCK_UNLOCKED; } while (0) -#endif - -#define spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) - -/** - * spin_unlock_wait - wait until the spinlock gets unlocked - * @lock: the spinlock in question. 
+/*
+ * Pull the RT types:
+ */
-#define spin_unlock_wait(lock)	__raw_spin_unlock_wait(&(lock)->raw_lock)
+#include

 /*
  * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
@@ -136,16 +126,16 @@ do {	\
 #endif

 #ifdef CONFIG_DEBUG_SPINLOCK
- extern void _raw_spin_lock(spinlock_t *lock);
-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
- extern int _raw_spin_trylock(spinlock_t *lock);
- extern void _raw_spin_unlock(spinlock_t *lock);
- extern void _raw_read_lock(rwlock_t *lock);
- extern int _raw_read_trylock(rwlock_t *lock);
- extern void _raw_read_unlock(rwlock_t *lock);
- extern void _raw_write_lock(rwlock_t *lock);
- extern int _raw_write_trylock(rwlock_t *lock);
- extern void _raw_write_unlock(rwlock_t *lock);
+ extern __lockfunc void _raw_spin_lock(raw_spinlock_t *lock);
+# define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
+ extern __lockfunc int _raw_spin_trylock(raw_spinlock_t *lock);
+ extern __lockfunc void _raw_spin_unlock(raw_spinlock_t *lock);
+ extern __lockfunc void _raw_read_lock(raw_rwlock_t *lock);
+ extern __lockfunc int _raw_read_trylock(raw_rwlock_t *lock);
+ extern __lockfunc void _raw_read_unlock(raw_rwlock_t *lock);
+ extern __lockfunc void _raw_write_lock(raw_rwlock_t *lock);
+ extern __lockfunc int _raw_write_trylock(raw_rwlock_t *lock);
+ extern __lockfunc void _raw_write_unlock(raw_rwlock_t *lock);
 #else
# define _raw_spin_lock(lock)		__raw_spin_lock(&(lock)->raw_lock)
# define _raw_spin_lock_flags(lock, flags) \
@@ -160,141 +150,446 @@ do {	\
# define _raw_write_unlock(rwlock)	__raw_write_unlock(&(rwlock)->raw_lock)
 #endif

-#define read_can_lock(rwlock)		__raw_read_can_lock(&(rwlock)->raw_lock)
-#define write_can_lock(rwlock)		__raw_write_can_lock(&(rwlock)->raw_lock)
+extern int __bad_spinlock_type(void);
+extern int __bad_rwlock_type(void);
+
+extern void
+__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
+
+extern void __lockfunc rt_spin_lock(spinlock_t *lock);
+extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
+extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
+extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
+extern int __lockfunc
+rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
+extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
+extern int _atomic_dec_and_spin_lock(spinlock_t *lock, atomic_t *atomic);
+
+/*
+ * lockdep-less calls, for derived types like rwlock:
+ * (for trylock they can use rt_mutex_trylock() directly.)
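+ * An rwlock_t on RT wraps the same underlying rt_mutex, so its
+ * read/write paths reuse these entry points and do their own lockdep
+ * annotation at the rwlock level.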
+ */ +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock); +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); + +#ifdef CONFIG_PREEMPT_RT +# define _spin_lock(l) rt_spin_lock(l) +# define _spin_lock_nested(l, s) rt_spin_lock_nested(l, s) +# define _spin_lock_bh(l) rt_spin_lock(l) +# define _spin_lock_irq(l) rt_spin_lock(l) +# define _spin_unlock(l) rt_spin_unlock(l) +# define _spin_unlock_no_resched(l) rt_spin_unlock(l) +# define _spin_unlock_bh(l) rt_spin_unlock(l) +# define _spin_unlock_irq(l) rt_spin_unlock(l) +# define _spin_unlock_irqrestore(l, f) rt_spin_unlock(l) +static inline unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +{ + rt_spin_lock(lock); + return 0; +} +static inline unsigned long __lockfunc +_spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +{ + rt_spin_lock_nested(lock, subclass); + return 0; +} +#else +static inline unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +{ + return 0; +} +static inline unsigned long __lockfunc +_spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +{ + return 0; +} +# define _spin_lock(l) do { } while (0) +# define _spin_lock_nested(l, s) do { } while (0) +# define _spin_lock_bh(l) do { } while (0) +# define _spin_lock_irq(l) do { } while (0) +# define _spin_unlock(l) do { } while (0) +# define _spin_unlock_no_resched(l) do { } while (0) +# define _spin_unlock_bh(l) do { } while (0) +# define _spin_unlock_irq(l) do { } while (0) +# define _spin_unlock_irqrestore(l, f) do { } while (0) +#endif + +#define _spin_lock_init(sl, n, f, l) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_spin_lock_init(sl, n, &__key); \ +} while (0) + +# ifdef CONFIG_PREEMPT_RT +# define _spin_can_lock(l) (!rt_mutex_is_locked(&(l)->lock)) +# define _spin_is_locked(l) rt_mutex_is_locked(&(l)->lock) +# define _spin_unlock_wait(l) rt_spin_unlock_wait(l) + +# define _spin_trylock(l) rt_spin_trylock(l) +# define _spin_trylock_bh(l) rt_spin_trylock(l) +# define _spin_trylock_irq(l) rt_spin_trylock(l) +# define _spin_trylock_irqsave(l,f) rt_spin_trylock_irqsave(l, f) +# else + + extern int this_should_never_be_called_on_non_rt(spinlock_t *lock); +# define TSNBCONRT(l) this_should_never_be_called_on_non_rt(l) +# define _spin_can_lock(l) TSNBCONRT(l) +# define _spin_is_locked(l) TSNBCONRT(l) +# define _spin_unlock_wait(l) TSNBCONRT(l) + +# define _spin_trylock(l) TSNBCONRT(l) +# define _spin_trylock_bh(l) TSNBCONRT(l) +# define _spin_trylock_irq(l) TSNBCONRT(l) +# define _spin_trylock_irqsave(l,f) TSNBCONRT(l) +#endif + +extern void __lockfunc rt_write_lock(rwlock_t *rwlock); +extern void __lockfunc rt_read_lock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, + unsigned long *flags); +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock); +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock); +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock); +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock); +extern void +__rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key); + +#define _rwlock_init(rwl, n, f, l) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_rwlock_init(rwl, n, &__key); \ +} while (0) + +#ifdef CONFIG_PREEMPT_RT +# define rt_read_can_lock(rwl) (!rt_mutex_is_locked(&(rwl)->lock)) +# define rt_write_can_lock(rwl) ((rwl)->owners.owner == NULL) +#else + extern int 
rt_rwlock_can_lock_never_call_on_non_rt(rwlock_t *rwlock); +# define rt_read_can_lock(rwl) rt_rwlock_can_lock_never_call_on_non_rt(rwl) +# define rt_write_can_lock(rwl) rt_rwlock_can_lock_never_call_on_non_rt(rwl) +#endif + +# define _read_can_lock(rwl) rt_read_can_lock(rwl) +# define _write_can_lock(rwl) rt_write_can_lock(rwl) + +# define _read_trylock(rwl) rt_read_trylock(rwl) +# define _write_trylock(rwl) rt_write_trylock(rwl) +# define _write_trylock_irqsave(rwl, flags) \ + rt_write_trylock_irqsave(rwl, flags) + +# define _read_lock(rwl) rt_read_lock(rwl) +# define _write_lock(rwl) rt_write_lock(rwl) +# define _read_unlock(rwl) rt_read_unlock(rwl) +# define _write_unlock(rwl) rt_write_unlock(rwl) + +# define _read_lock_bh(rwl) rt_read_lock(rwl) +# define _write_lock_bh(rwl) rt_write_lock(rwl) +# define _read_unlock_bh(rwl) rt_read_unlock(rwl) +# define _write_unlock_bh(rwl) rt_write_unlock(rwl) + +# define _read_lock_irq(rwl) rt_read_lock(rwl) +# define _write_lock_irq(rwl) rt_write_lock(rwl) +# define _read_unlock_irq(rwl) rt_read_unlock(rwl) +# define _write_unlock_irq(rwl) rt_write_unlock(rwl) + +# define _read_lock_irqsave(rwl) rt_read_lock_irqsave(rwl) +# define _write_lock_irqsave(rwl) rt_write_lock_irqsave(rwl) + +# define _read_unlock_irqrestore(rwl, f) rt_read_unlock(rwl) +# define _write_unlock_irqrestore(rwl, f) rt_write_unlock(rwl) + +#ifdef CONFIG_DEBUG_SPINLOCK + extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, + struct lock_class_key *key); +# define _raw_spin_lock_init(lock, name, file, line) \ +do { \ + static struct lock_class_key __key; \ + \ + __raw_spin_lock_init((lock), #lock, &__key); \ +} while (0) + +#else +#define __raw_spin_lock_init(lock) \ + do { *(lock) = RAW_SPIN_LOCK_UNLOCKED(lock); } while (0) +# define _raw_spin_lock_init(lock, name, file, line) __raw_spin_lock_init(lock) +#endif + +/* + * PICK_SPIN_OP()/PICK_RW_OP() are simple redirectors for PICK_FUNCTION + */ +#define PICK_SPIN_OP(...) \ + PICK_FUNCTION(raw_spinlock_t *, spinlock_t *, ##__VA_ARGS__) +#define PICK_SPIN_OP_RET(...) \ + PICK_FUNCTION_RET(raw_spinlock_t *, spinlock_t *, ##__VA_ARGS__) +#define PICK_RW_OP(...) PICK_FUNCTION(raw_rwlock_t *, rwlock_t *, ##__VA_ARGS__) +#define PICK_RW_OP_RET(...) \ + PICK_FUNCTION_RET(raw_rwlock_t *, rwlock_t *, ##__VA_ARGS__) + +#define spin_lock_init(lock) \ + PICK_SPIN_OP(_raw_spin_lock_init, _spin_lock_init, lock, #lock, \ + __FILE__, __LINE__) + +#ifdef CONFIG_DEBUG_SPINLOCK + extern void __raw_rwlock_init(raw_rwlock_t *lock, const char *name, + struct lock_class_key *key); +# define _raw_rwlock_init(lock, name, file, line) \ +do { \ + static struct lock_class_key __key; \ + \ + __raw_rwlock_init((lock), #lock, &__key); \ +} while (0) +#else +#define __raw_rwlock_init(lock) \ + do { *(lock) = RAW_RW_LOCK_UNLOCKED(lock); } while (0) +# define _raw_rwlock_init(lock, name, file, line) __raw_rwlock_init(lock) +#endif + +#define rwlock_init(lock) \ + PICK_RW_OP(_raw_rwlock_init, _rwlock_init, lock, #lock, \ + __FILE__, __LINE__) + +#define __spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) + +#define spin_is_locked(lock) \ + PICK_SPIN_OP_RET(__spin_is_locked, _spin_is_locked, lock) + +#define __spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock) + +#define spin_unlock_wait(lock) \ + PICK_SPIN_OP(__spin_unlock_wait, _spin_unlock_wait, lock) /* * Define the various spin_lock and rw_lock methods. Note we define these * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. 
The various * methods are defined as nops in the case they are not required. */ -#define spin_trylock(lock) __cond_lock(lock, _spin_trylock(lock)) -#define read_trylock(lock) __cond_lock(lock, _read_trylock(lock)) -#define write_trylock(lock) __cond_lock(lock, _write_trylock(lock)) +#define spin_trylock(lock) \ + __cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock, _spin_trylock, lock)) + +#define read_trylock(lock) \ + __cond_lock(lock, PICK_RW_OP_RET(__read_trylock, _read_trylock, lock)) + +#define write_trylock(lock) \ + __cond_lock(lock, PICK_RW_OP_RET(__write_trylock, _write_trylock, lock)) -#define spin_lock(lock) _spin_lock(lock) +#define write_trylock_irqsave(lock, flags) \ + __cond_lock(lock, PICK_RW_OP_RET(__write_trylock_irqsave, \ + _write_trylock_irqsave, lock, &flags)) + +#define __spin_can_lock(lock) __raw_spin_can_lock(&(lock)->raw_lock) +#define __read_can_lock(lock) __raw_read_can_lock(&(lock)->raw_lock) +#define __write_can_lock(lock) __raw_write_can_lock(&(lock)->raw_lock) + +#define spin_can_lock(lock) \ + __cond_lock(lock, PICK_SPIN_OP_RET(__spin_can_lock, _spin_can_lock,\ + lock)) + +#define read_can_lock(lock) \ + __cond_lock(lock, PICK_RW_OP_RET(__read_can_lock, _read_can_lock, lock)) + +#define write_can_lock(lock) \ + __cond_lock(lock, PICK_RW_OP_RET(__write_can_lock, _write_can_lock,\ + lock)) + +#define spin_lock(lock) PICK_SPIN_OP(__spin_lock, _spin_lock, lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC -# define spin_lock_nested(lock, subclass) _spin_lock_nested(lock, subclass) +# define spin_lock_nested(lock, subclass) \ + PICK_SPIN_OP(__spin_lock_nested, _spin_lock_nested, lock, subclass) #else -# define spin_lock_nested(lock, subclass) _spin_lock(lock) +# define spin_lock_nested(lock, subclass) spin_lock(lock) #endif -#define write_lock(lock) _write_lock(lock) -#define read_lock(lock) _read_lock(lock) +#define write_lock(lock) PICK_RW_OP(__write_lock, _write_lock, lock) -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +#define read_lock(lock) PICK_RW_OP(__read_lock, _read_lock, lock) -#define spin_lock_irqsave(lock, flags) flags = _spin_lock_irqsave(lock) -#define read_lock_irqsave(lock, flags) flags = _read_lock_irqsave(lock) -#define write_lock_irqsave(lock, flags) flags = _write_lock_irqsave(lock) +# define spin_lock_irqsave(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + flags = PICK_SPIN_OP_RET(__spin_lock_irqsave, _spin_lock_irqsave, \ + lock); \ +} while (0) #ifdef CONFIG_DEBUG_LOCK_ALLOC -#define spin_lock_irqsave_nested(lock, flags, subclass) \ - flags = _spin_lock_irqsave_nested(lock, subclass) +# define spin_lock_irqsave_nested(lock, flags, subclass) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + flags = PICK_SPIN_OP_RET(__spin_lock_irqsave_nested, \ + _spin_lock_irqsave_nested, lock, subclass); \ +} while (0) #else -#define spin_lock_irqsave_nested(lock, flags, subclass) \ - flags = _spin_lock_irqsave(lock) +# define spin_lock_irqsave_nested(lock, flags, subclass) \ + spin_lock_irqsave(lock, flags) #endif -#else +# define read_lock_irqsave(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + flags = PICK_RW_OP_RET(__read_lock_irqsave, _read_lock_irqsave, lock);\ +} while (0) + +# define write_lock_irqsave(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + flags = PICK_RW_OP_RET(__write_lock_irqsave, _write_lock_irqsave,lock);\ +} while (0) -#define spin_lock_irqsave(lock, flags) _spin_lock_irqsave(lock, flags) -#define read_lock_irqsave(lock, flags) _read_lock_irqsave(lock, flags) -#define write_lock_irqsave(lock, flags) 
_write_lock_irqsave(lock, flags) -#define spin_lock_irqsave_nested(lock, flags, subclass) \ - spin_lock_irqsave(lock, flags) +#define spin_lock_irq(lock) PICK_SPIN_OP(__spin_lock_irq, _spin_lock_irq, lock) -#endif +#define spin_lock_bh(lock) PICK_SPIN_OP(__spin_lock_bh, _spin_lock_bh, lock) + +#define read_lock_irq(lock) PICK_RW_OP(__read_lock_irq, _read_lock_irq, lock) -#define spin_lock_irq(lock) _spin_lock_irq(lock) -#define spin_lock_bh(lock) _spin_lock_bh(lock) +#define read_lock_bh(lock) PICK_RW_OP(__read_lock_bh, _read_lock_bh, lock) -#define read_lock_irq(lock) _read_lock_irq(lock) -#define read_lock_bh(lock) _read_lock_bh(lock) +#define write_lock_irq(lock) PICK_RW_OP(__write_lock_irq, _write_lock_irq, lock) -#define write_lock_irq(lock) _write_lock_irq(lock) -#define write_lock_bh(lock) _write_lock_bh(lock) +#define write_lock_bh(lock) PICK_RW_OP(__write_lock_bh, _write_lock_bh, lock) + +#define spin_unlock(lock) PICK_SPIN_OP(__spin_unlock, _spin_unlock, lock) + +#define read_unlock(lock) PICK_RW_OP(__read_unlock, _read_unlock, lock) + +#define write_unlock(lock) PICK_RW_OP(__write_unlock, _write_unlock, lock) + +#define spin_unlock_no_resched(lock) \ + PICK_SPIN_OP(__spin_unlock_no_resched, _spin_unlock_no_resched, lock) + +#define spin_unlock_irqrestore(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + PICK_SPIN_OP(__spin_unlock_irqrestore, _spin_unlock_irqrestore, \ + lock, flags); \ +} while (0) + +#define spin_unlock_irq(lock) \ + PICK_SPIN_OP(__spin_unlock_irq, _spin_unlock_irq, lock) +#define spin_unlock_bh(lock) \ + PICK_SPIN_OP(__spin_unlock_bh, _spin_unlock_bh, lock) + +#define read_unlock_irqrestore(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + PICK_RW_OP(__read_unlock_irqrestore, _read_unlock_irqrestore, \ + lock, flags); \ +} while (0) + +#define read_unlock_irq(lock) \ + PICK_RW_OP(__read_unlock_irq, _read_unlock_irq, lock) +#define read_unlock_bh(lock) PICK_RW_OP(__read_unlock_bh, _read_unlock_bh, lock) + +#define write_unlock_irqrestore(lock, flags) \ +do { \ + BUILD_CHECK_IRQ_FLAGS(flags); \ + PICK_RW_OP(__write_unlock_irqrestore, _write_unlock_irqrestore, \ + lock, flags); \ +} while (0) +#define write_unlock_irq(lock) \ + PICK_RW_OP(__write_unlock_irq, _write_unlock_irq, lock) + +#define write_unlock_bh(lock) \ + PICK_RW_OP(__write_unlock_bh, _write_unlock_bh, lock) + +#define spin_trylock_bh(lock) \ + __cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock_bh, _spin_trylock_bh,\ + lock)) + +#define spin_trylock_irq(lock) \ + __cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock_irq, \ + _spin_trylock_irq, lock)) + +#define spin_trylock_irqsave(lock, flags) \ + __cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock_irqsave, \ + _spin_trylock_irqsave, lock, &flags)) + +/* "lock on reference count zero" */ +#ifndef ATOMIC_DEC_AND_LOCK +# include + extern int __atomic_dec_and_spin_lock(raw_spinlock_t *lock, atomic_t *atomic); +#endif + +#define atomic_dec_and_lock(atomic, lock) \ + __cond_lock(lock, PICK_SPIN_OP_RET(__atomic_dec_and_spin_lock, \ + _atomic_dec_and_spin_lock, lock, atomic)) /* - * We inline the unlock functions in the nondebug case: + * bit-based spin_lock() + * + * Don't use this unless you really need to: spin_lock() and spin_unlock() + * are significantly faster. 
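+ *
+ * The usual pattern packs the lock into a spare bit of an existing
+ * word, e.g. fs/buffer.c serializes end_io state with
+ *	bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
+ * instead of growing struct buffer_head by a full spinlock.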
*/ -#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || \ - !defined(CONFIG_SMP) -# define spin_unlock(lock) _spin_unlock(lock) -# define read_unlock(lock) _read_unlock(lock) -# define write_unlock(lock) _write_unlock(lock) -# define spin_unlock_irq(lock) _spin_unlock_irq(lock) -# define read_unlock_irq(lock) _read_unlock_irq(lock) -# define write_unlock_irq(lock) _write_unlock_irq(lock) -#else -# define spin_unlock(lock) \ - do {__raw_spin_unlock(&(lock)->raw_lock); __release(lock); } while (0) -# define read_unlock(lock) \ - do {__raw_read_unlock(&(lock)->raw_lock); __release(lock); } while (0) -# define write_unlock(lock) \ - do {__raw_write_unlock(&(lock)->raw_lock); __release(lock); } while (0) -# define spin_unlock_irq(lock) \ -do { \ - __raw_spin_unlock(&(lock)->raw_lock); \ - __release(lock); \ - local_irq_enable(); \ -} while (0) -# define read_unlock_irq(lock) \ -do { \ - __raw_read_unlock(&(lock)->raw_lock); \ - __release(lock); \ - local_irq_enable(); \ -} while (0) -# define write_unlock_irq(lock) \ -do { \ - __raw_write_unlock(&(lock)->raw_lock); \ - __release(lock); \ - local_irq_enable(); \ -} while (0) +static inline void bit_spin_lock(int bitnum, unsigned long *addr) +{ + /* + * Assuming the lock is uncontended, this never enters + * the body of the outer loop. If it is contended, then + * within the inner loop a non-atomic test is used to + * busywait with less bus contention for a good time to + * attempt to acquire the lock bit. + */ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) + while (test_and_set_bit(bitnum, addr)) + while (test_bit(bitnum, addr)) + cpu_relax(); #endif + __acquire(bitlock); +} -#define spin_unlock_irqrestore(lock, flags) \ - _spin_unlock_irqrestore(lock, flags) -#define spin_unlock_bh(lock) _spin_unlock_bh(lock) - -#define read_unlock_irqrestore(lock, flags) \ - _read_unlock_irqrestore(lock, flags) -#define read_unlock_bh(lock) _read_unlock_bh(lock) - -#define write_unlock_irqrestore(lock, flags) \ - _write_unlock_irqrestore(lock, flags) -#define write_unlock_bh(lock) _write_unlock_bh(lock) - -#define spin_trylock_bh(lock) __cond_lock(lock, _spin_trylock_bh(lock)) - -#define spin_trylock_irq(lock) \ -({ \ - local_irq_disable(); \ - spin_trylock(lock) ? \ - 1 : ({ local_irq_enable(); 0; }); \ -}) +/* + * Return true if it was acquired + */ +static inline int bit_spin_trylock(int bitnum, unsigned long *addr) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) + if (test_and_set_bit(bitnum, addr)) + return 0; +#endif + __acquire(bitlock); + return 1; +} -#define spin_trylock_irqsave(lock, flags) \ -({ \ - local_irq_save(flags); \ - spin_trylock(lock) ? \ - 1 : ({ local_irq_restore(flags); 0; }); \ -}) +/* + * bit-based spin_unlock() + */ +static inline void bit_spin_unlock(int bitnum, unsigned long *addr) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) + BUG_ON(!test_bit(bitnum, addr)); + smp_mb__before_clear_bit(); + clear_bit(bitnum, addr); +#endif + __release(bitlock); +} -#define write_trylock_irqsave(lock, flags) \ -({ \ - local_irq_save(flags); \ - write_trylock(lock) ? \ - 1 : ({ local_irq_restore(flags); 0; }); \ -}) +/* + * Return true if the lock is held. 
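+ * (Without SMP, spinlock debugging or preemption there is no lock
+ * bit to test, so the fallback below reports the lock as held.)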
+ */ +static inline int bit_spin_is_locked(int bitnum, unsigned long *addr) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) + return test_bit(bitnum, addr); +#else + return 1; +#endif +} + +/** + * __raw_spin_can_lock - would __raw_spin_trylock() succeed? + * @lock: the spinlock in question. + */ +#define __raw_spin_can_lock(lock) (!__raw_spin_is_locked(lock)) /* * Locks two spinlocks l1 and l2. * l1_first indicates if spinlock l1 should be taken first. */ -static inline void double_spin_lock(spinlock_t *l1, spinlock_t *l2, - bool l1_first) +static inline void +raw_double_spin_lock(raw_spinlock_t *l1, raw_spinlock_t *l2, bool l1_first) __acquires(l1) __acquires(l2) { @@ -307,13 +602,29 @@ static inline void double_spin_lock(spin } } +static inline void +double_spin_lock(spinlock_t *l1, spinlock_t *l2, bool l1_first) + __acquires(l1) + __acquires(l2) +{ + if (l1_first) { + spin_lock(l1); + spin_lock(l2); + } else { + spin_lock(l2); + spin_lock(l1); + } +} + + /* * Unlocks two spinlocks l1 and l2. * l1_taken_first indicates if spinlock l1 was taken first and therefore * should be released after spinlock l2. */ -static inline void double_spin_unlock(spinlock_t *l1, spinlock_t *l2, - bool l1_taken_first) +static inline void +raw_double_spin_unlock(raw_spinlock_t *l1, raw_spinlock_t *l2, + bool l1_taken_first) __releases(l1) __releases(l2) { @@ -326,24 +637,19 @@ static inline void double_spin_unlock(sp } } -/* - * Pull the atomic_t declaration: - * (asm-mips/atomic.h needs above definitions) - */ -#include -/** - * atomic_dec_and_lock - lock on reaching reference count zero - * @atomic: the atomic counter - * @lock: the spinlock in question - */ -extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); -#define atomic_dec_and_lock(atomic, lock) \ - __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) - -/** - * spin_can_lock - would spin_trylock() succeed? - * @lock: the spinlock in question. - */ -#define spin_can_lock(lock) (!spin_is_locked(lock)) +static inline void +double_spin_unlock(spinlock_t *l1, spinlock_t *l2, bool l1_taken_first) + __releases(l1) + __releases(l2) +{ + if (l1_taken_first) { + spin_unlock(l2); + spin_unlock(l1); + } else { + spin_unlock(l1); + spin_unlock(l2); + } +} #endif /* __LINUX_SPINLOCK_H */ + Index: linux-2.6.24.7-rt27/kernel/irq/migration.c =================================================================== --- linux-2.6.24.7-rt27.orig/kernel/irq/migration.c 2009-02-08 00:00:21.000000000 -0500 +++ linux-2.6.24.7-rt27/kernel/irq/migration.c 2009-02-08 00:01:30.000000000 -0500 @@ -61,6 +61,7 @@ void move_masked_irq(int irq) void move_native_irq(int irq) { struct irq_desc *desc = irq_desc + irq; + int mask = 1; if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; @@ -68,8 +69,17 @@ void move_native_irq(int irq) if (unlikely(desc->status & IRQ_DISABLED)) return; - desc->chip->mask(irq); + /* + * If the irq is already in progress, it should be masked. + * If we unmask it, we might cause an interrupt storm on RT. 
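+ * With threaded interrupt handlers the line stays masked until the
+ * handler thread completes, so an IRQ_INPROGRESS irq is already
+ * masked and must not be unmasked behind the thread's back.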
+ */ + if (unlikely(desc->status & IRQ_INPROGRESS)) + mask = 0; + + if (mask) + desc->chip->mask(irq); move_masked_irq(irq); - desc->chip->unmask(irq); + if (mask) + desc->chip->unmask(irq); } Index: linux-2.6.24.7-rt27/arch/x86/kernel/io_apic_64.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/io_apic_64.c 2009-02-08 00:00:21.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/io_apic_64.c 2009-02-08 00:04:11.000000000 -0500 @@ -91,8 +91,8 @@ int timer_over_8254 __initdata = 1; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); -DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +DEFINE_RAW_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -205,6 +205,9 @@ static inline void io_apic_sync(unsigned reg ACTION; \ io_apic_modify(entry->apic, reg); \ FINAL; \ + /* Force POST flush by reading: */ \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + \ if (!entry->next) \ break; \ entry = irq_2_pin + entry->next; \ @@ -349,10 +352,11 @@ static void add_pin_to_irq(unsigned int static void name##_IO_APIC_irq (unsigned int irq) \ __DO_ACTION(R, ACTION, FINAL) -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ +DO_ACTION( __mask, 0, |= 0x00010000, ) /* mask = 1 */ +DO_ACTION( __unmask, 0, &= 0xfffeffff, ) /* mask = 0 */ + +DO_ACTION( __pcix_mask, 0, &= 0xffff7fff, ) /* edge */ +DO_ACTION( __pcix_unmask, 0, = (reg & 0xfffeffff) | 0x00008000, ) /* level */ static void mask_IO_APIC_irq (unsigned int irq) { @@ -371,6 +375,23 @@ static void unmask_IO_APIC_irq (unsigned __unmask_IO_APIC_irq(irq); spin_unlock_irqrestore(&ioapic_lock, flags); } +static void pcix_mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __pcix_mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void pcix_unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __pcix_unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { @@ -796,17 +817,20 @@ void __setup_vector_irq(int cpu) static struct irq_chip ioapic_chip; +static struct irq_chip pcix_ioapic_chip; -static void ioapic_register_intr(int irq, unsigned long trigger) +static void ioapic_register_intr(int irq, unsigned long trigger, int pcix) { + struct irq_chip *chip = pcix ? &pcix_ioapic_chip : &ioapic_chip; + if (trigger) { irq_desc[irq].status |= IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); + set_irq_chip_and_handler_name(irq, chip, handle_fasteoi_irq, + pcix ? "pcix-fasteoi" : "fasteoi"); } else { irq_desc[irq].status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); + set_irq_chip_and_handler_name(irq, chip, handle_edge_irq, + pcix ? 
"pcix-edge" : "edge"); } } @@ -851,7 +875,7 @@ static void setup_IO_APIC_irq(int apic, if (trigger) entry.mask = 1; - ioapic_register_intr(irq, trigger); + ioapic_register_intr(irq, trigger, apic > 0); if (irq < 16) disable_8259A_irq(irq); @@ -1440,7 +1464,8 @@ static void ack_apic_level(unsigned int irq_complete_move(irq); #if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) /* If we are moving the irq we need to mask it */ - if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { + if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING) && + !(irq_desc[irq].status & IRQ_INPROGRESS)) { do_unmask_irq = 1; mask_IO_APIC_irq(irq); } @@ -1484,17 +1509,39 @@ static void ack_apic_level(unsigned int move_masked_irq(irq); unmask_IO_APIC_irq(irq); } +#if (defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)) && \ + defined(CONFIG_PREEMPT_HARDIRQS) + /* + * With threaded interrupts, we always have IRQ_INPROGRESS + * when acking. + */ + else if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) + move_masked_irq(irq); +#endif } static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_apic_edge, - .eoi = ack_apic_level, + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +static struct irq_chip pcix_ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = pcix_mask_IO_APIC_irq, + .unmask = pcix_unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, + .set_affinity = set_ioapic_affinity_irq, #endif .retrigger = ioapic_retrigger_irq, }; @@ -1694,7 +1741,6 @@ static inline void __init check_timer(vo */ unmask_IO_APIC_irq(0); if (!no_timer_check && timer_irq_works()) { - nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); setup_nmi(); @@ -1720,7 +1766,6 @@ static inline void __init check_timer(vo setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector); if (timer_irq_works()) { apic_printk(APIC_VERBOSE," works.\n"); - nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); } Index: linux-2.6.24.7-rt27/kernel/audit.c =================================================================== --- linux-2.6.24.7-rt27.orig/kernel/audit.c 2009-02-08 00:00:21.000000000 -0500 +++ linux-2.6.24.7-rt27/kernel/audit.c 2009-02-08 00:01:32.000000000 -0500 @@ -1130,7 +1130,7 @@ struct audit_buffer *audit_log_start(str { struct audit_buffer *ab = NULL; struct timespec t; - unsigned int serial; + unsigned int serial = 0 /* shut up gcc */; int reserve; unsigned long timeout_start = jiffies; Index: linux-2.6.24.7-rt27/net/core/flow.c =================================================================== --- linux-2.6.24.7-rt27.orig/net/core/flow.c 2009-02-08 00:00:21.000000000 -0500 +++ linux-2.6.24.7-rt27/net/core/flow.c 2009-02-08 00:02:10.000000000 -0500 @@ -40,9 +40,10 @@ atomic_t flow_cache_genid = ATOMIC_INIT( static u32 flow_hash_shift; #define flow_hash_size (1 << flow_hash_shift) -static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; -#define flow_table(cpu) (per_cpu(flow_tables, cpu)) +static DEFINE_PER_CPU_LOCKED(struct flow_cache_entry **, flow_tables); + +#define flow_table(cpu) 
(per_cpu_var_locked(flow_tables, cpu)) static struct kmem_cache *flow_cachep __read_mostly; @@ -169,24 +170,24 @@ static int flow_key_compare(struct flowi void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, flow_resolve_t resolver) { - struct flow_cache_entry *fle, **head; + struct flow_cache_entry **table, *fle, **head = NULL /* shut up GCC */; unsigned int hash; int cpu; local_bh_disable(); - cpu = smp_processor_id(); + table = get_cpu_var_locked(flow_tables, &cpu); fle = NULL; /* Packet really early in init? Making flow_cache_init a * pre-smp initcall would solve this. --RR */ - if (!flow_table(cpu)) + if (!table) goto nocache; if (flow_hash_rnd_recalc(cpu)) flow_new_hash_rnd(cpu); hash = flow_hash_code(key, cpu); - head = &flow_table(cpu)[hash]; + head = &table[hash]; for (fle = *head; fle; fle = fle->next) { if (fle->family == family && fle->dir == dir && @@ -196,6 +197,7 @@ void *flow_cache_lookup(struct flowi *ke if (ret) atomic_inc(fle->object_ref); + put_cpu_var_locked(flow_tables, cpu); local_bh_enable(); return ret; @@ -221,6 +223,8 @@ void *flow_cache_lookup(struct flowi *ke } nocache: + put_cpu_var_locked(flow_tables, cpu); + { int err; void *obj; @@ -250,14 +254,15 @@ nocache: static void flow_cache_flush_tasklet(unsigned long data) { struct flow_flush_info *info = (void *)data; + struct flow_cache_entry **table; int i; int cpu; - cpu = smp_processor_id(); + table = get_cpu_var_locked(flow_tables, &cpu); for (i = 0; i < flow_hash_size; i++) { struct flow_cache_entry *fle; - fle = flow_table(cpu)[i]; + fle = table[i]; for (; fle; fle = fle->next) { unsigned genid = atomic_read(&flow_cache_genid); @@ -268,6 +273,7 @@ static void flow_cache_flush_tasklet(uns atomic_dec(fle->object_ref); } } + put_cpu_var_locked(flow_tables, cpu); if (atomic_dec_and_test(&info->cpuleft)) complete(&info->completion); Index: linux-2.6.24.7-rt27/net/sunrpc/svc.c =================================================================== --- linux-2.6.24.7-rt27.orig/net/sunrpc/svc.c 2009-02-08 00:00:21.000000000 -0500 +++ linux-2.6.24.7-rt27/net/sunrpc/svc.c 2009-02-08 00:01:32.000000000 -0500 @@ -547,7 +547,7 @@ __svc_create_thread(svc_thread_fn func, struct svc_rqst *rqstp; int error = -ENOMEM; int have_oldmask = 0; - cpumask_t oldmask; + cpumask_t oldmask = CPU_MASK_NONE /* shut up GCC */; rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); if (!rqstp) Index: linux-2.6.24.7-rt27/sound/core/control_compat.c =================================================================== --- linux-2.6.24.7-rt27.orig/sound/core/control_compat.c 2009-02-08 00:00:21.000000000 -0500 +++ linux-2.6.24.7-rt27/sound/core/control_compat.c 2009-02-08 00:01:32.000000000 -0500 @@ -219,7 +219,7 @@ static int copy_ctl_value_from_user(stru struct snd_ctl_elem_value32 __user *data32, int *typep, int *countp) { - int i, type, count, size; + int i, type, count = 0 /* shut up gcc warning */, size; unsigned int indirect; if (copy_from_user(&data->id, &data32->id, sizeof(data->id))) Index: linux-2.6.24.7-rt27/include/net/netfilter/nf_conntrack.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/net/netfilter/nf_conntrack.h 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/include/net/netfilter/nf_conntrack.h 2009-02-08 00:03:25.000000000 -0500 @@ -63,11 +63,14 @@ union nf_conntrack_help { #ifdef CONFIG_NETFILTER_DEBUG #define NF_CT_ASSERT(x) \ do { \ - if (!(x)) \ + if (!(x)) { \ /* Wooah! I'm tripping my conntrack in a frenzy of \ netplay... 
*/ \ printk("NF_CT_ASSERT: %s:%i(%s)\n", \ __FILE__, __LINE__, __FUNCTION__); \ + if (printk_ratelimit()) \ + WARN_ON(1); \ + } \ } while(0) #else #define NF_CT_ASSERT(x) @@ -256,13 +259,13 @@ extern atomic_t nf_conntrack_count; extern int nf_conntrack_max; DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); -#define NF_CT_STAT_INC(count) (__get_cpu_var(nf_conntrack_stat).count++) #define NF_CT_STAT_INC_ATOMIC(count) \ do { \ local_bh_disable(); \ - __get_cpu_var(nf_conntrack_stat).count++; \ + __raw_get_cpu_var(nf_conntrack_stat).count++; \ local_bh_enable(); \ } while (0) +#define NF_CT_STAT_INC(count) (__raw_get_cpu_var(nf_conntrack_stat).count++) extern int nf_conntrack_register_cache(u_int32_t features, const char *name, size_t size); Index: linux-2.6.24.7-rt27/arch/x86/kernel/crash.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/x86/kernel/crash.c 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/x86/kernel/crash.c 2009-02-08 00:01:32.000000000 -0500 @@ -78,14 +78,6 @@ static int crash_nmi_callback(struct not return 1; } -static void smp_send_nmi_allbutself(void) -{ - cpumask_t mask = cpu_online_map; - cpu_clear(safe_smp_processor_id(), mask); - if (!cpus_empty(mask)) - send_IPI_mask(mask, NMI_VECTOR); -} - static struct notifier_block crash_nmi_nb = { .notifier_call = crash_nmi_callback, }; Index: linux-2.6.24.7-rt27/include/asm-x86/apic_32.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-x86/apic_32.h 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-x86/apic_32.h 2009-02-08 00:01:32.000000000 -0500 @@ -118,6 +118,8 @@ extern int local_apic_timer_c2_ok; extern int local_apic_timer_disabled; +extern void smp_send_nmi_allbutself(void); + #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 Index: linux-2.6.24.7-rt27/include/asm-x86/apic_64.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-x86/apic_64.h 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-x86/apic_64.h 2009-02-08 00:02:55.000000000 -0500 @@ -87,6 +87,8 @@ extern void setup_APIC_extended_lvt(unsi extern int apic_is_clustered_box(void); +extern void smp_send_nmi_allbutself(void); + #define K8_APIC_EXT_LVT_BASE 0x500 #define K8_APIC_EXT_INT_MSG_FIX 0x0 #define K8_APIC_EXT_INT_MSG_SMI 0x2 @@ -94,6 +96,8 @@ extern int apic_is_clustered_box(void); #define K8_APIC_EXT_INT_MSG_EXT 0x7 #define K8_APIC_EXT_LVT_ENTRY_THRESHOLD 0 +extern void smp_send_nmi_allbutself(void); + #define ARCH_APICTIMER_STOPS_ON_C3 1 extern unsigned boot_cpu_id; Index: linux-2.6.24.7-rt27/include/linux/profile.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/linux/profile.h 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/include/linux/profile.h 2009-02-08 00:02:39.000000000 -0500 @@ -6,16 +6,18 @@ #include #include #include +#include #include #include extern int prof_on __read_mostly; -#define CPU_PROFILING 1 -#define SCHED_PROFILING 2 -#define SLEEP_PROFILING 3 -#define KVM_PROFILING 4 +#define CPU_PROFILING 1 +#define SCHED_PROFILING 2 +#define SLEEP_PROFILING 3 +#define KVM_PROFILING 4 +#define PREEMPT_PROFILING 5 struct proc_dir_entry; struct pt_regs; @@ -23,6 +25,7 @@ struct notifier_block; /* init basic kernel profiler */ void __init profile_init(void); +void __profile_tick(int 
type, struct pt_regs *regs); void profile_tick(int); /* @@ -53,6 +56,8 @@ enum profile_type { PROFILE_MUNMAP }; +extern int prof_pid; + #ifdef CONFIG_PROFILING struct task_struct; Index: linux-2.6.24.7-rt27/kernel/profile.c =================================================================== --- linux-2.6.24.7-rt27.orig/kernel/profile.c 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/kernel/profile.c 2009-02-08 00:02:46.000000000 -0500 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -46,6 +47,7 @@ int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); static cpumask_t prof_cpu_mask = CPU_MASK_ALL; +int prof_pid = -1; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); @@ -412,16 +414,20 @@ void profile_hits(int type, void *__pc, EXPORT_SYMBOL_GPL(profile_hits); -void profile_tick(int type) +void __profile_tick(int type, struct pt_regs *regs) { - struct pt_regs *regs = get_irq_regs(); - if (type == CPU_PROFILING && timer_hook) timer_hook(regs); - if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) + if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask) && + (prof_pid == -1 || prof_pid == current->pid)) profile_hit(type, (void *)profile_pc(regs)); } +void profile_tick(int type) +{ + return __profile_tick(type, get_irq_regs()); +} + #ifdef CONFIG_PROC_FS #include #include Index: linux-2.6.24.7-rt27/kernel/time/tick-common.c =================================================================== --- linux-2.6.24.7-rt27.orig/kernel/time/tick-common.c 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/kernel/time/tick-common.c 2009-02-08 00:05:09.000000000 -0500 @@ -32,7 +32,7 @@ DEFINE_PER_CPU(struct tick_device, tick_ ktime_t tick_next_period; ktime_t tick_period; int tick_do_timer_cpu __read_mostly = -1; -DEFINE_SPINLOCK(tick_device_lock); +DEFINE_RAW_SPINLOCK(tick_device_lock); /* * Debugging: see timer_list.c @@ -68,7 +68,6 @@ static void tick_periodic(int cpu) } update_process_times(user_mode(get_irq_regs())); - profile_tick(CPU_PROFILING); } /* @@ -159,6 +158,7 @@ static void tick_setup_device(struct tic } else { handler = td->evtdev->event_handler; next_event = td->evtdev->next_event; + td->evtdev->event_handler = clockevents_handle_noop; } td->evtdev = newdev; Index: linux-2.6.24.7-rt27/kernel/time/tick-sched.c =================================================================== --- linux-2.6.24.7-rt27.orig/kernel/time/tick-sched.c 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/kernel/time/tick-sched.c 2009-02-08 00:04:57.000000000 -0500 @@ -150,7 +150,7 @@ void tick_nohz_update_jiffies(void) * Called either from the idle loop or from irq_exit() when an idle period was * just interrupted by an interrupt which did not cause a reschedule. 
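 *
 * The inidle argument distinguishes the idle loop itself from the
 * irq_exit() path: irq_exit() passes 0 and may only stop the tick
 * once the idle loop has set ts->inidle, as checked below.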
*/ -void tick_nohz_stop_sched_tick(void) +void tick_nohz_stop_sched_tick(int inidle) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; struct tick_sched *ts; @@ -178,10 +178,17 @@ void tick_nohz_stop_sched_tick(void) if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; - if (need_resched()) + if (!inidle && !ts->inidle) + goto end; + + ts->inidle = 1; + + if (need_resched() || need_resched_delayed()) goto end; cpu = smp_processor_id(); + +#ifndef CONFIG_PREEMPT_RT if (unlikely(local_softirq_pending())) { static int ratelimit; @@ -191,6 +198,7 @@ void tick_nohz_stop_sched_tick(void) ratelimit++; } } +#endif now = ktime_get(); /* @@ -249,6 +257,7 @@ void tick_nohz_stop_sched_tick(void) ts->idle_tick = ts->sched_timer.expires; ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; + rcu_enter_nohz(); } /* @@ -334,8 +343,14 @@ void tick_nohz_restart_sched_tick(void) unsigned long ticks; ktime_t now, delta; - if (!ts->tick_stopped) + if (!ts->inidle || !ts->tick_stopped) { + ts->inidle = 0; return; + } + + ts->inidle = 0; + + rcu_exit_nohz(); /* Update jiffies first */ now = ktime_get(); @@ -440,7 +455,6 @@ static void tick_nohz_handler(struct clo } update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); /* Do not restart, when we are in the idle loop */ if (ts->tick_stopped) @@ -554,7 +568,6 @@ static enum hrtimer_restart tick_sched_t */ spin_unlock(&base->lock); update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); spin_lock(&base->lock); } Index: linux-2.6.24.7-rt27/arch/ppc/boot/Makefile =================================================================== --- linux-2.6.24.7-rt27.orig/arch/ppc/boot/Makefile 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/ppc/boot/Makefile 2009-02-08 00:01:33.000000000 -0500 @@ -15,6 +15,15 @@ # KBUILD_CFLAGS used when building rest of boot (takes effect recursively) KBUILD_CFLAGS += -fno-builtin -D__BOOTER__ -Iarch/$(ARCH)/boot/include + +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +KBUILD_CFLAGS := $(subst ${pg_flag},${space},${KBUILD_CFLAGS}) +endif + HOSTCFLAGS += -Iarch/$(ARCH)/boot/include BOOT_TARGETS = zImage zImage.initrd znetboot znetboot.initrd Index: linux-2.6.24.7-rt27/arch/arm/boot/compressed/head.S =================================================================== --- linux-2.6.24.7-rt27.orig/arch/arm/boot/compressed/head.S 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/arm/boot/compressed/head.S 2009-02-08 00:01:34.000000000 -0500 @@ -928,6 +928,19 @@ memdump: mov r12, r0 #endif .ltorg +#ifdef CONFIG_MCOUNT +/* CONFIG_MCOUNT causes boot header to be built with -pg requiring this + * trampoline + */ + .text + .align 0 + .type mcount %function + .global mcount +mcount: + mov pc, lr @ just return +#endif + + reloc_end: .align Index: linux-2.6.24.7-rt27/arch/arm/kernel/entry-common.S =================================================================== --- linux-2.6.24.7-rt27.orig/arch/arm/kernel/entry-common.S 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/arm/kernel/entry-common.S 2009-02-08 00:02:06.000000000 -0500 @@ -3,6 +3,8 @@ * * Copyright (C) 2000 Russell King * + * FUNCTION_TRACE/mcount support (C) 2005 Timesys john.cooper@timesys.com + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
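The mcount stub added to this file below calls back into C through an
enable flag and a function-pointer hook, neither of which is part of
this diff. A minimal sketch of that C side (names are taken from the
assembly; the two-argument signature is an assumption based on how
r0/r1 are loaded below):

	/* Sketch only: the real declarations live in the latency-tracing
	 * patches; mcount_trace_function is assumed to take (ip, parent_ip). */
	int mcount_enabled;	/* 0: mcount returns immediately */
	void (*mcount_trace_function)(unsigned long ip, unsigned long parent_ip);

	static void notrace example_tracer(unsigned long ip, unsigned long parent_ip)
	{
		/* record that the function at ip was entered from parent_ip */
	}

	/* a tracer would then be installed with something like:
	 *	mcount_trace_function = example_tracer;
	 *	mcount_enabled = 1;
	 */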
@@ -44,7 +46,7 @@ ret_fast_syscall:
 fast_work_pending:
 	str	r0, [sp, #S_R0+S_OFF]!		@ returned r0
 work_pending:
-	tst	r1, #_TIF_NEED_RESCHED
+	tst	r1, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED
 	bne	work_resched
 	tst	r1, #_TIF_SIGPENDING
 	beq	no_work_pending
@@ -54,7 +56,8 @@ work_pending:
 	b	ret_slow_syscall		@ Check work again

 work_resched:
-	bl	schedule
+	bl	__schedule
+
 /*
  * "slow" syscall return path.  "why" tells us if this was a real syscall.
  */
@@ -394,6 +397,116 @@ ENTRY(sys_oabi_call_table)
 #include "calls.S"
 #undef ABI
 #undef OBSOLETE
+#endif
+
+#ifdef CONFIG_FRAME_POINTER
+
+#ifdef CONFIG_MCOUNT
+/*
+ * At the point where we are in mcount() we maintain the
+ * frame of the prologue code and keep the call to mcount()
+ * out of the stack frame list:
+
+	saved pc <---\	caller of instrumented routine
+	saved lr     |
+	ip/prev_sp   |
+	fp -----^    |
+	:            |
+		     |
+	-> saved pc  |	instrumented routine
+	|  saved lr  |
+	|  ip/prev_sp|
+	|  fp ---------/
+	|  :
+	|
+	|  mcount
+	|  saved pc
+	|  saved lr
+	|  ip/prev sp
+	-- fp
+	   r3
+	   r2
+	   r1
+  sp->	   r0
+	   :
+ */
+
+	.text
+	.align 0
+	.type mcount %function
+	.global mcount
+
+/* gcc -pg generated FUNCTION_PROLOGUE references mcount()
+ * and has already created the stack frame invocation for
+ * the routine we have been called to instrument. We create
+ * a complete frame nevertheless, as we want to use the same
+ * call to mcount() from c code.
+ */
+mcount:
+
+	ldr	ip, =mcount_enabled	@ leave early, if disabled
+	ldr	ip, [ip]
+	cmp	ip, #0
+	moveq	pc, lr
+
+	mov	ip, sp
+	stmdb	sp!, {r0 - r3, fp, ip, lr, pc}	@ create stack frame
+
+	ldr	r2, =mcount_trace_function	@ load address of the tracer
+	ldr	r2, [r2]			@ hook (a function pointer)
+
+	ldr	r1, [fp, #-4]			@ get lr (the return address
+						@ of the caller of the
+						@ instrumented function)
+	mov	r0, lr				@ get lr - (the return address
+						@ of the instrumented function)
+
+	sub	fp, ip, #4			@ point fp at this frame
+
+	mov	lr, pc				@ call the hook through r2,
+	mov	pc, r2				@ returning to the label below
+1:
+	ldmdb	fp, {r0 - r3, fp, sp, pc}	@ pop entry frame and return
+
+#endif
+
+/* ARM replacement for unsupported gcc __builtin_return_address(n)
+ * where 0 < n.  n == 0 is supported here as well.
+ *
+ * Walk up the stack frame until the desired frame is found or a NULL
+ * fp is encountered, return NULL in the latter case.
+ *
+ * Note: it is possible under code optimization for the stack invocation
+ * of an ancestor function (level N) to be removed before calling a
+ * descendant function (level N+1).  No easy means is available to deduce
+ * this scenario with the result being [for example] caller_addr(0) when
+ * called from level N+1 returning level N-1 rather than the expected
+ * level N.  This optimization issue appears isolated to the case of
+ * a call to a level N+1 routine made at the tail end of a level N
+ * routine -- the level N frame is deleted and a simple branch is made
+ * to the level N+1 routine.
+ */
+
+	.text
+	.align 0
+	.type arm_return_addr %function
+	.global arm_return_addr
+
+arm_return_addr:
+	mov	ip, r0
+	mov	r0, fp
+3:
+	cmp	r0, #0
+	beq	1f		@ frame list hit end, bail
+	cmp	ip, #0
+	beq	2f		@ reached desired frame
+	ldr	r0, [r0, #-12]	@ else continue, get next fp
+	sub	ip, ip, #1
+	b	3b
+2:
+	ldr	r0, [r0, #-4]	@ get target return address
+1:
+	mov	pc, lr
 #endif
Index: linux-2.6.24.7-rt27/arch/arm/kernel/fiq.c
===================================================================
--- linux-2.6.24.7-rt27.orig/arch/arm/kernel/fiq.c	2009-02-08 00:00:20.000000000 -0500
+++ linux-2.6.24.7-rt27/arch/arm/kernel/fiq.c	2009-02-08 00:01:34.000000000 -0500
@@ -89,7 +89,7 @@ void set_fiq_handler(void *start, unsign
  * disable irqs for the duration.
Note - these functions are almost * entirely coded in assembly. */ -void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( @@ -107,7 +107,7 @@ void __attribute__((naked)) set_fiq_regs : "r" (®s->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE)); } -void __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( Index: linux-2.6.24.7-rt27/arch/arm/mm/copypage-v4mc.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/arm/mm/copypage-v4mc.c 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/arm/mm/copypage-v4mc.c 2009-02-08 00:02:19.000000000 -0500 @@ -30,7 +30,7 @@ #define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \ L_PTE_CACHEABLE) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * ARMv4 mini-dcache optimised copy_user_page @@ -44,7 +44,7 @@ static DEFINE_SPINLOCK(minicache_lock); * instruction. If your processor does not supply this, you have to write your * own copy_user_page that does the right thing. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { asm volatile( @@ -88,7 +88,7 @@ void v4_mc_copy_user_page(void *kto, con /* * ARMv4 optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) v4_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux-2.6.24.7-rt27/arch/arm/mm/copypage-xscale.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/arm/mm/copypage-xscale.c 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/arm/mm/copypage-xscale.c 2009-02-08 00:02:19.000000000 -0500 @@ -32,7 +32,7 @@ #define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \ L_PTE_CACHEABLE) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * XScale mini-dcache optimised copy_user_page @@ -42,7 +42,7 @@ static DEFINE_SPINLOCK(minicache_lock); * Dcache aliasing issue. The writes will be forwarded to the write buffer, * and merged as appropriate. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { /* @@ -110,7 +110,7 @@ void xscale_mc_copy_user_page(void *kto, /* * XScale optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) xscale_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux-2.6.24.7-rt27/arch/arm/mm/fault.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/arm/mm/fault.c 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/arm/mm/fault.c 2009-02-08 00:03:13.000000000 -0500 @@ -215,7 +215,7 @@ out: return fault; } -static int +static notrace int do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk; @@ -229,7 +229,7 @@ do_page_fault(unsigned long addr, unsign * If we're in an interrupt or have no user * context, we must not take the fault.. 
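 * On PREEMPT_RT a task can also have pagefaults explicitly disabled
 * via pagefault_disable(); such faults have to be pushed down the
 * no_context path as well.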
*/ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; /* @@ -311,7 +311,7 @@ no_context: * interrupt or a critical region, and should only copy the information * from the master page table, nothing more. */ -static int +static notrace int do_translation_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { @@ -354,7 +354,7 @@ bad_area: * Some section permission faults need to be handled gracefully. * They can happen due to a __{get,put}_user during an oops. */ -static int +static notrace int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { do_bad_area(addr, fsr, regs); @@ -364,7 +364,7 @@ do_sect_fault(unsigned long addr, unsign /* * This abort handler always returns "fault". */ -static int +static notrace int do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { return 1; @@ -419,7 +419,7 @@ static struct fsr_info { { do_bad, SIGBUS, 0, "unknown 31" } }; -void __init +void __init notrace hook_fault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *), int sig, const char *name) { @@ -433,7 +433,7 @@ hook_fault_code(int nr, int (*fn)(unsign /* * Dispatch a data abort to the relevant handler. */ -asmlinkage void __exception +asmlinkage void __exception notrace do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { const struct fsr_info *inf = fsr_info + (fsr & 15) + ((fsr & (1 << 10)) >> 6); @@ -452,7 +452,7 @@ do_DataAbort(unsigned long addr, unsigne arm_notify_die("", regs, &info, fsr, 0); } -asmlinkage void __exception +asmlinkage void __exception notrace do_PrefetchAbort(unsigned long addr, struct pt_regs *regs) { do_translation_fault(addr, 0, regs); Index: linux-2.6.24.7-rt27/include/asm-arm/pgalloc.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-arm/pgalloc.h 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-arm/pgalloc.h 2009-02-08 00:01:34.000000000 -0500 @@ -109,7 +109,7 @@ static inline void __pmd_populate(pmd_t * * Ensure that we always set both PMD entries. 
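 *
 * (notrace keeps mcount instrumentation out of these helpers, which
 * may themselves run inside the paths the tracer instruments.)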
*/ -static inline void +static inline void notrace pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) { unsigned long pte_ptr = (unsigned long)ptep; @@ -122,7 +122,7 @@ pmd_populate_kernel(struct mm_struct *mm __pmd_populate(pmdp, __pa(pte_ptr) | _PAGE_KERNEL_TABLE); } -static inline void +static inline void notrace pmd_populate(struct mm_struct *mm, pmd_t *pmdp, struct page *ptep) { __pmd_populate(pmdp, page_to_pfn(ptep) << PAGE_SHIFT | _PAGE_USER_TABLE); Index: linux-2.6.24.7-rt27/include/asm-arm/timex.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-arm/timex.h 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-arm/timex.h 2009-02-08 00:03:58.000000000 -0500 @@ -16,9 +16,17 @@ typedef unsigned long cycles_t; +#ifndef mach_read_cycles + #define mach_read_cycles() (0) +#ifdef CONFIG_EVENT_TRACE + #define mach_cycles_to_usecs(d) (d) + #define mach_usecs_to_cycles(d) (d) +#endif +#endif + static inline cycles_t get_cycles (void) { - return 0; + return mach_read_cycles(); } #endif Index: linux-2.6.24.7-rt27/include/asm-arm/unistd.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-arm/unistd.h 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-arm/unistd.h 2009-02-08 00:01:34.000000000 -0500 @@ -380,6 +380,10 @@ #define __NR_eventfd (__NR_SYSCALL_BASE+351) #define __NR_fallocate (__NR_SYSCALL_BASE+352) +#ifndef __ASSEMBLY__ +#define NR_syscalls (__NR_fallocate + 1 - __NR_SYSCALL_BASE) +#endif + /* * The following SWIs are ARM private. */ Index: linux-2.6.24.7-rt27/arch/arm/Kconfig =================================================================== --- linux-2.6.24.7-rt27.orig/arch/arm/Kconfig 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/arm/Kconfig 2009-02-08 00:01:45.000000000 -0500 @@ -33,6 +33,10 @@ config GENERIC_CLOCKEVENTS bool default n +config STACKTRACE_SUPPORT + bool + default y + config MMU bool default y @@ -618,18 +622,7 @@ config LOCAL_TIMERS accounting to be spread across the timer interval, preventing a "thundering herd" at every timer tick. -config PREEMPT - bool "Preemptible Kernel (EXPERIMENTAL)" - depends on EXPERIMENTAL - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. 
+source kernel/Kconfig.preempt config NO_IDLE_HZ bool "Dynamic tick timer" Index: linux-2.6.24.7-rt27/arch/arm/lib/Makefile =================================================================== --- linux-2.6.24.7-rt27.orig/arch/arm/lib/Makefile 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/arm/lib/Makefile 2009-02-08 00:01:34.000000000 -0500 @@ -41,6 +41,7 @@ lib-$(CONFIG_ARCH_RPC) += ecard.o io-ac lib-$(CONFIG_ARCH_CLPS7500) += io-acorn.o lib-$(CONFIG_ARCH_L7200) += io-acorn.o lib-$(CONFIG_ARCH_SHARK) += io-shark.o +lib-$(CONFIG_STACKTRACE) += stacktrace.o $(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S $(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S Index: linux-2.6.24.7-rt27/arch/arm/lib/stacktrace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/arch/arm/lib/stacktrace.c 2009-02-08 00:01:34.000000000 -0500 @@ -0,0 +1,7 @@ +#include <linux/sched.h> +#include <linux/stacktrace.h> + +void save_stack_trace(struct stack_trace *trace) +{ +} + Index: linux-2.6.24.7-rt27/include/asm-arm/arch-ep93xx/timex.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/asm-arm/arch-ep93xx/timex.h 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/include/asm-arm/arch-ep93xx/timex.h 2009-02-08 00:01:34.000000000 -0500 @@ -1,5 +1,11 @@ /* * linux/include/asm-arm/arch-ep93xx/timex.h */ +#include <asm/arch/ep93xx-regs.h> +#include <asm/io.h> #define CLOCK_TICK_RATE 983040 + +#define mach_read_cycles() __raw_readl(EP93XX_TIMER4_VALUE_LOW) +#define mach_cycles_to_usecs(d) (((d) * ((1000000LL << 32) / CLOCK_TICK_RATE)) >> 32) +#define mach_usecs_to_cycles(d) (((d) * (((long long)CLOCK_TICK_RATE << 32) / 1000000)) >> 32) Index: linux-2.6.24.7-rt27/drivers/char/random.c =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/char/random.c 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/char/random.c 2009-02-08 00:01:35.000000000 -0500 @@ -580,8 +580,11 @@ static void add_timer_randomness(struct preempt_disable(); /* if over the trickle threshold, use only 1 in 4096 samples */ if (input_pool.entropy_count > trickle_thresh && - (__get_cpu_var(trickle_count)++ & 0xfff)) - goto out; + (__get_cpu_var(trickle_count)++ & 0xfff)) { + preempt_enable(); + return; + } + preempt_enable(); sample.jiffies = jiffies; sample.cycles = get_cycles(); @@ -626,9 +629,6 @@ static void add_timer_randomness(struct if(input_pool.entropy_count >= random_read_wakeup_thresh) wake_up_interruptible(&random_read_wait); - -out: - preempt_enable(); } void add_input_randomness(unsigned int type, unsigned int code, Index: linux-2.6.24.7-rt27/drivers/char/Kconfig =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/char/Kconfig 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/char/Kconfig 2009-02-08 00:03:19.000000000 -0500 @@ -753,6 +753,46 @@ config JS_RTC To compile this driver as a module, choose M here: the module will be called js-rtc. +config RTC_HISTOGRAM + bool "Real Time Clock Histogram Support" + default n + depends on RTC + ---help--- + If you say Y here then the kernel will track the delivery and + wakeup latency of /dev/rtc using tasks and will report a + histogram to the kernel log when the application closes /dev/rtc.
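For illustration only (not part of the patch): the histogram above instruments an ordinary periodic-interrupt consumer of /dev/rtc. A minimal userspace sketch of such a consumer follows; RTC_IRQP_SET, RTC_PIE_ON and RTC_PIE_OFF are the standard /dev/rtc ioctls, while the 64 Hz rate and the sample count are arbitrary choices.

	/* take ~10s of periodic RTC interrupts, then close the device,
	 * which is the point where CONFIG_RTC_HISTOGRAM logs the histogram */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/rtc.h>

	int main(void)
	{
		unsigned long data;
		int i, fd = open("/dev/rtc", O_RDONLY);

		if (fd < 0)
			return 1;
		ioctl(fd, RTC_IRQP_SET, 64);		/* 64 interrupts per second */
		ioctl(fd, RTC_PIE_ON, 0);		/* enable periodic interrupts */
		for (i = 0; i < 640; i++)
			read(fd, &data, sizeof(data));	/* blocks until the next tick */
		ioctl(fd, RTC_PIE_OFF, 0);
		close(fd);				/* histogram printed here */
		return 0;
	}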
+ +config BLOCKER + tristate "Priority Inheritance Debugging (Blocker) Device Support" + depends on X86 + default y + ---help--- + If you say Y here then a device will be created that the userspace + pi_test suite uses to test and measure kernel locking primitives. + +config LPPTEST + tristate "Parallel Port Based Latency Measurement Device" + depends on !PARPORT && X86 + default y + ---help--- + If you say Y here then a device will be created that the userspace + testlpp utility uses to measure IRQ latencies of a target system + from an independent measurement system. + + NOTE: this code assumes x86 PCs and that the parallel port is + bidirectional and is on IRQ 7. + + To use the device, both the target and the source system need to + run a kernel with CONFIG_LPPTEST enabled. To measure latencies, + use the scripts/testlpp utility in your kernel source directory, + and run it (as root) on the source system - it will start printing + out the latencies it took to get a response from the target system: + + Latency of response: 12.2 usecs (121265 cycles) + + Then generate various workloads on the target system to see how + (worst-case) latencies are impacted. + config SGI_DS1286 tristate "SGI DS1286 RTC support" depends on SGI_IP22 @@ -1032,6 +1072,24 @@ config TELCLOCK /sys/devices/platform/telco_clock, with a number of files for controlling the behavior of this hardware. +config RMEM + tristate "Access to physical memory via /dev/rmem" + default m + help + The /dev/mem device only allows mmap() of I/O mapped + memory; it does not allow access to "real" + physical memory. The /dev/rmem device is a hack which does + allow access to physical memory. We use this instead of + patching /dev/mem because we don't expect this functionality + to ever be accepted into mainline. + +config ALLOC_RTSJ_MEM + tristate "RTSJ-specific hack to reserve memory" + default m + help + The RTSJ TCK conformance test requires reserving some physical + memory for testing /dev/rmem.
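For illustration only (not part of the patch), a minimal userspace sketch of the /dev/rmem access described above, assuming the driver implements mmap() of raw physical addresses the way /dev/mem does for I/O memory; PHYS_ADDR is a hypothetical placeholder that must be a valid physical address on the target machine.

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/mman.h>

	#define PHYS_ADDR 0x10000000UL	/* hypothetical physical address */

	int main(void)
	{
		unsigned long *p;
		int fd = open("/dev/rmem", O_RDWR);

		if (fd < 0)
			return 1;
		/* map one page of physical memory at PHYS_ADDR */
		p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
			 fd, PHYS_ADDR);
		if (p != MAP_FAILED) {
			printf("word at PHYS_ADDR: 0x%lx\n", *p);
			munmap(p, 4096);
		}
		close(fd);
		return 0;
	}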
+ config DEVPORT bool depends on !M68K Index: linux-2.6.24.7-rt27/drivers/char/Makefile =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/char/Makefile 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/char/Makefile 2009-02-08 00:03:19.000000000 -0500 @@ -85,6 +85,8 @@ obj-$(CONFIG_TOSHIBA) += toshiba.o obj-$(CONFIG_I8K) += i8k.o obj-$(CONFIG_DS1620) += ds1620.o obj-$(CONFIG_HW_RANDOM) += hw_random/ +obj-$(CONFIG_BLOCKER) += blocker.o +obj-$(CONFIG_LPPTEST) += lpptest.o obj-$(CONFIG_COBALT_LCD) += lcd.o obj-$(CONFIG_PPDEV) += ppdev.o obj-$(CONFIG_NWBUTTON) += nwbutton.o @@ -96,6 +98,7 @@ obj-$(CONFIG_CS5535_GPIO) += cs5535_gpio obj-$(CONFIG_GPIO_VR41XX) += vr41xx_giu.o obj-$(CONFIG_GPIO_TB0219) += tb0219.o obj-$(CONFIG_TELCLOCK) += tlclk.o +obj-$(CONFIG_RMEM) += rmem.o obj-$(CONFIG_MWAVE) += mwave/ obj-$(CONFIG_AGP) += agp/ @@ -111,6 +114,8 @@ obj-$(CONFIG_PS3_FLASH) += ps3flash.o obj-$(CONFIG_JS_RTC) += js-rtc.o js-rtc-y = rtc.o +obj-$(CONFIG_ALLOC_RTSJ_MEM) += alloc_rtsj_mem.o + # Files generated that shall be removed upon make clean clean-files := consolemap_deftbl.c defkeymap.c Index: linux-2.6.24.7-rt27/drivers/char/blocker.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/drivers/char/blocker.c 2009-02-08 00:01:35.000000000 -0500 @@ -0,0 +1,109 @@ +/* + * priority inheritance testing device + */ + +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/timex.h> +#include <linux/sched.h> + +#define BLOCKER_MINOR 221 + +#define BLOCK_IOCTL 4245 +#define BLOCK_SET_DEPTH 4246 + +#define BLOCKER_MAX_LOCK_DEPTH 10 + +void loop(int loops) +{ + int i; + + for (i = 0; i < loops; i++) + get_cycles(); +} + +static spinlock_t blocker_lock[BLOCKER_MAX_LOCK_DEPTH]; + +static unsigned int lock_depth = 1; + +void do_the_lock_and_loop(unsigned int args) +{ + int i, max; + + if (rt_task(current)) + max = lock_depth; + else if (lock_depth > 1) + max = (current->pid % lock_depth) + 1; + else + max = 1; + + /* Always lock from the top down */ + for (i = max-1; i >= 0; i--) + spin_lock(&blocker_lock[i]); + loop(args); + for (i = 0; i < max; i++) + spin_unlock(&blocker_lock[i]); } + +static int blocker_open(struct inode *in, struct file *file) +{ + printk(KERN_INFO "blocker_open called\n"); + + return 0; +} + +static long blocker_ioctl(struct file *file, + unsigned int cmd, unsigned long args) +{ + switch(cmd) { + case BLOCK_IOCTL: + do_the_lock_and_loop(args); + return 0; + case BLOCK_SET_DEPTH: + if (args >= BLOCKER_MAX_LOCK_DEPTH) + return -EINVAL; + lock_depth = args; + return 0; + default: + return -EINVAL; + } +} + +static struct file_operations blocker_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .unlocked_ioctl = blocker_ioctl, + .open = blocker_open, +}; + +static struct miscdevice blocker_dev = +{ + BLOCKER_MINOR, + "blocker", + &blocker_fops +}; + +static int __init blocker_init(void) +{ + int i; + + if (misc_register(&blocker_dev)) + return -ENODEV; + + for (i = 0; i < BLOCKER_MAX_LOCK_DEPTH; i++) + spin_lock_init(blocker_lock + i); + + return 0; +} + +void __exit blocker_exit(void) +{ + printk(KERN_INFO "blocker device uninstalled\n"); + misc_deregister(&blocker_dev); +} + +module_init(blocker_init); +module_exit(blocker_exit); + +MODULE_LICENSE("GPL"); + Index: linux-2.6.24.7-rt27/drivers/char/lpptest.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/drivers/char/lpptest.c
2009-02-08 00:01:35.000000000 -0500 @@ -0,0 +1,178 @@ +/* + * /dev/lpptest device: test IRQ handling latencies over parallel port + * + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar + * + * licensed under the GPL + * + * You need to have CONFIG_PARPORT disabled for this device, it is a + * completely self-contained device that assumes sole ownership of the + * parallel port. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * API wrappers so that the code can be shared with the -rt tree: + */ +#ifndef local_irq_disable +# define local_irq_disable local_irq_disable +# define local_irq_enable local_irq_enable +#endif + +#ifndef IRQ_NODELAY +# define IRQ_NODELAY 0 +# define IRQF_NODELAY 0 +#endif + +/* + * Driver: + */ +#define LPPTEST_CHAR_MAJOR 245 +#define LPPTEST_DEVICE_NAME "lpptest" + +#define LPPTEST_IRQ 7 + +#define LPPTEST_TEST _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long) +#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long) +#define LPPTEST_ENABLE _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long) + +static char dev_id[] = "lpptest"; + +#define INIT_PORT() outb(0x04, 0x37a) +#define ENABLE_IRQ() outb(0x10, 0x37a) +#define DISABLE_IRQ() outb(0, 0x37a) + +static unsigned char out = 0x5a; + +/** + * Interrupt handler. Flip a bit in the reply. + */ +static int lpptest_irq (int irq, void *dev_id) +{ + out ^= 0xff; + outb(out, 0x378); + + return IRQ_HANDLED; +} + +static cycles_t test_response(void) +{ + cycles_t now, end; + unsigned char in; + int timeout = 0; + + local_irq_disable(); + in = inb(0x379); + inb(0x378); + outb(0x08, 0x378); + now = get_cycles(); + while(1) { + if (inb(0x379) != in) + break; + if (timeout++ > 1000000) { + outb(0x00, 0x378); + local_irq_enable(); + + return 0; + } + } + end = get_cycles(); + outb(0x00, 0x378); + local_irq_enable(); + + return end - now; +} + +static int lpptest_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static int lpptest_close(struct inode *inode, struct file *file) +{ + return 0; +} + +int lpptest_ioctl(struct inode *inode, struct file *file, unsigned int ioctl_num, unsigned long ioctl_param) +{ + int retval = 0; + + switch (ioctl_num) { + + case LPPTEST_DISABLE: + DISABLE_IRQ(); + break; + + case LPPTEST_ENABLE: + ENABLE_IRQ(); + break; + + case LPPTEST_TEST: { + + cycles_t diff = test_response(); + if (copy_to_user((void *)ioctl_param, (void*) &diff, sizeof(diff))) + goto errcpy; + break; + } + default: retval = -EINVAL; + } + + return retval; + + errcpy: + return -EFAULT; +} + +static struct file_operations lpptest_dev_fops = { + .ioctl = lpptest_ioctl, + .open = lpptest_open, + .release = lpptest_close, +}; + +static int __init lpptest_init (void) +{ + if (register_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME, &lpptest_dev_fops)) + { + printk(KERN_NOTICE "Can't allocate major number %d for lpptest.\n", + LPPTEST_CHAR_MAJOR); + return -EAGAIN; + } + + if (request_irq (LPPTEST_IRQ, lpptest_irq, 0, "lpptest", dev_id)) { + printk (KERN_WARNING "lpptest: irq %d in use. 
Unload parport module!\n", LPPTEST_IRQ); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); + return -EAGAIN; + } + irq_desc[LPPTEST_IRQ].status |= IRQ_NODELAY; + irq_desc[LPPTEST_IRQ].action->flags |= IRQF_NODELAY | IRQF_DISABLED; + + INIT_PORT(); + ENABLE_IRQ(); + + return 0; +} +module_init (lpptest_init); + +static void __exit lpptest_exit (void) +{ + DISABLE_IRQ(); + + free_irq(LPPTEST_IRQ, dev_id); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); +} +module_exit (lpptest_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("lpp test module"); + Index: linux-2.6.24.7-rt27/drivers/char/rtc.c =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/char/rtc.c 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/char/rtc.c 2009-02-08 00:02:53.000000000 -0500 @@ -90,10 +90,35 @@ #include #include +#ifdef CONFIG_MIPS +# include +#endif static unsigned long rtc_port; static int rtc_irq = PCI_IRQ_NONE; #endif +#ifdef CONFIG_RTC_HISTOGRAM + +static cycles_t last_interrupt_time; + +#include + +#define CPU_MHZ (cpu_khz / 1000) + +#define HISTSIZE 10000 +static int histogram[HISTSIZE]; + +static int rtc_state; + +enum rtc_states { + S_STARTUP, /* First round - let the application start */ + S_IDLE, /* Waiting for an interrupt */ + S_WAITING_FOR_READ, /* Signal delivered. waiting for rtc_read() */ + S_READ_MISSED, /* Signal delivered, read() deadline missed */ +}; + +#endif + #ifdef CONFIG_HPET_RTC_IRQ #undef RTC_IRQ #endif @@ -222,7 +247,146 @@ static inline unsigned char rtc_is_updat return uip; } +#ifndef RTC_IRQ +# undef CONFIG_RTC_HISTOGRAM +#endif + +static inline void rtc_open_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i; + + last_interrupt_time = 0; + rtc_state = S_STARTUP; + rtc_irq_data = 0; + + for (i = 0; i < HISTSIZE; i++) + histogram[i] = 0; +#endif +} + +static inline void rtc_wake_event(void) +{ +#ifndef CONFIG_RTC_HISTOGRAM + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); +#else + if (!(rtc_status & RTC_IS_OPEN)) + return; + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + break; + /* Waiting for an interrupt */ + case S_IDLE: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + last_interrupt_time = get_cycles(); + rtc_state = S_WAITING_FOR_READ; + break; + + /* Signal has been delivered. waiting for rtc_read() */ + case S_WAITING_FOR_READ: + /* + * Well foo. The usermode application didn't + * schedule and read in time. + */ + last_interrupt_time = get_cycles(); + rtc_state = S_READ_MISSED; + printk("Read missed before next interrupt\n"); + break; + /* Signal has been delivered, read() deadline was missed */ + case S_READ_MISSED: + /* + * Not much we can do here. We're waiting for the usermode + * application to read the rtc + */ + last_interrupt_time = get_cycles(); + break; + } +#endif +} + +static inline void rtc_read_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + cycles_t now = get_cycles(); + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + rtc_state = S_IDLE; + break; + + /* Waiting for an interrupt */ + case S_IDLE: + printk("bug in rtc_read(): called in state S_IDLE!\n"); + break; + case S_WAITING_FOR_READ: /* + * Signal has been delivered. + * waiting for rtc_read() + */ + /* + * Well done + */ + case S_READ_MISSED: /* + * Signal has been delivered, read() + * deadline was missed + */ + /* + * So, you finally got here. 
+ */ + if (!last_interrupt_time) + printk("bug in rtc_read(): last_interrupt_time = 0\n"); + rtc_state = S_IDLE; + { + cycles_t latency = now - last_interrupt_time; + unsigned long delta; /* Microseconds */ + + delta = latency; + delta /= CPU_MHZ; + + if (delta > 1000 * 1000) { + printk("rtc: eek\n"); + } else { + unsigned long slot = delta; + if (slot >= HISTSIZE) + slot = HISTSIZE - 1; + histogram[slot]++; + if (delta > 2000) + printk("wow! That was a " + "%ld millisec bump\n", + delta / 1000); + } + } + rtc_state = S_IDLE; + break; + } +#endif +} + +static inline void rtc_close_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i = 0; + unsigned long total = 0; + + for (i = 0; i < HISTSIZE; i++) + total += histogram[i]; + if (!total) + return; + + printk("\nrtc latency histogram of {%s/%d, %lu samples}:\n", + current->comm, current->pid, total); + for (i = 0; i < HISTSIZE; i++) { + if (histogram[i]) + printk("%d %d\n", i, histogram[i]); + } +#endif +} + #ifdef RTC_IRQ + /* * A very tiny interrupt handler. It runs with IRQF_DISABLED set, * but there is possibility of conflicting with the set_rtc_mmss() @@ -266,9 +430,9 @@ irqreturn_t rtc_interrupt(int irq, void if (rtc_callback) rtc_callback->func(rtc_callback->private_data); spin_unlock(&rtc_task_lock); - wake_up_interruptible(&rtc_wait); - kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + rtc_wake_event(); + wake_up_interruptible(&rtc_wait); return IRQ_HANDLED; } @@ -378,6 +542,8 @@ static ssize_t rtc_read(struct file *fil schedule(); } while (1); + rtc_read_event(); + if (count == sizeof(unsigned int)) retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int); else @@ -610,6 +776,11 @@ static int rtc_do_ioctl(unsigned int cmd save_freq_select = CMOS_READ(RTC_FREQ_SELECT); CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + /* + * Make CMOS date writes nonpreemptible even on PREEMPT_RT. + * There's a limit to everything! 
=B-) + */ + preempt_disable(); #ifdef CONFIG_MACH_DECSTATION CMOS_WRITE(real_yrs, RTC_DEC_YEAR); #endif @@ -619,6 +790,7 @@ static int rtc_do_ioctl(unsigned int cmd CMOS_WRITE(hrs, RTC_HOURS); CMOS_WRITE(min, RTC_MINUTES); CMOS_WRITE(sec, RTC_SECONDS); + preempt_enable(); CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); @@ -717,6 +889,7 @@ static int rtc_open(struct inode *inode, if(rtc_status & RTC_IS_OPEN) goto out_busy; + rtc_open_event(); rtc_status |= RTC_IS_OPEN; rtc_irq_data = 0; @@ -772,6 +945,7 @@ no_irq: rtc_irq_data = 0; rtc_status &= ~RTC_IS_OPEN; spin_unlock_irq (&rtc_lock); + rtc_close_event(); return 0; } @@ -1167,8 +1341,10 @@ static void rtc_dropped_irq(unsigned lon spin_unlock_irq(&rtc_lock); +#ifndef CONFIG_PREEMPT_RT if (printk_ratelimit()) printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq); +#endif /* Now we have new data */ wake_up_interruptible(&rtc_wait); Index: linux-2.6.24.7-rt27/scripts/Makefile =================================================================== --- linux-2.6.24.7-rt27.orig/scripts/Makefile 2009-02-08 00:00:20.000000000 -0500 +++ linux-2.6.24.7-rt27/scripts/Makefile 2009-02-08 00:03:29.000000000 -0500 @@ -12,6 +12,12 @@ hostprogs-$(CONFIG_LOGO) += pnmt hostprogs-$(CONFIG_VT) += conmakehash hostprogs-$(CONFIG_PROM_CONSOLE) += conmakehash hostprogs-$(CONFIG_IKCONFIG) += bin2c +HOST_OS := $(shell uname) +ifeq ($(HOST_OS),Linux) +ifdef CONFIG_LPPTEST +hostprogs-y += testlpp +endif +endif always := $(hostprogs-y) $(hostprogs-m) Index: linux-2.6.24.7-rt27/scripts/testlpp.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/scripts/testlpp.c 2009-02-08 00:01:35.000000000 -0500 @@ -0,0 +1,159 @@ +/* + * testlpp.c: use the /dev/lpptest device to test IRQ handling + * latencies over parallel port + * + * Copyright (C) 2005 Thomas Gleixner + * + * licensed under the GPL + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LPPTEST_CHAR_MAJOR 245 +#define LPPTEST_DEVICE_NAME "lpptest" + +#define LPPTEST_TEST _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long) +#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long) +#define LPPTEST_ENABLE _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long) + +#define HIST_SIZE 10000 + +static int hist_total; +static unsigned long hist[HIST_SIZE]; + +static void hist_hit(unsigned long usecs) +{ + hist_total++; + if (usecs >= HIST_SIZE-1) + hist[HIST_SIZE-1]++; + else + hist[usecs]++; +} + +static void print_hist(void) +{ + int i; + + printf("LPP latency histogram:\n"); + + for (i = 0; i < HIST_SIZE; i++) { + if (hist[i]) + printf("%3d usecs: %9ld\n", i, hist[i]); + } +} + +static inline unsigned long long int rdtsc(void) +{ + unsigned long long int x, y; + for (;;) { + __asm__ volatile ("rdtsc" : "=A" (x)); + __asm__ volatile ("rdtsc" : "=A" (y)); + if (y - x < 1000) + return y; + } +} + +static unsigned long long calibrate_loop(void) +{ + unsigned long long mytime1, mytime2; + + mytime1 = rdtsc(); + usleep(500000); + mytime2 = rdtsc(); + + return (mytime2 - mytime1) * 2; +} + +#define time_to_usecs(time) ((double)time*1000000.0/(double)cycles_per_sec) + +#define time_to_usecs_l(time) (long)(time*1000000/cycles_per_sec) + +int fd, total; +unsigned long long tim, sum_tim, min_tim = -1ULL, max_tim, cycles_per_sec; + +void cleanup(int sig) +{ + ioctl (fd, LPPTEST_ENABLE, &tim); + if (sig) + printf("[ 
interrupted - exiting ]\n"); + printf("\ntotal number of responses: %d\n", total); + printf("average response latency: %.2lf usecs\n", + time_to_usecs(sum_tim/total)); + printf("minimum latency: %.2lf usecs\n", + time_to_usecs(min_tim)); + printf("maximum latency: %.2lf usecs\n", + time_to_usecs(max_tim)); + print_hist(); + exit(0); +} + +#define HZ 3000 + +int main (int argc, char **argv) +{ + unsigned int nr_requests = 0; + + if (argc > 2) { + fprintf(stderr, "usage: testlpp [<nr_requests>]\n"); + exit(-1); + } + if (argc == 2) + nr_requests = atol(argv[1]); + + if (getuid() != 0) { + fprintf(stderr, "need to run as root!\n"); + exit(-1); + } + mknod("/dev/lpptest", S_IFCHR|0666, makedev(245, 1)); + + fd = open("/dev/lpptest", O_RDWR); + if (fd == -1) { + fprintf(stderr, "could not open /dev/lpptest, your kernel doesn't have CONFIG_LPPTEST enabled?\n"); + exit(-1); + } + + signal(SIGINT,&cleanup); + + ioctl (fd, LPPTEST_DISABLE, &tim); + + fprintf(stderr, "calibrating cycles to usecs: "); + cycles_per_sec = calibrate_loop(); + fprintf(stderr, "%lld cycles per usec\n", cycles_per_sec/1000000); + if (nr_requests) + fprintf(stderr, "[max # of requests: %u]\n", nr_requests); + fprintf(stderr, "starting %dHz test, hit Ctrl-C to stop:\n\n", HZ); + + while(1) { + ioctl (fd, LPPTEST_TEST, &tim); + if (tim == 0) + printf ("No response from target.\n"); + else { + hist_hit(time_to_usecs_l(tim)); + if (tim > max_tim) { + printf ("new max latency: %.2lf usecs (%Ld cycles)\n", time_to_usecs(tim), tim); + max_tim = tim; + } + if (tim < min_tim) + min_tim = tim; + total++; + if (total == nr_requests) + break; + sum_tim += tim; + } + usleep(1000000/HZ); + } + cleanup(0); + + return 0; } + + Index: linux-2.6.24.7-rt27/include/linux/lockdep.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/linux/lockdep.h 2009-02-08 00:00:19.000000000 -0500 +++ linux-2.6.24.7-rt27/include/linux/lockdep.h 2009-02-08 00:04:22.000000000 -0500 @@ -304,6 +304,9 @@ extern void lock_acquire(struct lockdep_ extern void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); +extern void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass, + unsigned long ip); + # define INIT_LOCKDEP .lockdep_recursion = 0, #define lockdep_depth(tsk) (debug_locks ?
(tsk)->lockdep_depth : 0) @@ -320,6 +323,7 @@ static inline void lockdep_on(void) # define lock_acquire(l, s, t, r, c, i) do { } while (0) # define lock_release(l, n, i) do { } while (0) +# define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_info() do { } while (0) # define lockdep_init_map(lock, name, key, sub) do { (void)(key); } while (0) @@ -357,6 +361,38 @@ do { \ lock_acquired(&(_lock)->dep_map); \ } while (0) +#define LOCK_CONTENDED_RT(_lock, f_try, f_lock) \ +do { \ + if (!f_try(&(_lock)->lock)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + f_lock(&(_lock)->lock); \ + } \ + lock_acquired(&(_lock)->dep_map); \ +} while (0) + + +#define LOCK_CONTENDED_RT_RET(_lock, f_try, f_lock) \ +({ \ + int ret = 0; \ + if (!f_try(&(_lock)->lock)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + ret = f_lock(&(_lock)->lock); \ + } \ + if (!ret) \ + lock_acquired(&(_lock)->dep_map); \ + ret; \ +}) + +#define LOCK_CONTENDED_RT_RW(_lock, f_try, f_lock) \ +do { \ + if (!f_try(&(_lock)->owners)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + f_lock(&(_lock)->owners); \ + } \ + lock_acquired(&(_lock)->dep_map); \ +} while (0) + + #else /* CONFIG_LOCK_STAT */ #define lock_contended(lockdep_map, ip) do {} while (0) @@ -365,6 +401,15 @@ do { \ #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) +#define LOCK_CONTENDED_RT(_lock, f_try, f_lock) \ + f_lock(&(_lock)->lock) + +#define LOCK_CONTENDED_RT_RET(_lock, f_try, f_lock) \ + f_lock(&(_lock)->lock) + +#define LOCK_CONTENDED_RT_RW(_lock, f_try, f_lock) \ + f_lock(&(_lock)->owners) + #endif /* CONFIG_LOCK_STAT */ #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_GENERIC_HARDIRQS) Index: linux-2.6.24.7-rt27/kernel/lockdep_internals.h =================================================================== --- linux-2.6.24.7-rt27.orig/kernel/lockdep_internals.h 2009-02-08 00:00:19.000000000 -0500 +++ linux-2.6.24.7-rt27/kernel/lockdep_internals.h 2009-02-08 00:01:37.000000000 -0500 @@ -15,12 +15,12 @@ * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. 
*/ -#define MAX_LOCKDEP_ENTRIES 8192UL +#define MAX_LOCKDEP_ENTRIES 16384UL #define MAX_LOCKDEP_KEYS_BITS 11 #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) -#define MAX_LOCKDEP_CHAINS_BITS 14 +#define MAX_LOCKDEP_CHAINS_BITS 15 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) /* Index: linux-2.6.24.7-rt27/drivers/net/loopback.c =================================================================== --- linux-2.6.24.7-rt27.orig/drivers/net/loopback.c 2009-02-08 00:00:19.000000000 -0500 +++ linux-2.6.24.7-rt27/drivers/net/loopback.c 2009-02-08 00:02:53.000000000 -0500 @@ -154,13 +154,13 @@ static int loopback_xmit(struct sk_buff #endif dev->last_rx = jiffies; - /* it's OK to use per_cpu_ptr() because BHs are off */ pcpu_lstats = netdev_priv(dev); - lb_stats = per_cpu_ptr(pcpu_lstats, smp_processor_id()); + lb_stats = per_cpu_ptr(pcpu_lstats, get_cpu()); lb_stats->bytes += skb->len; lb_stats->packets++; + put_cpu(); - netif_rx(skb); + netif_rx_ni(skb); return 0; } Index: linux-2.6.24.7-rt27/arch/powerpc/kernel/time.c =================================================================== --- linux-2.6.24.7-rt27.orig/arch/powerpc/kernel/time.c 2009-02-08 00:00:19.000000000 -0500 +++ linux-2.6.24.7-rt27/arch/powerpc/kernel/time.c 2009-02-08 00:01:38.000000000 -0500 @@ -751,7 +751,7 @@ static cycle_t rtc_read(void) return (cycle_t)get_rtc(); } -static cycle_t timebase_read(void) +static cycle_t notrace timebase_read(void) { return (cycle_t)get_tb(); } Index: linux-2.6.24.7-rt27/include/linux/rcuclassic.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/include/linux/rcuclassic.h 2009-02-08 00:03:47.000000000 -0500 @@ -0,0 +1,100 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (classic version) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2001 + * + * Author: Dipankar Sarma + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * Papers: + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + * + */ + +#ifndef __LINUX_RCUCLASSIC_H +#define __LINUX_RCUCLASSIC_H + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include + +DECLARE_PER_CPU(int, rcu_data_bh_passed_quiesc); + +/* + * Increment the bottom-half quiescent state counter. + * The counter is a bit degenerated: We do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period. Thus just a flag. 
+ */ +static inline void rcu_bh_qsctr_inc(int cpu) +{ + per_cpu(rcu_data_bh_passed_quiesc, cpu) = 1; +} + +#define __rcu_read_lock() \ + do { \ + preempt_disable(); \ + __acquire(RCU); \ + } while (0) +#define __rcu_read_unlock() \ + do { \ + __release(RCU); \ + preempt_enable(); \ + } while (0) +#define __rcu_read_lock_bh() \ + do { \ + local_bh_disable(); \ + __acquire(RCU_BH); \ + } while (0) +#define __rcu_read_unlock_bh() \ + do { \ + __release(RCU_BH); \ + local_bh_enable(); \ + } while (0) + +#define __synchronize_sched() synchronize_rcu() + +#define rcu_advance_callbacks_rt(cpu, user) do { } while (0) +#define rcu_check_callbacks_rt(cpu, user) do { } while (0) +#define rcu_init_rt() do { } while (0) +#define rcu_needs_cpu_rt(cpu) 0 +#define rcu_offline_cpu_rt(cpu) +#define rcu_online_cpu_rt(cpu) +#define rcu_pending_rt(cpu) 0 +#define rcu_process_callbacks_rt(unused) do { } while (0) +#define rcu_enter_nohz() do { } while (0) +#define rcu_exit_nohz() do { } while (0) +#define rcu_preempt_boost_init() do { } while (0) + +extern void FASTCALL(call_rcu_classic(struct rcu_head *head, + void (*func)(struct rcu_head *head))); + +struct softirq_action; +extern void rcu_process_callbacks(struct softirq_action *unused); + +#endif /* __KERNEL__ */ +#endif /* __LINUX_RCUCLASSIC_H */ Index: linux-2.6.24.7-rt27/include/linux/rcupdate.h =================================================================== --- linux-2.6.24.7-rt27.orig/include/linux/rcupdate.h 2009-02-08 00:00:19.000000000 -0500 +++ linux-2.6.24.7-rt27/include/linux/rcupdate.h 2009-02-08 00:03:45.000000000 -0500 @@ -15,7 +15,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - * Copyright (C) IBM Corporation, 2001 + * Copyright IBM Corporation, 2001 * * Author: Dipankar Sarma * @@ -53,6 +53,12 @@ struct rcu_head { void (*func)(struct rcu_head *head); }; +#ifdef CONFIG_CLASSIC_RCU +#include +#else /* #ifdef CONFIG_CLASSIC_RCU */ +#include +#endif /* #else #ifdef CONFIG_CLASSIC_RCU */ + #define RCU_HEAD_INIT { .next = NULL, .func = NULL } #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT #define INIT_RCU_HEAD(ptr) do { \ @@ -61,79 +67,6 @@ struct rcu_head { -/* Global control variables for rcupdate callback mechanism. */ -struct rcu_ctrlblk { - long cur; /* Current batch number. */ - long completed; /* Number of the last completed batch */ - int next_pending; /* Is the next batch already waiting? */ - - int signaled; - - spinlock_t lock ____cacheline_internodealigned_in_smp; - cpumask_t cpumask; /* CPUs that need to switch in order */ - /* for current batch to proceed. */ -} ____cacheline_internodealigned_in_smp; - -/* Is batch a before batch b ? */ -static inline int rcu_batch_before(long a, long b) -{ - return (a - b) < 0; -} - -/* Is batch a after batch b ? */ -static inline int rcu_batch_after(long a, long b) -{ - return (a - b) > 0; -} - -/* - * Per-CPU data for Read-Copy UPdate. - * nxtlist - new callbacks are added here - * curlist - current batch for which quiescent cycle started if any - */ -struct rcu_data { - /* 1) quiescent state handling : */ - long quiescbatch; /* Batch # for grace period */ - int passed_quiesc; /* User-mode/idle loop etc. 
*/ - int qs_pending; /* core waits for quiesc state */ - - /* 2) batch handling */ - long batch; /* Batch # for current RCU batch */ - struct rcu_head *nxtlist; - struct rcu_head **nxttail; - long qlen; /* # of queued callbacks */ - struct rcu_head *curlist; - struct rcu_head **curtail; - struct rcu_head *donelist; - struct rcu_head **donetail; - long blimit; /* Upper limit on a processed batch */ - int cpu; - struct rcu_head barrier; -}; - -DECLARE_PER_CPU(struct rcu_data, rcu_data); -DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); - -/* - * Increment the quiescent state counter. - * The counter is a bit degenerated: We do not need to know - * how many quiescent states passed, just if there was at least - * one since the start of the grace period. Thus just a flag. - */ -static inline void rcu_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - rdp->passed_quiesc = 1; -} -static inline void rcu_bh_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc = 1; -} - -extern int rcu_pending(int cpu); -extern int rcu_needs_cpu(int cpu); - #ifdef CONFIG_DEBUG_LOCK_ALLOC extern struct lockdep_map rcu_lock_map; # define rcu_read_acquire() lock_acquire(&rcu_lock_map, 0, 0, 2, 1, _THIS_IP_) @@ -172,24 +105,14 @@ extern struct lockdep_map rcu_lock_map; * * It is illegal to block while in an RCU read-side critical section. */ -#define rcu_read_lock() \ - do { \ - preempt_disable(); \ - __acquire(RCU); \ - rcu_read_acquire(); \ - } while(0) +#define rcu_read_lock() __rcu_read_lock() /** * rcu_read_unlock - marks the end of an RCU read-side critical section. * * See rcu_read_lock() for more information. */ -#define rcu_read_unlock() \ - do { \ - rcu_read_release(); \ - __release(RCU); \ - preempt_enable(); \ - } while(0) +#define rcu_read_unlock() __rcu_read_unlock() /* * So where is rcu_write_lock()? It does not exist, as there is no @@ -212,24 +135,14 @@ extern struct lockdep_map rcu_lock_map; * can use just rcu_read_lock(). * */ -#define rcu_read_lock_bh() \ - do { \ - local_bh_disable(); \ - __acquire(RCU_BH); \ - rcu_read_acquire(); \ - } while(0) +#define rcu_read_lock_bh() __rcu_read_lock_bh() /* * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section * * See rcu_read_lock_bh() for more information. */ -#define rcu_read_unlock_bh() \ - do { \ - rcu_read_release(); \ - __release(RCU_BH); \ - local_bh_enable(); \ - } while(0) +#define rcu_read_unlock_bh() __rcu_read_unlock_bh() /* * Prevent the compiler from merging or refetching accesses. The compiler @@ -293,21 +206,118 @@ extern struct lockdep_map rcu_lock_map; * In "classic RCU", these two guarantees happen to be one and * the same, but can differ in realtime RCU implementations. */ -#define synchronize_sched() synchronize_rcu() +#define synchronize_sched() __synchronize_sched() -extern void rcu_init(void); -extern void rcu_check_callbacks(int cpu, int user); -extern void rcu_restart_cpu(int cpu); -extern long rcu_batches_completed(void); -extern long rcu_batches_completed_bh(void); +/** + * call_rcu - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. 
RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +#ifdef CONFIG_CLASSIC_RCU +#define call_rcu call_rcu_classic +#else /* #ifdef CONFIG_CLASSIC_RCU */ +#define call_rcu call_rcu_preempt +#endif /* #else #ifdef CONFIG_CLASSIC_RCU */ -/* Exported interfaces */ -extern void FASTCALL(call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *head))); +/** + * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_bh() assumes + * that the read-side critical sections end on completion of a softirq + * handler. This means that read-side critical sections in process + * context must not be interrupted by softirqs. This interface is to be + * used when most of the read-side critical sections are in softirq context. + * RCU read-side critical sections are delimited by rcu_read_lock() and + * rcu_read_unlock() if in interrupt context, or rcu_read_lock_bh() + * and rcu_read_unlock_bh() if in process context. These may be nested. + */ extern void FASTCALL(call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *head))); + +/* Exported common interfaces */ extern void synchronize_rcu(void); extern void rcu_barrier(void); +extern long rcu_batches_completed(void); +extern long rcu_batches_completed_bh(void); + +/* Internal to kernel */ +extern void rcu_check_callbacks(int cpu, int user); +extern void rcu_init(void); +extern int rcu_needs_cpu(int cpu); +extern int rcu_pending(int cpu); +struct softirq_action; +extern void rcu_restart_cpu(int cpu); + +DECLARE_PER_CPU(int, rcu_data_passed_quiesc); + +/* + * Increment the quiescent state counter. + * The counter is a bit degenerated: We do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period. Thus just a flag. + */ +static inline void rcu_qsctr_inc(int cpu) +{ + per_cpu(rcu_data_passed_quiesc, cpu) = 1; +} + +struct dentry; + +#ifdef CONFIG_PREEMPT_RCU_BOOST +extern void init_rcu_boost_late(void); +extern void rcu_boost_readers(void); +extern void rcu_unboost_readers(void); +extern void __rcu_preempt_boost(void); +#ifdef CONFIG_RCU_TRACE +extern int rcu_trace_boost_create(struct dentry *rcudir); +extern void rcu_trace_boost_destroy(void); +#endif /* CONFIG_RCU_TRACE */ +#define rcu_preempt_boost() /* cpp to avoid #include hell.
*/ \ + do { \ + if (unlikely(current->rcu_read_lock_nesting > 0)) \ + __rcu_preempt_boost(); \ + } while (0) +extern void __rcu_preempt_unboost(void); +#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */ +static inline void init_rcu_boost_late(void) +{ +} +static inline void rcu_preempt_boost(void) +{ +} +static inline void __rcu_preempt_unboost(void) +{ +} +static inline void rcu_boost_readers(void) +{ +} +static inline void rcu_unboost_readers(void) +{ +} +#ifdef CONFIG_RCU_TRACE +static inline int rcu_trace_boost_create(struct dentry *rcudir) +{ + return 0; +} +static inline void rcu_trace_boost_destroy(void) +{ +} +#endif /* CONFIG_RCU_TRACE */ +#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */ #endif /* __KERNEL__ */ #endif /* __LINUX_RCUPDATE_H */ Index: linux-2.6.24.7-rt27/kernel/rcuclassic.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.24.7-rt27/kernel/rcuclassic.c 2009-02-08 00:02:48.000000000 -0500 @@ -0,0 +1,636 @@ +/* + * Read-Copy Update mechanism for mutual exclusion + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2001 + * + * Authors: Dipankar Sarma + * Manfred Spraul + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * Papers: + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Global control variables for rcupdate callback mechanism. */ +struct rcu_ctrlblk { + long cur; /* Current batch number. */ + long completed; /* Number of the last completed batch */ + int next_pending; /* Is the next batch already waiting? */ + + int signaled; + + raw_spinlock_t lock ____cacheline_internodealigned_in_smp; + cpumask_t cpumask; /* CPUs that need to switch in order */ + /* for current batch to proceed. */ +} ____cacheline_internodealigned_in_smp; + +/* Is batch a before batch b ? */ +static inline int rcu_batch_before(long a, long b) +{ + return (a - b) < 0; +} + +/* + * Per-CPU data for Read-Copy Update. + * nxtlist - new callbacks are added here + * curlist - current batch for which quiescent cycle started if any + */ +struct rcu_data { + /* 1) quiescent state handling : */ + long quiescbatch; /* Batch # for grace period */ + int *passed_quiesc; /* User-mode/idle loop etc.
*/ + int qs_pending; /* core waits for quiesc state */ + + /* 2) batch handling */ + long batch; /* Batch # for current RCU batch */ + struct rcu_head *nxtlist; + struct rcu_head **nxttail; + long qlen; /* # of queued callbacks */ + struct rcu_head *curlist; + struct rcu_head **curtail; + struct rcu_head *donelist; + struct rcu_head **donetail; + long blimit; /* Upper limit on a processed batch */ + int cpu; +}; + +/* Definition for rcupdate control block. */ +static struct rcu_ctrlblk rcu_ctrlblk = { + .cur = -300, + .completed = -300, + .lock = RAW_SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), + .cpumask = CPU_MASK_NONE, +}; +static struct rcu_ctrlblk rcu_bh_ctrlblk = { + .cur = -300, + .completed = -300, + .lock = RAW_SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), + .cpumask = CPU_MASK_NONE, +}; + +static DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; +static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; +DEFINE_PER_CPU(int, rcu_data_bh_passed_quiesc); + +/* Fake initialization required by compiler */ +static int blimit = 10; +static int qhimark = 10000; +static int qlowmark = 100; + +#ifdef CONFIG_SMP +static void force_quiescent_state(struct rcu_data *rdp, + struct rcu_ctrlblk *rcp) +{ + int cpu; + cpumask_t cpumask; + set_need_resched(); + if (unlikely(!rcp->signaled)) { + rcp->signaled = 1; + /* + * Don't send IPI to itself. With irqs disabled, + * rdp->cpu is the current cpu. + */ + cpumask = rcp->cpumask; + cpu_clear(rdp->cpu, cpumask); + for_each_cpu_mask(cpu, cpumask) + smp_send_reschedule(cpu); + } +} +#else +static inline void force_quiescent_state(struct rcu_data *rdp, + struct rcu_ctrlblk *rcp) +{ + set_need_resched(); +} +#endif + +/** + * call_rcu - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +void fastcall call_rcu_classic(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + struct rcu_data *rdp; + + head->func = func; + head->next = NULL; + local_irq_save(flags); + rdp = &__get_cpu_var(rcu_data); + *rdp->nxttail = head; + rdp->nxttail = &head->next; + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_ctrlblk); + } + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(call_rcu_classic); + +#ifdef CONFIG_CLASSIC_RCU + +/** + * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_bh() assumes + * that the read-side critical sections end on completion of a softirq + * handler. This means that read-side critical sections in process + * context must not be interrupted by softirqs. This interface is to be + * used when most of the read-side critical sections are in softirq context. 
+ * RCU read-side critical sections are delimited by rcu_read_lock() and + * rcu_read_unlock() if in interrupt context, or rcu_read_lock_bh() + * and rcu_read_unlock_bh() if in process context. These may be nested. + */ +void fastcall call_rcu_bh(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + struct rcu_data *rdp; + + head->func = func; + head->next = NULL; + local_irq_save(flags); + rdp = &__get_cpu_var(rcu_bh_data); + *rdp->nxttail = head; + rdp->nxttail = &head->next; + + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_bh_ctrlblk); + } + + local_irq_restore(flags); +} +#ifdef CONFIG_CLASSIC_RCU +EXPORT_SYMBOL_GPL(call_rcu_bh); +#endif /* #ifdef CONFIG_CLASSIC_RCU */ + +/* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. + */ +long rcu_batches_completed(void) +{ + return rcu_ctrlblk.completed; +} +#ifdef CONFIG_CLASSIC_RCU +EXPORT_SYMBOL_GPL(rcu_batches_completed); +#endif /* #ifdef CONFIG_CLASSIC_RCU */ + +/* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. + */ +long rcu_batches_completed_bh(void) +{ + return rcu_bh_ctrlblk.completed; +} +#ifdef CONFIG_CLASSIC_RCU +EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); +#endif /* #ifdef CONFIG_CLASSIC_RCU */ + +#endif /* #ifdef CONFIG_CLASSIC_RCU */ + +/* + * Invoke the completed RCU callbacks. They are expected to be in + * a per-cpu list. + */ +static void rcu_do_batch(struct rcu_data *rdp) +{ + struct rcu_head *next, *list; + int count = 0; + + list = rdp->donelist; + while (list) { + next = list->next; + prefetch(next); + list->func(list); + list = next; + if (++count >= rdp->blimit) + break; + } + rdp->donelist = list; + + local_irq_disable(); + rdp->qlen -= count; + local_irq_enable(); + if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) + rdp->blimit = blimit; + + if (!rdp->donelist) + rdp->donetail = &rdp->donelist; + else + raise_softirq(RCU_SOFTIRQ); +} + +/* + * Grace period handling: + * The grace period handling consists of two steps: + * - A new grace period is started. + * This is done by rcu_start_batch. The start is not broadcast to + * all cpus, they must pick this up by comparing rcp->cur with + * rdp->quiescbatch. All cpus are recorded in the + * rcu_ctrlblk.cpumask bitmap. + * - All cpus must go through a quiescent state. + * Since the start of the grace period is not broadcast, at least two + * calls to rcu_check_quiescent_state are required: + * The first call just notices that a new grace period is running. The + * following calls check if there was a quiescent state since the beginning + * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If + * the bitmap is empty, then the grace period is completed. + * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace + * period (if necessary). + */ +/* + * Register a new batch of callbacks, and start it up if there is currently no + * active batch and the batch to be registered has not already occurred. + * Caller must hold rcu_ctrlblk.lock. + */ +static void rcu_start_batch(struct rcu_ctrlblk *rcp) +{ + if (rcp->next_pending && + rcp->completed == rcp->cur) { + rcp->next_pending = 0; + /* + * next_pending == 0 must be visible in + * __rcu_process_callbacks() before it can see the new value of cur.
+ */ + smp_wmb(); + rcp->cur++; + + /* + * Accessing nohz_cpu_mask before incrementing rcp->cur needs a + * barrier. Otherwise it can cause tickless idle CPUs to be + * included in rcp->cpumask, which will extend grace periods + * unnecessarily. + */ + smp_mb(); + cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); + + rcp->signaled = 0; + } +} + +/* + * cpu went through a quiescent state since the beginning of the grace period. + * Clear it from the cpu mask and complete the grace period if it was the last + * cpu. Start another grace period if someone has further entries pending. + */ +static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) +{ + cpu_clear(cpu, rcp->cpumask); + if (cpus_empty(rcp->cpumask)) { + /* batch completed! */ + rcp->completed = rcp->cur; + rcu_start_batch(rcp); + } +} + +/* + * Check if the cpu has gone through a quiescent state (say context + * switch). If so, and if it hasn't already done so in this RCU + * quiescent cycle, then indicate that it has done so. + */ +static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + if (rdp->quiescbatch != rcp->cur) { + /* start new grace period: */ + rdp->qs_pending = 1; + *rdp->passed_quiesc = 0; + rdp->quiescbatch = rcp->cur; + return; + } + + /* Grace period already completed for this cpu? + * qs_pending is checked instead of the actual bitmap to avoid + * cacheline thrashing. + */ + if (!rdp->qs_pending) + return; + + /* + * Was there a quiescent state since the beginning of the grace + * period? If no, then exit and wait for the next call. + */ + if (!*rdp->passed_quiesc) + return; + rdp->qs_pending = 0; + + spin_lock(&rcp->lock); + /* + * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync + * during cpu startup. Ignore the quiescent state. + */ + if (likely(rdp->quiescbatch == rcp->cur)) + cpu_quiet(rdp->cpu, rcp); + + spin_unlock(&rcp->lock); +} + + +#ifdef CONFIG_HOTPLUG_CPU + +/* Warning! Helper for rcu_offline_cpu(). Do not use elsewhere without reviewing + * the locking requirements: the list it's pulling from has to belong to a cpu + * which is dead and hence not processing interrupts.
+ */ +static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, + struct rcu_head **tail) +{ + local_irq_disable(); + *this_rdp->nxttail = list; + if (list) + this_rdp->nxttail = tail; + local_irq_enable(); +} + +static void __rcu_offline_cpu(struct rcu_data *this_rdp, + struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +{ + /* if the cpu going offline owns the grace period + * we can block indefinitely waiting for it, so flush + * it here + */ + spin_lock_bh(&rcp->lock); + if (rcp->cur != rcp->completed) + cpu_quiet(rdp->cpu, rcp); + spin_unlock_bh(&rcp->lock); + rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); + rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); + rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); +} + +static void rcu_offline_cpu(int cpu) +{ + struct rcu_data *this_rdp = &get_cpu_var(rcu_data); +#ifdef CONFIG_CLASSIC_RCU + struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); +#endif /* #ifdef CONFIG_CLASSIC_RCU */ + + __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, + &per_cpu(rcu_data, cpu)); +#ifdef CONFIG_CLASSIC_RCU + __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, + &per_cpu(rcu_bh_data, cpu)); + put_cpu_var(rcu_bh_data); +#endif /* #ifdef CONFIG_CLASSIC_RCU */ + put_cpu_var(rcu_data); + rcu_offline_cpu_rt(cpu); +} + +#else + +static void rcu_offline_cpu(int cpu) +{ +} + +#endif + +/* + * This does the RCU processing work from softirq context. + */ +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + unsigned long flags; + + if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { + *rdp->donetail = rdp->curlist; + rdp->donetail = rdp->curtail; + rdp->curlist = NULL; + rdp->curtail = &rdp->curlist; + } + + if (rdp->nxtlist && !rdp->curlist) { + local_irq_save(flags); + rdp->curlist = rdp->nxtlist; + rdp->curtail = rdp->nxttail; + rdp->nxtlist = NULL; + rdp->nxttail = &rdp->nxtlist; + local_irq_restore(flags); + + /* + * start the next batch of callbacks + */ + + /* determine batch number */ + rdp->batch = rcp->cur + 1; + /* see the comment and corresponding wmb() in + * the rcu_start_batch() + */ + smp_rmb(); + + if (!rcp->next_pending) { + /* and start it/schedule start if it's a new batch */ + spin_lock(&rcp->lock); + rcp->next_pending = 1; + rcu_start_batch(rcp); + spin_unlock(&rcp->lock); + } + } + + rcu_check_quiescent_state(rcp, rdp); + if (rdp->donelist) + rcu_do_batch(rdp); +} + +void rcu_process_callbacks(struct softirq_action *unused) +{ + __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); + __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); + rcu_process_callbacks_rt(unused); +} + +static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +{ + /* This cpu has pending rcu entries and the grace period + * for them has completed. + */ + if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) + return 1; + + /* This cpu has no pending entries, but there are new entries */ + if (!rdp->curlist && rdp->nxtlist) + return 1; + + /* This cpu has finished callbacks to invoke */ + if (rdp->donelist) + return 1; + + /* The rcu core waits for a quiescent state from the cpu */ + if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) + return 1; + + /* nothing to do */ + return 0; +} + +/* + * Check to see if there is any immediate RCU-related work to be done + * by the current CPU, returning 1 if so. This function is part of the + * RCU implementation; it is -not- an exported member of the RCU API. 
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, returning 1 if so. This function is part of the
+ * RCU implementation; it is -not- an exported member of the RCU API.
+ */
+int rcu_pending(int cpu)
+{
+ return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
+ __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)) ||
+ rcu_pending_rt(cpu);
+}
+
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the current CPU, even if none need be done immediately, returning
+ * 1 if so. This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ */
+int rcu_needs_cpu(int cpu)
+{
+ struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+ struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
+
+ return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu) ||
+ rcu_needs_cpu_rt(cpu));
+}
+
+void rcu_advance_callbacks(int cpu, int user)
+{
+ if (user ||
+ (idle_cpu(cpu) && !in_softirq() &&
+ hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+ rcu_qsctr_inc(cpu);
+ rcu_bh_qsctr_inc(cpu);
+ } else if (!in_softirq())
+ rcu_bh_qsctr_inc(cpu);
+}
+
+void rcu_check_callbacks(int cpu, int user)
+{
+ if (user ||
+ (idle_cpu(cpu) && !in_softirq() &&
+ hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+ rcu_qsctr_inc(cpu);
+ rcu_bh_qsctr_inc(cpu);
+ } else if (!in_softirq())
+ rcu_bh_qsctr_inc(cpu);
+ rcu_check_callbacks_rt(cpu, user);
+ raise_softirq(RCU_SOFTIRQ);
+}
+
+static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
+ struct rcu_data *rdp)
+{
+ memset(rdp, 0, sizeof(*rdp));
+ rdp->curtail = &rdp->curlist;
+ rdp->nxttail = &rdp->nxtlist;
+ rdp->donetail = &rdp->donelist;
+ rdp->quiescbatch = rcp->completed;
+ rdp->qs_pending = 0;
+ rdp->cpu = cpu;
+ rdp->blimit = blimit;
+}
+
+static void __cpuinit rcu_online_cpu(int cpu)
+{
+ struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+ struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
+
+ rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
+ rdp->passed_quiesc = &per_cpu(rcu_data_passed_quiesc, cpu);
+ rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
+ bh_rdp->passed_quiesc = &per_cpu(rcu_data_bh_passed_quiesc, cpu);
+ rcu_online_cpu_rt(cpu);
+}
+
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ rcu_online_cpu(cpu);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ rcu_offline_cpu(cpu);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata rcu_nb = {
+ .notifier_call = rcu_cpu_notify,
+};
+
+/*
+ * Initializes the RCU mechanism. Assumed to be called early, that is,
+ * before the local timer (SMP) or the jiffies timer (uniproc) is set up.
+ * Note that rcu_qsctr and friends are implicitly
+ * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
+ */
+void __init rcu_init(void)
+{
+ rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
+ (void *)(long)smp_processor_id());
+ /* Register notifier for non-boot CPUs */
+ register_cpu_notifier(&rcu_nb);
+ rcu_init_rt();
+ open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
+}
+
+module_param(blimit, int, 0);
+module_param(qhimark, int, 0);
+module_param(qlowmark, int, 0);
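[For context, a typical updater drives this machinery through call_rcu(), which queues a callback on the per-cpu nxtlist that __rcu_process_callbacks() above walks through the nxtlist -> curlist -> donelist stages. The fragment below is a sketch of such a caller against the 2.6.24 API; struct item and its helpers are invented for illustration, while call_rcu(), list_del_rcu(), and container_of() are the real interfaces of that kernel.]

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct item {
        struct list_head list;
        struct rcu_head rcu;
        int key;
};

/* invoked from softirq context once a grace period has elapsed */
static void item_free_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct item, rcu));
}

static void item_del(struct item *it)
{
        /* unlink; pre-existing readers may still be walking to it */
        list_del_rcu(&it->list);
        /* defer the kfree() until all such readers are done */
        call_rcu(&it->rcu, item_free_rcu);
}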
Index: linux-2.6.24.7-rt27/kernel/rcupdate.c
===================================================================
--- linux-2.6.24.7-rt27.orig/kernel/rcupdate.c	2009-02-08 00:00:19.000000000 -0500
+++ linux-2.6.24.7-rt27/kernel/rcupdate.c	2009-02-08 00:03:45.000000000 -0500
@@ -15,7 +15,7 @@
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
- * Copyright (C) IBM Corporation, 2001
+ * Copyright IBM Corporation, 2001
 *
 * Authors: Dipankar Sarma
 * Man